file_types: filter macOS AppleDouble + .DS_Store from media predicates #99

Merged
cameron merged 1 commit from feature/filter-fs-metadata into master 2026-05-18 17:12:42 +00:00
Showing only changes of commit b843a4a366 - Show all commits

View File

@@ -22,8 +22,38 @@ pub fn needs_ffmpeg_thumbnail(path: &Path) -> bool {
/// Supported video file extensions, written lower-case and without the
/// leading dot.
///
/// `is_video_file` lower-cases a path's extension before looking it up in
/// this list, so `clip.MP4` and `clip.mp4` are treated the same.
pub const VIDEO_EXTENSIONS: &[&str] = &["mp4", "mov", "avi", "mkv"];
/// Basenames that identify filesystem bookkeeping files rather than real
/// media. Each entry is compared against the whole basename, exactly.
///
/// Add further platform sidecars here as they appear — Windows
/// `Thumbs.db` / `desktop.ini` belong in this list too if such libraries
/// ever land.
const METADATA_FILENAMES: &[&str] = &[".DS_Store"];

/// Reports whether `path`'s basename is a filesystem metadata sidecar that
/// no media predicate should ever see.
///
/// Two kinds of file are recognised:
///
/// * **AppleDouble companions (`._<name>`).** When macOS copies `<name>`
///   to a non-HFS volume it also writes `._<name>`, which stores the
///   extended attributes of `<name>` — NOT a copy of its bytes. The
///   companion carries the same extension as the real file, so matching on
///   extension alone treats `._photo.jpg` as a JPEG and ships it to the
///   decoder. That accumulates failed rows (face_detections
///   `status='failed'`, clip_embedding `status='failed'`) plus a pointless
///   `image_exif` row whose `content_hash` is the hash of the metadata
///   blob. The visible damage is the downstream noise: failed-row counts
///   that never reach zero, 422 bursts to Apollo, and the evictor timer
///   being reset by those 422s.
/// * **`.DS_Store`.** The per-directory equivalent (Finder view state). It
///   has no extension, but guarding it here is cheap and covers any future
///   predicate that matches by content type.
///
/// Returns `false` when the path has no basename or the basename is not
/// valid UTF-8.
pub fn is_filesystem_metadata(path: &Path) -> bool {
    match path.file_name().and_then(|os_name| os_name.to_str()) {
        Some(basename) => {
            // AppleDouble is a prefix rule; everything else is exact-name.
            basename.starts_with("._")
                || METADATA_FILENAMES.iter().any(|known| *known == basename)
        }
        None => false,
    }
}
/// Check if a path has an image extension
pub fn is_image_file(path: &Path) -> bool {
if is_filesystem_metadata(path) {
return false;
}
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
let ext_lower = ext.to_lowercase();
IMAGE_EXTENSIONS.contains(&ext_lower.as_str())
@@ -34,6 +64,9 @@ pub fn is_image_file(path: &Path) -> bool {
/// Check if a path has a video extension
pub fn is_video_file(path: &Path) -> bool {
if is_filesystem_metadata(path) {
return false;
}
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
let ext_lower = ext.to_lowercase();
VIDEO_EXTENSIONS.contains(&ext_lower.as_str())
@@ -98,4 +131,46 @@ mod tests {
assert!(!is_media_file(Path::new("document.txt")));
assert!(!is_media_file(Path::new("no_extension")));
}
#[test]
fn test_apple_double_excluded_from_media() {
    // Regression for the bug-of-record: ImageApi shipped macOS AppleDouble
    // sidecars to Apollo's CLIP/face decoders, which piled up failed rows
    // and pinned Apollo's eviction timer with the resulting 422 burst.
    // Because the guard lives in the predicates, every downstream walker
    // (face_watch, backfill, clip_watch, watcher) picks up the fix without
    // any change to its own filters.
    let image = |p: &str| is_image_file(Path::new(p));
    let video = |p: &str| is_video_file(Path::new(p));
    let media = |p: &str| is_media_file(Path::new(p));

    assert!(!image("._photo.jpg"));
    assert!(!image("dir/._photo.JPG"));
    assert!(!image("a/b/._DSC_2182-S.jpg"));
    assert!(!video("._video.mp4"));
    assert!(!media("._photo.png"));

    // A leading "_" without the dot is an ordinary file name, not
    // AppleDouble, and must still be accepted.
    assert!(image("_photo.jpg"));
}
#[test]
fn test_ds_store_excluded() {
    // `.DS_Store` is Finder's per-directory metadata. It has no image
    // extension, so is_image_file would reject it anyway; the guard makes
    // the *reason* for the rejection explicit and also covers a
    // hypothetical future caller that matches by basename.
    let bare = Path::new(".DS_Store");
    let nested = Path::new("some/dir/.DS_Store");

    assert!(!is_image_file(bare));
    assert!(!is_video_file(bare));
    assert!(!is_media_file(nested));

    assert!(is_filesystem_metadata(bare));
    assert!(is_filesystem_metadata(Path::new("dir/.DS_Store")));
}
#[test]
fn test_dotfiles_other_than_apple_double_are_unaffected() {
    // The filter is intentionally limited to `._*` plus the exact
    // `.DS_Store` name rather than every dotfile: a user could plausibly
    // name a cover image `.cover.jpg`, and silently dropping it would be
    // wrong. If that assumption proves mistaken, broaden it here; until
    // then, narrow + explicit > broad + surprising.
    let hidden_cover = Path::new(".cover.jpg");

    assert!(is_image_file(hidden_cover));
    assert!(!is_filesystem_metadata(hidden_cover));
}
}