From b843a4a366ea93082fdcec2cc563a9716e845b39 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Sun, 17 May 2026 19:49:23 -0400 Subject: [PATCH] file_types: filter macOS AppleDouble + .DS_Store from media predicates MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Symptom: Apollo's logs showed bursts of 422 decode_failed from ImageApi's CLIP backfill — e.g. `._DSC_2182-S.jpg`. macOS writes `._` AppleDouble sidecars when copying to non-HFS volumes (SMB, FAT, exFAT), and they carry the original file's extension even though their bytes are extended-attribute metadata, not the image. ImageApi's walker matched them via the extension predicate, sent them through the ingest pipeline, and accumulated failed rows in face_detections + clip_embedding while pinning Apollo's eviction timer with the 422 burst. Fix: predicate-level guard in is_image_file / is_video_file (and by inheritance is_media_file). Every walker that already gates on these (face_watch, backfill, clip_watch, watcher, files, probe_clip_search) inherits the skip without per-callsite edits. Narrow scope on purpose — `._*` prefix + the exact `.DS_Store` basename — rather than blanket dotfile filtering, because a user could plausibly name a cover image `.cover.jpg`. Existing rows are not cleaned by this change. To purge what already accumulated (one-shot, run from your DB shell after deploying; `_` is a single-character wildcard in SQL LIKE, so it is escaped with `!` below — an unescaped '%/._%' would also match ordinary dotfiles such as `.cover.jpg`): DELETE FROM image_exif WHERE file_path LIKE '%/.!_%' ESCAPE '!' OR file_path LIKE '%/.DS!_Store' ESCAPE '!'; DELETE FROM face_detections WHERE rel_path LIKE '%/.!_%' ESCAPE '!' OR rel_path LIKE '%/.DS!_Store' ESCAPE '!'; DELETE FROM tagged_photo WHERE file_path LIKE '%/.!_%' ESCAPE '!' OR file_path LIKE '%/.DS!_Store' ESCAPE '!'; DELETE FROM favorites WHERE path LIKE '%/.!_%' ESCAPE '!' OR path LIKE '%/.DS!_Store' ESCAPE '!'; The maintenance pipeline's missing-file scan would NOT catch these on its own — the files exist on disk (they're real macOS metadata, just not images), so stat() returns Ok and the row sticks. 
--- src/file_types.rs | 75 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/src/file_types.rs b/src/file_types.rs index f312916..33f71dd 100644 --- a/src/file_types.rs +++ b/src/file_types.rs @@ -22,8 +22,38 @@ pub fn needs_ffmpeg_thumbnail(path: &Path) -> bool { /// Supported video file extensions pub const VIDEO_EXTENSIONS: &[&str] = &["mp4", "mov", "avi", "mkv"]; +/// Filenames that are filesystem metadata, not real media — exact +/// basename match. Extend if a new platform sidecar appears (Windows +/// Thumbs.db / desktop.ini live here too if those libraries land). +const METADATA_FILENAMES: &[&str] = &[".DS_Store"]; + +/// True if the basename is a filesystem metadata sidecar that should be +/// invisible to every media predicate. +/// +/// macOS writes `._<name>` AppleDouble companions when copying to +/// non-HFS volumes — each holds the extended attributes of `<name>`, +/// NOT a copy of the bytes. Same extension as the real file, so a +/// pure-extension match treats `._photo.jpg` as a JPEG, ships it to +/// the decoder, and accumulates failed rows: face_detections +/// `status='failed'`, clip_embedding `status='failed'`, plus a +/// pointless `image_exif` row whose `content_hash` will be the hash +/// of the metadata blob. The downstream noise (failed-row counts that +/// never go to zero, 422 bursts to Apollo, evictor timer reset by +/// those 422s) is the visible damage. `.DS_Store` is the per-directory +/// version (Finder view state) — no extension, but cheap to guard +/// here too in case some future predicate matches by content type. 
+pub fn is_filesystem_metadata(path: &Path) -> bool { + let Some(name) = path.file_name().and_then(|n| n.to_str()) else { + return false; + }; + name.starts_with("._") || METADATA_FILENAMES.contains(&name) +} + /// Check if a path has an image extension pub fn is_image_file(path: &Path) -> bool { + if is_filesystem_metadata(path) { + return false; + } if let Some(ext) = path.extension().and_then(|e| e.to_str()) { let ext_lower = ext.to_lowercase(); IMAGE_EXTENSIONS.contains(&ext_lower.as_str()) @@ -34,6 +64,9 @@ pub fn is_image_file(path: &Path) -> bool { /// Check if a path has a video extension pub fn is_video_file(path: &Path) -> bool { + if is_filesystem_metadata(path) { + return false; + } if let Some(ext) = path.extension().and_then(|e| e.to_str()) { let ext_lower = ext.to_lowercase(); VIDEO_EXTENSIONS.contains(&ext_lower.as_str()) @@ -98,4 +131,46 @@ mod tests { assert!(!is_media_file(Path::new("document.txt"))); assert!(!is_media_file(Path::new("no_extension"))); } + + #[test] + fn test_apple_double_excluded_from_media() { + // The bug-of-record: ImageApi was shipping macOS AppleDouble + // sidecars to Apollo's CLIP/face decoders, accumulating failed + // rows and pinning Apollo's eviction timer with the 422 burst. + // Predicate-level guard means every downstream walker + // (face_watch, backfill, clip_watch, watcher) inherits the fix + // without touching their filters. + assert!(!is_image_file(Path::new("._photo.jpg"))); + assert!(!is_image_file(Path::new("dir/._photo.JPG"))); + assert!(!is_image_file(Path::new("a/b/._DSC_2182-S.jpg"))); + assert!(!is_video_file(Path::new("._video.mp4"))); + assert!(!is_media_file(Path::new("._photo.png"))); + // A real file that merely starts with "_" (no leading dot) is + // not AppleDouble — must NOT be filtered. + assert!(is_image_file(Path::new("_photo.jpg"))); + } + + #[test] + fn test_ds_store_excluded() { + // Finder per-directory metadata. 
No image extension so + // is_image_file would already say false; the guard makes the + // predicate's *reason* explicit and covers a hypothetical + // future caller matching by basename. + assert!(!is_image_file(Path::new(".DS_Store"))); + assert!(!is_video_file(Path::new(".DS_Store"))); + assert!(!is_media_file(Path::new("some/dir/.DS_Store"))); + assert!(is_filesystem_metadata(Path::new(".DS_Store"))); + assert!(is_filesystem_metadata(Path::new("dir/.DS_Store"))); + } + + #[test] + fn test_dotfiles_other_than_apple_double_are_unaffected() { + // We deliberately scope to `._*` + the exact .DS_Store name — + // not all dotfiles — because a user could plausibly name a + // cover image `.cover.jpg` and we shouldn't silently drop it. + // If that turns out to be wrong, broaden here; for now, + // narrow + explicit > broad + surprising. + assert!(is_image_file(Path::new(".cover.jpg"))); + assert!(!is_filesystem_metadata(Path::new(".cover.jpg"))); + } } -- 2.49.1