file_types: filter macOS AppleDouble + .DS_Store from media predicates

Symptom: Apollo's logs showed bursts of 422 decode_failed from
ImageApi's CLIP backfill — e.g. `._DSC_2182-S.jpg`. macOS writes
`._<name>` AppleDouble sidecars when copying to non-HFS volumes
(SMB, FAT, exFAT), and they carry the original file's extension
even though their bytes are extended-attribute metadata, not the
image. ImageApi's walker matched them via the extension predicate,
sent them through the ingest pipeline, and accumulated failed rows
in face_detections + clip_embedding while pinning Apollo's eviction
timer with the 422 burst.

Fix: predicate-level guard in is_image_file / is_video_file (and
by inheritance is_media_file). Every walker that already gates on
these (face_watch, backfill, clip_watch, watcher, files,
probe_clip_search) inherits the skip without per-callsite edits.
Narrow scope on purpose — `._*` prefix + the exact `.DS_Store`
basename — rather than blanket dotfile filtering, because a user
could plausibly name a cover image `.cover.jpg`.

Existing rows are not cleaned by this change. To purge what
already accumulated (one-shot, run from your DB shell after
deploying):

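  -- NOTE: `_` is a single-character wildcard in LIKE, so it is escaped
  -- below; an unescaped '%/._%' would match every dotfile (including a
  -- legitimate `.cover.jpg`), not just `._*` AppleDouble sidecars.
  DELETE FROM image_exif
   WHERE file_path LIKE '%/.!_%' ESCAPE '!'
      OR file_path LIKE '%/.DS!_Store' ESCAPE '!';
  DELETE FROM face_detections
   WHERE rel_path LIKE '%/.!_%' ESCAPE '!'
      OR rel_path LIKE '%/.DS!_Store' ESCAPE '!';
  DELETE FROM tagged_photo
   WHERE file_path LIKE '%/.!_%' ESCAPE '!'
      OR file_path LIKE '%/.DS!_Store' ESCAPE '!';
  DELETE FROM favorites
   WHERE path LIKE '%/.!_%' ESCAPE '!'
      OR path LIKE '%/.DS!_Store' ESCAPE '!';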

The maintenance pipeline's missing-file scan would NOT catch these
on its own — the files exist on disk (they're real macOS metadata,
just not images), so stat() returns Ok and the row sticks.
This commit is contained in:
Cameron Cordes
2026-05-17 19:49:23 -04:00
committed by Cameron
parent d275150db6
commit b843a4a366

View File

@@ -22,8 +22,38 @@ pub fn needs_ffmpeg_thumbnail(path: &Path) -> bool {
/// Supported video file extensions.
///
/// Entries are lowercase and carry no leading dot: `is_video_file`
/// lowercases a path's extension before looking it up in this list.
pub const VIDEO_EXTENSIONS: &[&str] = &["mp4", "mov", "avi", "mkv"];
/// Exact basenames that are filesystem bookkeeping rather than media.
///
/// Entries are compared against the whole file name, case-sensitively.
/// Other platform sidecars (for example Windows `Thumbs.db` or
/// `desktop.ini`) would belong in this list if libraries containing
/// them ever need to be handled.
const METADATA_FILENAMES: &[&str] = &[".DS_Store"];

/// Reports whether `path` names a filesystem metadata sidecar that every
/// media predicate should treat as invisible.
///
/// Two kinds of names qualify:
///
/// * `._<name>` — macOS AppleDouble companions, written when files are
///   copied to non-HFS volumes. Each one stores the extended attributes
///   of `<name>`, not its bytes, yet keeps the same extension. A purely
///   extension-based check therefore mistakes `._photo.jpg` for a JPEG
///   and hands it to the decoder, which leaves behind failed
///   `face_detections` / `clip_embedding` rows, an `image_exif` row whose
///   `content_hash` is the hash of the metadata blob, and a burst of 422
///   responses to Apollo that keeps resetting its evictor timer.
/// * `.DS_Store` — Finder's per-directory view state. It has no media
///   extension, but it is cheap to reject here in case a future
///   predicate matches on something other than the extension.
///
/// Only the final path component is inspected. Returns `false` when the
/// path has no file name or the name is not valid UTF-8.
pub fn is_filesystem_metadata(path: &Path) -> bool {
    match path.file_name().and_then(|os_name| os_name.to_str()) {
        Some(basename) => {
            METADATA_FILENAMES.contains(&basename) || basename.starts_with("._")
        }
        None => false,
    }
}
/// Check if a path has an image extension
pub fn is_image_file(path: &Path) -> bool {
if is_filesystem_metadata(path) {
return false;
}
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
let ext_lower = ext.to_lowercase();
IMAGE_EXTENSIONS.contains(&ext_lower.as_str())
@@ -34,6 +64,9 @@ pub fn is_image_file(path: &Path) -> bool {
/// Check if a path has a video extension
pub fn is_video_file(path: &Path) -> bool {
if is_filesystem_metadata(path) {
return false;
}
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
let ext_lower = ext.to_lowercase();
VIDEO_EXTENSIONS.contains(&ext_lower.as_str())
@@ -98,4 +131,46 @@ mod tests {
assert!(!is_media_file(Path::new("document.txt")));
assert!(!is_media_file(Path::new("no_extension")));
}
#[test]
fn test_apple_double_excluded_from_media() {
    // Regression guard for the original incident: ImageApi forwarded
    // macOS AppleDouble sidecars to Apollo's CLIP/face decoders, which
    // piled up failed rows and kept Apollo's eviction timer pinned by
    // the resulting 422 burst. Because the guard lives in the
    // predicates, the downstream walkers (face_watch, backfill,
    // clip_watch, watcher) pick it up without changing their filters.
    let image_sidecars = ["._photo.jpg", "dir/._photo.JPG", "a/b/._DSC_2182-S.jpg"];
    for sidecar in image_sidecars {
        assert!(
            !is_image_file(Path::new(sidecar)),
            "{sidecar} must not be classified as an image"
        );
    }
    assert!(!is_video_file(Path::new("._video.mp4")));
    assert!(!is_media_file(Path::new("._photo.png")));

    // A leading underscore without the dot is an ordinary file name,
    // not an AppleDouble sidecar, so it must still pass.
    let underscore_only = Path::new("_photo.jpg");
    assert!(is_image_file(underscore_only));
}
#[test]
fn test_ds_store_excluded() {
    // `.DS_Store` is Finder's per-directory metadata. It has no image
    // extension, so the extension check alone would already reject it;
    // the explicit guard documents *why* it is rejected and protects a
    // hypothetical future caller that matches by basename.
    let bare = Path::new(".DS_Store");
    assert!(!is_image_file(bare));
    assert!(!is_video_file(bare));
    assert!(!is_media_file(Path::new("some/dir/.DS_Store")));

    for location in [".DS_Store", "dir/.DS_Store"] {
        assert!(
            is_filesystem_metadata(Path::new(location)),
            "{location} must be recognised as filesystem metadata"
        );
    }
}
#[test]
fn test_dotfiles_other_than_apple_double_are_unaffected() {
    // The filter is intentionally limited to the `._*` prefix and the
    // exact `.DS_Store` name instead of all dotfiles: a user may well
    // call a cover image `.cover.jpg`, and dropping it silently would
    // be surprising. Widen the rule here if that assumption proves
    // wrong; until then, narrow and explicit beats broad and surprising.
    let hidden_cover = Path::new(".cover.jpg");
    assert!(!is_filesystem_metadata(hidden_cover));
    assert!(is_image_file(hidden_cover));
}
}