file_types: filter macOS AppleDouble + .DS_Store from media predicates
Symptom: Apollo's logs showed bursts of 422 decode_failed from ImageApi's CLIP backfill — e.g. `._DSC_2182-S.jpg`. macOS writes `._<name>` AppleDouble sidecars when copying to non-HFS volumes (SMB, FAT, exFAT), and they carry the original file's extension even though their bytes are extended-attribute metadata, not the image. ImageApi's walker matched them via the extension predicate, sent them through the ingest pipeline, and accumulated failed rows in face_detections + clip_embedding while pinning Apollo's eviction timer with the 422 burst. Fix: predicate-level guard in is_image_file / is_video_file (and by inheritance is_media_file). Every walker that already gates on these (face_watch, backfill, clip_watch, watcher, files, probe_clip_search) inherits the skip without per-callsite edits. Narrow scope on purpose — `._*` prefix + the exact `.DS_Store` basename — rather than blanket dotfile filtering, because a user could plausibly name a cover image `.cover.jpg`. Existing rows are not cleaned by this change. To purge what already accumulated (one-shot, run from your DB shell after deploying; `_` is a single-character wildcard in LIKE, so it is escaped with `!` below — an unescaped `'%/._%'` would also match ordinary dotfiles such as `.cover.jpg`): DELETE FROM image_exif WHERE file_path LIKE '%/.!_%' ESCAPE '!' OR file_path LIKE '%/.DS!_Store' ESCAPE '!'; DELETE FROM face_detections WHERE rel_path LIKE '%/.!_%' ESCAPE '!' OR rel_path LIKE '%/.DS!_Store' ESCAPE '!'; DELETE FROM tagged_photo WHERE file_path LIKE '%/.!_%' ESCAPE '!' OR file_path LIKE '%/.DS!_Store' ESCAPE '!'; DELETE FROM favorites WHERE path LIKE '%/.!_%' ESCAPE '!' OR path LIKE '%/.DS!_Store' ESCAPE '!'; The maintenance pipeline's missing-file scan would NOT catch these on its own — the files exist on disk (they're real macOS metadata, just not images), so stat() returns Ok and the row sticks.
This commit is contained in:
@@ -22,8 +22,38 @@ pub fn needs_ffmpeg_thumbnail(path: &Path) -> bool {
|
|||||||
/// Supported video file extensions
|
/// Supported video file extensions
|
||||||
pub const VIDEO_EXTENSIONS: &[&str] = &["mp4", "mov", "avi", "mkv"];
|
pub const VIDEO_EXTENSIONS: &[&str] = &["mp4", "mov", "avi", "mkv"];
|
||||||
|
|
||||||
|
/// Filenames that are filesystem metadata, not real media — exact
/// basename match. Extend if a new platform sidecar appears (Windows
/// `Thumbs.db` / `desktop.ini` belong here too if those libraries land).
const METADATA_FILENAMES: &[&str] = &[".DS_Store"];

/// Returns `true` if the basename of `path` is a filesystem metadata
/// sidecar that should be invisible to every media predicate.
///
/// Two shapes are recognised:
///
/// * any basename starting with `._` (macOS AppleDouble companion), and
/// * any basename listed verbatim in [`METADATA_FILENAMES`].
///
/// Paths without a final component (e.g. an empty path or `..`) return
/// `false`.
///
/// macOS writes `._<name>` AppleDouble companions when copying to
/// non-HFS volumes — each holds the extended attributes of `<name>`,
/// NOT a copy of the bytes. Same extension as the real file, so a
/// pure-extension match treats `._photo.jpg` as a JPEG, ships it to
/// the decoder, and accumulates failed rows: face_detections
/// `status='failed'`, clip_embedding `status='failed'`, plus a
/// pointless `image_exif` row whose `content_hash` will be the hash
/// of the metadata blob. The downstream noise (failed-row counts that
/// never go to zero, 422 bursts to Apollo, evictor timer reset by
/// those 422s) is the visible damage. `.DS_Store` is the per-directory
/// version (Finder view state) — no extension, but cheap to guard
/// here too in case some future predicate matches by content type.
///
/// The basename is compared through a lossy UTF-8 view rather than
/// `to_str()`: a sidecar whose name contains invalid UTF-8 still has
/// the ASCII `._` prefix and (usually) a valid media extension, so
/// bailing out on a failed strict conversion would let it slip past
/// this guard and into the extension predicates.
pub fn is_filesystem_metadata(path: &Path) -> bool {
    let Some(raw_name) = path.file_name() else {
        return false;
    };
    // Borrowed when the name is valid UTF-8; only allocates for the
    // rare invalid name, where bad sequences become U+FFFD and the
    // leading ASCII bytes are preserved.
    let lossy = raw_name.to_string_lossy();
    let name: &str = &lossy;
    name.starts_with("._") || METADATA_FILENAMES.contains(&name)
}
|
||||||
|
|
||||||
/// Check if a path has an image extension
|
/// Check if a path has an image extension
|
||||||
pub fn is_image_file(path: &Path) -> bool {
|
pub fn is_image_file(path: &Path) -> bool {
|
||||||
|
if is_filesystem_metadata(path) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
|
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
|
||||||
let ext_lower = ext.to_lowercase();
|
let ext_lower = ext.to_lowercase();
|
||||||
IMAGE_EXTENSIONS.contains(&ext_lower.as_str())
|
IMAGE_EXTENSIONS.contains(&ext_lower.as_str())
|
||||||
@@ -34,6 +64,9 @@ pub fn is_image_file(path: &Path) -> bool {
|
|||||||
|
|
||||||
/// Check if a path has a video extension
|
/// Check if a path has a video extension
|
||||||
pub fn is_video_file(path: &Path) -> bool {
|
pub fn is_video_file(path: &Path) -> bool {
|
||||||
|
if is_filesystem_metadata(path) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
|
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
|
||||||
let ext_lower = ext.to_lowercase();
|
let ext_lower = ext.to_lowercase();
|
||||||
VIDEO_EXTENSIONS.contains(&ext_lower.as_str())
|
VIDEO_EXTENSIONS.contains(&ext_lower.as_str())
|
||||||
@@ -98,4 +131,46 @@ mod tests {
|
|||||||
assert!(!is_media_file(Path::new("document.txt")));
|
assert!(!is_media_file(Path::new("document.txt")));
|
||||||
assert!(!is_media_file(Path::new("no_extension")));
|
assert!(!is_media_file(Path::new("no_extension")));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn test_apple_double_excluded_from_media() {
    // Regression guard for the original incident: macOS AppleDouble
    // sidecars were being handed to Apollo's CLIP/face decoders, which
    // piled up failed rows and kept Apollo's eviction timer pinned by
    // the resulting 422 burst. Because the check lives in the
    // predicates themselves, the downstream walkers (face_watch,
    // backfill, clip_watch, watcher) pick it up with no filter changes.
    for sidecar in ["._photo.jpg", "dir/._photo.JPG", "a/b/._DSC_2182-S.jpg"] {
        assert!(!is_image_file(Path::new(sidecar)));
    }
    assert!(!is_video_file(Path::new("._video.mp4")));
    assert!(!is_media_file(Path::new("._photo.png")));

    // A leading underscore without the dot is an ordinary filename,
    // not AppleDouble, and has to keep matching.
    let underscore_only = Path::new("_photo.jpg");
    assert!(is_image_file(underscore_only));
}
|
||||||
|
|
||||||
|
#[test]
fn test_ds_store_excluded() {
    // `.DS_Store` is Finder's per-directory metadata. It has no image
    // extension, so the extension check alone would already reject it;
    // the explicit guard documents *why* it is rejected and protects
    // any later caller that matches on basename instead.
    let bare = Path::new(".DS_Store");
    let nested = Path::new("dir/.DS_Store");
    let deep = Path::new("some/dir/.DS_Store");

    assert!(!is_image_file(bare));
    assert!(!is_video_file(bare));
    assert!(!is_media_file(deep));

    assert!(is_filesystem_metadata(bare));
    assert!(is_filesystem_metadata(nested));
}
|
||||||
|
|
||||||
|
#[test]
fn test_dotfiles_other_than_apple_double_are_unaffected() {
    // The filter is intentionally limited to the `._*` prefix and the
    // literal `.DS_Store` name. Other dotfiles stay visible: someone
    // may well call a cover image `.cover.jpg`, and dropping it
    // silently would be surprising. Widen the rule here if that
    // assumption proves wrong — until then, narrow and explicit wins
    // over broad and surprising.
    let hidden_cover = Path::new(".cover.jpg");
    assert!(!is_filesystem_metadata(hidden_cover));
    assert!(is_image_file(hidden_cover));
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user