diff --git a/src/file_scan.rs b/src/file_scan.rs
new file mode 100644
index 0000000..a8bcc05
--- /dev/null
+++ b/src/file_scan.rs
@@ -0,0 +1,207 @@
+//! File enumeration for the indexer pass.
+//!
+//! Walks a library root and returns the `(absolute_path, forward_slash_rel_path)`
+//! pairs that belong in `image_exif`. Pruning `EXCLUDED_DIRS` happens here at
+//! WalkDir time via `filter_entry` so whole subtrees (Synology's `@eaDir`,
+//! `.thumbnails`, the operator's configured excludes) are never descended —
+//! vs walking the full tree and discarding leaves, which on a Synology mount
+//! with thousands of `@eaDir` subdirs is the difference between scanning N
+//! files and N×3.
+//!
+//! Previously inlined in `main.rs::process_new_files` without the exclusion
+//! filter — paths like `/@eaDir/.../SYNOFILE_THUMB_*.jpg` ended up in
+//! `image_exif` and looped through `face_watch::filter_excluded` every tick,
+//! since no `face_detections` row would ever be written for a path dropped
+//! at runtime.
+
+use std::path::{Path, PathBuf};
+use std::time::SystemTime;
+
+use walkdir::WalkDir;
+
+use crate::file_types;
+use crate::memories::PathExcluder;
+
+/// Walk `base_path`, prune `EXCLUDED_DIRS` subtrees, and return
+/// `(absolute_path, forward_slash_rel_path)` for every image / video file
+/// that should be indexed.
+///
+/// `modified_since` keeps only files modified at or after the instant —
+/// used by the watcher's quick-scan tick to skip the long tail. Files
+/// whose metadata can't be read are kept; the caller's batch EXIF lookup
+/// dedups against existing rows.
+///
+/// The walk is best-effort: entries WalkDir fails to read are skipped, and
+/// so are files whose path relative to `base_path` is not valid UTF-8.
+pub fn enumerate_indexable_files(
+    base_path: &Path,
+    excluded_dirs: &[String],
+    modified_since: Option<SystemTime>,
+) -> Vec<(PathBuf, String)> {
+    let excluder = PathExcluder::new(base_path, excluded_dirs);
+
+    WalkDir::new(base_path)
+        .into_iter()
+        // Prune whole subtrees so WalkDir doesn't descend into excluded
+        // dirs at all. Always allow depth 0 (the root itself); under a
+        // pathological config that excludes the base, downstream filters
+        // would still drop everything anyway.
+        .filter_entry(|entry| entry.depth() == 0 || !excluder.is_excluded(entry.path()))
+        .filter_map(|entry| entry.ok())
+        .filter(|entry| entry.file_type().is_file())
+        // Media-type check runs before the mtime check so non-media files
+        // never pay for the `metadata()` lookup below.
+        .filter(|entry| {
+            file_types::direntry_is_image(entry) || file_types::direntry_is_video(entry)
+        })
+        .filter(|entry| match modified_since {
+            Some(since) => entry
+                .metadata()
+                .ok()
+                .and_then(|m| m.modified().ok())
+                .map(|m| m >= since)
+                .unwrap_or(true),
+            None => true,
+        })
+        .filter_map(|entry| {
+            let file_path = entry.path().to_path_buf();
+            // Forward-slash rel_path regardless of OS so DB comparisons
+            // against the batch EXIF lookup line up.
+            let rel = file_path
+                .strip_prefix(base_path)
+                .ok()?
+                .to_str()?
+                .replace('\\', "/");
+            Some((file_path, rel))
+        })
+        .collect()
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use std::fs;
+    use std::time::Duration;
+
+    /// Build a tempdir with `paths` (relative). Each touched file is empty;
+    /// directory components are created automatically.
+    fn make_tree(paths: &[&str]) -> tempfile::TempDir {
+        let dir = tempfile::tempdir().expect("tempdir");
+        for p in paths {
+            let abs = dir.path().join(p);
+            if let Some(parent) = abs.parent() {
+                fs::create_dir_all(parent).expect("mkdir -p");
+            }
+            fs::File::create(&abs).expect("touch");
+        }
+        dir
+    }
+
+    /// Sorted rel_paths of `found`, so assertions don't depend on walk order.
+    fn rel_paths(found: &[(PathBuf, String)]) -> Vec<String> {
+        let mut v: Vec<String> = found.iter().map(|(_, r)| r.clone()).collect();
+        v.sort();
+        v
+    }
+
+    /// Set the mtime of an existing file to `when`.
+    fn set_mtime(path: &Path, when: SystemTime) {
+        fs::File::options()
+            .write(true)
+            .open(path)
+            .expect("open for set_modified")
+            .set_modified(when)
+            .expect("set_modified");
+    }
+
+    #[test]
+    fn excludes_eadir_subtree() {
+        // The bug: Synology's @eaDir gets walked into and its
+        // SYNOFILE_THUMB_*.jpg leaves end up in image_exif. With
+        // filter_entry pruning, the subtree is never descended.
+        let dir = make_tree(&[
+            "vacation/IMG_0001.jpg",
+            "vacation/@eaDir/IMG_0001.jpg/SYNOFILE_THUMB_S.jpg",
+            "vacation/@eaDir/IMG_0001.jpg/SYNOFILE_THUMB_XL.jpg",
+            "@eaDir/top_level_thumb.jpg",
+        ]);
+        let found = enumerate_indexable_files(
+            dir.path(),
+            &["@eaDir".to_string()],
+            None,
+        );
+        assert_eq!(rel_paths(&found), vec!["vacation/IMG_0001.jpg".to_string()]);
+    }
+
+    #[test]
+    fn excludes_nested_pattern() {
+        // .thumbnails as a component pattern (not an absolute dir).
+        let dir = make_tree(&[
+            "a/b/photo.jpg",
+            "a/.thumbnails/cached.jpg",
+            "a/b/.thumbnails/nested.jpg",
+        ]);
+        let found = enumerate_indexable_files(
+            dir.path(),
+            &[".thumbnails".to_string()],
+            None,
+        );
+        assert_eq!(rel_paths(&found), vec!["a/b/photo.jpg".to_string()]);
+    }
+
+    #[test]
+    fn excludes_absolute_under_base() {
+        // Leading-'/' entries are interpreted as paths under the library
+        // root (see PathExcluder::new).
+        let dir = make_tree(&[
+            "private/secret.jpg",
+            "public/keep.jpg",
+        ]);
+        let found = enumerate_indexable_files(
+            dir.path(),
+            &["/private".to_string()],
+            None,
+        );
+        assert_eq!(rel_paths(&found), vec!["public/keep.jpg".to_string()]);
+    }
+
+    #[test]
+    fn filters_non_media() {
+        let dir = make_tree(&[
+            "a.jpg",
+            "b.mp4",
+            "c.txt",
+            "d", // no extension
+            "e.jpg.bak", // wrong ext
+        ]);
+        let found = enumerate_indexable_files(dir.path(), &[], None);
+        assert_eq!(rel_paths(&found), vec!["a.jpg".to_string(), "b.mp4".to_string()]);
+    }
+
+    #[test]
+    fn modified_since_filters_old_files() {
+        let dir = make_tree(&["old.jpg", "new.jpg"]);
+        // Pin both mtimes explicitly instead of sleeping around
+        // `SystemTime::now()`: deterministic, and immune to filesystems
+        // with coarse mtime resolution.
+        let cutoff = SystemTime::now();
+        let hour = Duration::from_secs(3600);
+        set_mtime(&dir.path().join("old.jpg"), cutoff - hour);
+        set_mtime(&dir.path().join("new.jpg"), cutoff + hour);
+
+        let found = enumerate_indexable_files(dir.path(), &[], Some(cutoff));
+        assert_eq!(rel_paths(&found), vec!["new.jpg".to_string()]);
+    }
+
+    #[test]
+    fn rel_path_is_forward_slash() {
+        // Sanity on a nested path. On Unix this is already '/'; the
+        // assertion guards a future Windows port from regressing.
+        let dir = make_tree(&["a/b/c.jpg"]);
+        let found = enumerate_indexable_files(dir.path(), &[], None);
+        assert_eq!(found.len(), 1);
+        let (_abs, rel) = &found[0];
+        assert_eq!(rel, "a/b/c.jpg");
+        assert!(!rel.contains('\\'));
+    }
+}
diff --git a/src/lib.rs b/src/lib.rs
index 12de818..19b1c3c 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -14,6 +14,7 @@ pub mod error;
 pub mod exif;
 pub mod face_watch;
 pub mod faces;
+pub mod file_scan;
 pub mod file_types;
 pub mod files;
 pub mod geo;
diff --git a/src/main.rs b/src/main.rs
index 3e85cbd..2fdcdcd 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -1974,37 +1974,11 @@ fn process_new_files(
     let thumbnail_directory = Path::new(&thumbs);
     let base_path = Path::new(&library.root_path);
 
-    // Collect all image and video files, optionally filtered by modification time
-    let files: Vec<(PathBuf, String)> = WalkDir::new(base_path)
-        .into_iter()
-        .filter_map(|entry| entry.ok())
-        .filter(|entry| entry.file_type().is_file())
-        .filter(|entry| {
-            // Filter by modification time if specified
-            if let Some(since) = modified_since {
-                if let Ok(metadata) = entry.metadata()
-                    && let Ok(modified) = metadata.modified()
-                {
-                    return modified >= since;
-                }
-                // If we can't get metadata, include the file to be safe
-                return true;
-            }
-            true
-        })
-        .filter(|entry| is_image(entry) || is_video(entry))
-        .filter_map(|entry| {
-            let file_path = entry.path().to_path_buf();
-            // Canonical rel_path is forward-slash regardless of OS so DB
-            // comparisons against the batch EXIF lookup line up.
-            let relative_path = file_path
-                .strip_prefix(base_path)
-                .ok()?
-                .to_str()?
-                .replace('\\', "/");
-            Some((file_path, relative_path))
-        })
-        .collect();
+    // Walk, prune EXCLUDED_DIRS subtrees, and apply image/video + modified_since
+    // filters. See `file_scan` for why exclusion has to happen at WalkDir
+    // time (filter_entry) rather than at face-detect time.
+    let files: Vec<(PathBuf, String)> =
+        image_api::file_scan::enumerate_indexable_files(base_path, excluded_dirs, modified_since);
 
     if files.is_empty() {
         debug!("No files to process");