diff --git a/src/file_scan.rs b/src/file_scan.rs new file mode 100644 index 0000000..9e6e79d --- /dev/null +++ b/src/file_scan.rs @@ -0,0 +1,235 @@ +//! File enumeration for the indexer pass. +//! +//! Walks a library root and returns the `(absolute_path, forward_slash_rel_path)` +//! pairs that belong in `image_exif`. Pruning `EXCLUDED_DIRS` happens here at +//! WalkDir time via `filter_entry` so whole subtrees (Synology's `@eaDir`, +//! `.thumbnails`, the operator's configured excludes) are never descended — +//! vs walking the full tree and discarding leaves, which on a Synology mount +//! with thousands of `@eaDir` subdirs is the difference between scanning N +//! files and N×3. +//! +//! Previously inlined in `main.rs::process_new_files` without the exclusion +//! filter — paths like `/@eaDir/.../SYNOFILE_THUMB_*.jpg` ended up in +//! `image_exif` and looped through `face_watch::filter_excluded` every tick, +//! since no `face_detections` row would ever be written for a path dropped +//! at runtime. + +use std::path::{Path, PathBuf}; +use std::time::SystemTime; + +use walkdir::{DirEntry, WalkDir}; + +use crate::file_types; +use crate::memories::PathExcluder; + +/// Walk `base_path`, prune `EXCLUDED_DIRS` subtrees, and return every file +/// entry (any extension). The shared primitive for any code that walks a +/// library root — thumbnail generation, media counts, orphan-playlist +/// reverse lookups, the indexer happy-path, etc. Higher-level helpers +/// (e.g. `enumerate_indexable_files`) layer media-type / mtime filters +/// on top. +/// +/// Pruning happens via `filter_entry` so excluded subtrees are never +/// descended at all. On a Synology mount with thousands of `@eaDir` +/// dirs, that's the difference between visiting N files and ~3N. +pub fn walk_library_files(base_path: &Path, excluded_dirs: &[String]) -> Vec { + let excluder = PathExcluder::new(base_path, excluded_dirs); + WalkDir::new(base_path) + .into_iter() + // Always allow depth 0 (the root). 
Under a pathological config + // that excludes the base itself, rejecting the root in `filter_entry` + // would prune the entire walk; entries below the root are still checked. + .filter_entry(move |entry| entry.depth() == 0 || !excluder.is_excluded(entry.path())) + .filter_map(|entry| entry.ok()) + .filter(|entry| entry.file_type().is_file()) + .collect() +} + +/// Walk `base_path`, prune `EXCLUDED_DIRS` subtrees, and return +/// `(absolute_path, forward_slash_rel_path)` for every image / video file +/// that should be indexed. Entries whose path relative to `base_path` is +/// not valid UTF-8 are dropped (`to_str()` returns `None`). +/// +/// `modified_since` keeps only files modified at or after the instant — +/// used by the watcher's quick-scan tick to skip the long tail. Files whose metadata or mtime can't be read are kept; the caller's batch EXIF lookup +/// dedups against existing rows. The mtime check runs before the media-type check, so metadata is read for non-media files too. +pub fn enumerate_indexable_files( + base_path: &Path, + excluded_dirs: &[String], + modified_since: Option, +) -> Vec<(PathBuf, String)> { + walk_library_files(base_path, excluded_dirs) + .into_iter() + .filter(|entry| match modified_since { + Some(since) => entry + .metadata() + .ok() + .and_then(|m| m.modified().ok()) + .map(|m| m >= since) + .unwrap_or(true), + None => true, + }) + .filter(|entry| { + file_types::direntry_is_image(entry) || file_types::direntry_is_video(entry) + }) + .filter_map(|entry| { + let file_path = entry.path().to_path_buf(); + // Forward-slash rel_path regardless of OS so DB comparisons + // against the batch EXIF lookup line up. + let rel = file_path + .strip_prefix(base_path) + .ok()? + .to_str()? + .replace('\\', "/"); + Some((file_path, rel)) + }) + .collect() +} + +#[cfg(test)] +mod tests { + use super::*; + use std::fs; + use std::time::Duration; + + /// Build a tempdir with `paths` (relative). Each touched file is empty; + /// directory components are created automatically. 
+ fn make_tree(paths: &[&str]) -> tempfile::TempDir { + let dir = tempfile::tempdir().expect("tempdir"); + for p in paths { + let abs = dir.path().join(p); + if let Some(parent) = abs.parent() { + fs::create_dir_all(parent).expect("mkdir -p"); + } + fs::File::create(&abs).expect("touch"); + } + dir + } + + fn rel_paths(found: &[(PathBuf, String)]) -> Vec { + let mut v: Vec = found.iter().map(|(_, r)| r.clone()).collect(); + v.sort(); + v + } + + #[test] + fn excludes_eadir_subtree() { + // The bug: Synology's @eaDir gets walked into and its + // SYNOFILE_THUMB_*.jpg leaves end up in image_exif. With + // filter_entry pruning, the subtree is never descended. + let dir = make_tree(&[ + "vacation/IMG_0001.jpg", + "vacation/@eaDir/IMG_0001.jpg/SYNOFILE_THUMB_S.jpg", + "vacation/@eaDir/IMG_0001.jpg/SYNOFILE_THUMB_XL.jpg", + "@eaDir/top_level_thumb.jpg", + ]); + let found = enumerate_indexable_files(dir.path(), &["@eaDir".to_string()], None); + assert_eq!(rel_paths(&found), vec!["vacation/IMG_0001.jpg".to_string()]); + } + + #[test] + fn excludes_nested_pattern() { + // .thumbnails as a component pattern (not an absolute dir). + let dir = make_tree(&[ + "a/b/photo.jpg", + "a/.thumbnails/cached.jpg", + "a/b/.thumbnails/nested.jpg", + ]); + let found = enumerate_indexable_files(dir.path(), &[".thumbnails".to_string()], None); + assert_eq!(rel_paths(&found), vec!["a/b/photo.jpg".to_string()]); + } + + #[test] + fn excludes_absolute_under_base() { + // Leading-'/' entries are interpreted as paths under the library + // root (see PathExcluder::new). 
+ let dir = make_tree(&["private/secret.jpg", "public/keep.jpg"]); + let found = enumerate_indexable_files(dir.path(), &["/private".to_string()], None); + assert_eq!(rel_paths(&found), vec!["public/keep.jpg".to_string()]); + } + + #[test] + fn filters_non_media() { + let dir = make_tree(&[ + "a.jpg", + "b.mp4", + "c.txt", + "d", // no extension + "e.jpg.bak", // wrong ext + ]); + let found = enumerate_indexable_files(dir.path(), &[], None); + assert_eq!( + rel_paths(&found), + vec!["a.jpg".to_string(), "b.mp4".to_string()] + ); + } + + #[test] + fn modified_since_filters_old_files() { + let dir = make_tree(&["old.jpg", "new.jpg"]); + // Both files are created by make_tree above, so their mtimes + // predate everything that follows. No mtime is set explicitly: + // the test relies on wall-clock ordering instead of a filetime + // helper. + // + // Sequence: sleep, take `cutoff` from SystemTime::now(), sleep + // again, then rewrite new.jpg. That leaves "old.jpg" older than + // the cutoff and "new.jpg" at or after it. NOTE(review): the 20 ms + // gaps assume sub-20 ms mtime resolution — confirm on coarse filesystems. + let new_path = dir.path().join("new.jpg"); + // Force a measurable gap so filesystems with low-resolution mtime + // don't collapse them into the same instant. + std::thread::sleep(Duration::from_millis(20)); + let cutoff = SystemTime::now(); + std::thread::sleep(Duration::from_millis(20)); + // Bump new.jpg's mtime by rewriting it. + fs::write(&new_path, b"x").expect("rewrite"); + + let found = enumerate_indexable_files(dir.path(), &[], Some(cutoff)); + assert_eq!(rel_paths(&found), vec!["new.jpg".to_string()]); + } + + #[test] + fn walk_library_files_excludes_subtrees_and_returns_all_extensions() { + // The lower-level primitive: any extension survives, but excluded + // subtrees are pruned. Used by thumbnail gen and media-count + // gauges, which need non-media files too (e.g., walks through + // sidecar XMPs alongside the photos). 
+ let dir = make_tree(&[ + "vacation/IMG_0001.jpg", + "vacation/IMG_0001.xmp", + "vacation/@eaDir/IMG_0001.jpg/SYNOFILE_THUMB_S.jpg", + "notes.txt", + ]); + let mut got: Vec = walk_library_files(dir.path(), &["@eaDir".to_string()]) + .into_iter() + .map(|e| { + e.path() + .strip_prefix(dir.path()) + .unwrap() + .to_string_lossy() + .replace('\\', "/") + }) + .collect(); + got.sort(); + assert_eq!( + got, + vec![ + "notes.txt".to_string(), + "vacation/IMG_0001.jpg".to_string(), + "vacation/IMG_0001.xmp".to_string(), + ] + ); + } + + #[test] + fn rel_path_is_forward_slash() { + // Sanity on a nested path. On Unix this is already '/'; the + // assertion guards a future Windows port from regressing. + let dir = make_tree(&["a/b/c.jpg"]); + let found = enumerate_indexable_files(dir.path(), &[], None); + let (_abs, rel) = &found[0]; + assert_eq!(rel, "a/b/c.jpg"); + assert!(!rel.contains('\\')); + } +} diff --git a/src/files.rs b/src/files.rs index 9ab1468..c4ac10c 100644 --- a/src/files.rs +++ b/src/files.rs @@ -1397,7 +1397,7 @@ impl Handler for StreamActor { // The stub in lib.rs is a no-op; the real generation is driven by // the file watcher tick in main.rs, which has access to the // configured libraries. 
- create_thumbnails(&[]) + create_thumbnails(&[], &[]) } } diff --git a/src/lib.rs b/src/lib.rs index 12de818..2c384e1 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -14,6 +14,7 @@ pub mod error; pub mod exif; pub mod face_watch; pub mod faces; +pub mod file_scan; pub mod file_types; pub mod files; pub mod geo; @@ -39,11 +40,11 @@ pub use state::AppState; use std::path::Path; use walkdir::DirEntry; -pub fn create_thumbnails(_libs: &[libraries::Library]) { +pub fn create_thumbnails(_libs: &[libraries::Library], _excluded_dirs: &[String]) { // Stub - implemented in main.rs } -pub fn update_media_counts(_media_dir: &Path) { +pub fn update_media_counts(_media_dir: &Path, _excluded_dirs: &[String]) { // Stub - implemented in main.rs } diff --git a/src/main.rs b/src/main.rs index 3e85cbd..d546c16 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1292,7 +1292,7 @@ fn generate_image_thumbnail(src: &Path, thumb_path: &Path) -> std::io::Result<() Ok(()) } -fn create_thumbnails(libs: &[libraries::Library]) { +fn create_thumbnails(libs: &[libraries::Library], excluded_dirs: &[String]) { let tracer = global_tracer(); let span = tracer.start("creating thumbnails"); @@ -1306,12 +1306,10 @@ fn create_thumbnails(libs: &[libraries::Library]) { ); let images = PathBuf::from(&lib.root_path); - WalkDir::new(&images) - .into_iter() - .collect::>>() + // Prune EXCLUDED_DIRS so we don't generate thumbnails-of-thumbnails + // for Synology @eaDir trees. file_scan handles filter_entry pruning. 
+ image_api::file_scan::walk_library_files(&images, excluded_dirs) .into_par_iter() - .filter_map(|entry| entry.ok()) - .filter(|entry| entry.file_type().is_file()) .for_each(|entry| { let src = entry.path(); let Ok(relative_path) = src.strip_prefix(&images) else { @@ -1367,17 +1365,17 @@ fn create_thumbnails(libs: &[libraries::Library]) { debug!("Finished making thumbnails"); for lib in libs { - update_media_counts(Path::new(&lib.root_path)); + update_media_counts(Path::new(&lib.root_path), excluded_dirs); } } -fn update_media_counts(media_dir: &Path) { +fn update_media_counts(media_dir: &Path, excluded_dirs: &[String]) { let mut image_count = 0; let mut video_count = 0; - for ref entry in WalkDir::new(media_dir).into_iter().filter_map(|e| e.ok()) { - if is_image(entry) { + for entry in image_api::file_scan::walk_library_files(media_dir, excluded_dirs) { + if is_image(&entry) { image_count += 1; - } else if is_video(entry) { + } else if is_video(&entry) { video_count += 1; } } @@ -1426,8 +1424,9 @@ fn main() -> std::io::Result<()> { // so missed files are filled in over successive scans. 
{ let libs = app_data.libraries.clone(); + let excluded = app_data.excluded_dirs.clone(); std::thread::spawn(move || { - create_thumbnails(&libs); + create_thumbnails(&libs, &excluded); }); } // generate_video_gifs().await; @@ -1466,7 +1465,7 @@ fn main() -> std::io::Result<()> { ); // Start orphaned playlist cleanup job - cleanup_orphaned_playlists(); + cleanup_orphaned_playlists(app_state.excluded_dirs.clone()); // Spawn background job to generate daily conversation summaries { @@ -1658,8 +1657,8 @@ fn run_migrations( } /// Clean up orphaned HLS playlists and segments whose source videos no longer exist -fn cleanup_orphaned_playlists() { - std::thread::spawn(|| { +fn cleanup_orphaned_playlists(excluded_dirs: Vec) { + std::thread::spawn(move || { let video_path = dotenv::var("VIDEO_PATH").expect("VIDEO_PATH must be set"); let base_path = dotenv::var("BASE_PATH").expect("BASE_PATH must be set"); @@ -1704,13 +1703,14 @@ fn cleanup_orphaned_playlists() { if let Some(filename) = playlist_path.file_stem() { let video_filename = filename.to_string_lossy(); - // Search for this video file in BASE_PATH + // Search for this video file in BASE_PATH, respecting EXCLUDED_DIRS so a video that only exists inside an excluded subtree does not keep its playlist alive. + // NOTE(review): walk_library_files collects the whole tree into a Vec before this loop starts, so the lookup walks all of BASE_PATH even when the + // match is found early; presumably this runs once per playlist — TODO confirm and consider walking once up front. 
+ let mut video_exists = false; - for entry in WalkDir::new(&base_path) - .into_iter() - .filter_map(|e| e.ok()) - .filter(|e| e.file_type().is_file()) - { + for entry in image_api::file_scan::walk_library_files( + Path::new(&base_path), + &excluded_dirs, + ) { if let Some(entry_stem) = entry.path().file_stem() && entry_stem == filename && is_video_file(entry.path()) @@ -1922,7 +1922,7 @@ fn watch_files( } // Update media counts per library (metric aggregates across all) - update_media_counts(Path::new(&lib.root_path)); + update_media_counts(Path::new(&lib.root_path), &excluded_dirs); } if is_full_scan { @@ -1974,37 +1974,11 @@ fn process_new_files( let thumbnail_directory = Path::new(&thumbs); let base_path = Path::new(&library.root_path); - // Collect all image and video files, optionally filtered by modification time - let files: Vec<(PathBuf, String)> = WalkDir::new(base_path) - .into_iter() - .filter_map(|entry| entry.ok()) - .filter(|entry| entry.file_type().is_file()) - .filter(|entry| { - // Filter by modification time if specified - if let Some(since) = modified_since { - if let Ok(metadata) = entry.metadata() - && let Ok(modified) = metadata.modified() - { - return modified >= since; - } - // If we can't get metadata, include the file to be safe - return true; - } - true - }) - .filter(|entry| is_image(entry) || is_video(entry)) - .filter_map(|entry| { - let file_path = entry.path().to_path_buf(); - // Canonical rel_path is forward-slash regardless of OS so DB - // comparisons against the batch EXIF lookup line up. - let relative_path = file_path - .strip_prefix(base_path) - .ok()? - .to_str()? - .replace('\\', "/"); - Some((file_path, relative_path)) - }) - .collect(); + // Walk, prune EXCLUDED_DIRS subtrees, and apply image/video + modified_since + // filters. See `file_scan` for why exclusion has to happen at WalkDir + // time (filter_entry) rather than at face-detect time. 
+ let files: Vec<(PathBuf, String)> = + image_api::file_scan::enumerate_indexable_files(base_path, excluded_dirs, modified_since); if files.is_empty() { debug!("No files to process"); @@ -2255,7 +2229,7 @@ fn process_new_files( // Generate thumbnails for all files that need them if new_files_found { info!("Processing thumbnails for new files..."); - create_thumbnails(std::slice::from_ref(library)); + create_thumbnails(std::slice::from_ref(library), excluded_dirs); } // Reconciliation: on a full scan, prune image_exif rows whose rel_path no @@ -2335,7 +2309,8 @@ fn backfill_unhashed_backlog( // library's tick. Negligible cost given the cap. let rows: Vec<(i32, String)> = { let mut dao = exif_dao.lock().expect("Unable to lock ExifDao"); - dao.get_rows_missing_hash(context, cap + 1).unwrap_or_default() + dao.get_rows_missing_hash(context, cap + 1) + .unwrap_or_default() }; if rows.is_empty() { return 0; @@ -2361,9 +2336,13 @@ fn backfill_unhashed_backlog( match content_hash::compute(&abs) { Ok(id) => { let mut dao = exif_dao.lock().expect("Unable to lock ExifDao"); - if let Err(e) = - dao.backfill_content_hash(context, library.id, rel_path, &id.content_hash, id.size_bytes) - { + if let Err(e) = dao.backfill_content_hash( + context, + library.id, + rel_path, + &id.content_hash, + id.size_bytes, + ) { warn!( "face_watch: backfill_content_hash failed for {}: {:?}", rel_path, e @@ -2374,7 +2353,11 @@ fn backfill_unhashed_backlog( } } Err(e) => { - debug!("face_watch: hash compute failed for {} ({:?})", abs.display(), e); + debug!( + "face_watch: hash compute failed for {} ({:?})", + abs.display(), + e + ); errors += 1; } } diff --git a/src/video/mod.rs b/src/video/mod.rs index 1089301..8d9c3b0 100644 --- a/src/video/mod.rs +++ b/src/video/mod.rs @@ -24,6 +24,17 @@ pub async fn generate_video_gifs() { fs::create_dir_all(gif_base_path).expect("There was an issue creating directory"); let files = PathBuf::from(dotenv::var("BASE_PATH").unwrap()); + // EXCLUDED_DIRS read here 
for the update_media_counts call below. + // The WalkDir walk itself is left raw — this function is currently + // dead code (`#[allow(dead_code)]`) and not reachable from main. + // If revived, swap to file_scan::walk_library_files (dual lib/bin + // module path makes that non-trivial here). + let excluded_dirs: Vec = std::env::var("EXCLUDED_DIRS") + .unwrap_or_default() + .split(',') + .filter(|s| !s.trim().is_empty()) + .map(|s| s.trim().to_string()) + .collect(); let ffmpeg = Ffmpeg; for file in WalkDir::new(&files) @@ -62,6 +73,6 @@ pub async fn generate_video_gifs() { info!("Finished making video gifs in {:?}", start.elapsed()); - update_media_counts(&files); + update_media_counts(&files, &excluded_dirs); }); }