indexer: prune EXCLUDED_DIRS at WalkDir time, extract enumerate_indexable_files #63

Merged
cameron merged 2 commits from feature/exclude-dirs-at-index-time into master 2026-04-30 20:24:19 +00:00
3 changed files with 206 additions and 31 deletions
Showing only changes of commit 5bf49568f1 - Show all commits

200
src/file_scan.rs Normal file
View File

@@ -0,0 +1,200 @@
//! File enumeration for the indexer pass.
//!
//! Walks a library root and returns the `(absolute_path, forward_slash_rel_path)`
//! pairs that belong in `image_exif`. Pruning `EXCLUDED_DIRS` happens here at
//! WalkDir time via `filter_entry` so whole subtrees (Synology's `@eaDir`,
//! `.thumbnails`, the operator's configured excludes) are never descended —
//! vs walking the full tree and discarding leaves, which on a Synology mount
//! with thousands of `@eaDir` subdirs is the difference between scanning N
//! files and N×3.
//!
//! Previously inlined in `main.rs::process_new_files` without the exclusion
//! filter — paths like `<lib>/@eaDir/.../SYNOFILE_THUMB_*.jpg` ended up in
//! `image_exif` and looped through `face_watch::filter_excluded` every tick,
//! since no `face_detections` row would ever be written for a path dropped
//! at runtime.
use std::path::{Path, PathBuf};
use std::time::SystemTime;
use walkdir::WalkDir;
use crate::file_types;
use crate::memories::PathExcluder;
/// Enumerate the image / video files under `base_path` that should be indexed.
///
/// Returns one `(path, rel_path)` pair per file, where `path` is the entry
/// path as yielded by the walk (rooted at `base_path`, so absolute whenever
/// `base_path` is) and `rel_path` is that path relative to `base_path` with
/// every `\` replaced by `/`.
///
/// * `excluded_dirs` is handed to [`PathExcluder::new`]; any entry the
///   excluder rejects is pruned via `filter_entry`, so an excluded directory
///   is never descended into.
/// * `modified_since`, when `Some`, keeps only files whose mtime is at or
///   after the instant — used by the watcher's quick-scan tick to skip the
///   long tail. Files whose metadata or mtime can't be read are kept; the
///   caller's batch EXIF lookup dedups against existing rows.
///
/// Entries the walk fails to read, and files whose relative path is not
/// valid UTF-8, are silently skipped.
pub fn enumerate_indexable_files(
    base_path: &Path,
    excluded_dirs: &[String],
    modified_since: Option<SystemTime>,
) -> Vec<(PathBuf, String)> {
    let excluder = PathExcluder::new(base_path, excluded_dirs);

    WalkDir::new(base_path)
        .into_iter()
        // Prune whole subtrees so WalkDir doesn't descend into excluded
        // dirs at all. The root itself (depth 0) is always let through so
        // the walk can start even if the config happens to match it.
        .filter_entry(|entry| entry.depth() == 0 || !excluder.is_excluded(entry.path()))
        // Best-effort walk: unreadable entries are skipped, not fatal.
        .filter_map(|entry| entry.ok())
        .filter(|entry| entry.file_type().is_file())
        // Media-type check goes before the mtime check: `metadata()` can
        // cost a stat per entry, so don't spend it on files that would be
        // discarded anyway. The two predicates are independent, so the
        // resulting set is the same in either order.
        .filter(|entry| {
            file_types::direntry_is_image(entry) || file_types::direntry_is_video(entry)
        })
        .filter(|entry| {
            let Some(since) = modified_since else {
                return true;
            };
            entry
                .metadata()
                .ok()
                .and_then(|meta| meta.modified().ok())
                // Unknown mtime: keep the file rather than risk missing it.
                .map_or(true, |mtime| mtime >= since)
        })
        .filter_map(|entry| {
            let file_path = entry.path().to_path_buf();
            // Forward-slash rel_path regardless of OS so DB comparisons
            // against the batch EXIF lookup line up.
            let rel = file_path
                .strip_prefix(base_path)
                .ok()?
                .to_str()?
                .replace('\\', "/");
            Some((file_path, rel))
        })
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::time::Duration;

    /// Build a tempdir containing an empty file at each relative path in
    /// `paths`; parent directories are created as needed.
    fn make_tree(paths: &[&str]) -> tempfile::TempDir {
        let dir = tempfile::tempdir().expect("tempdir");
        for p in paths {
            let abs = dir.path().join(p);
            if let Some(parent) = abs.parent() {
                fs::create_dir_all(parent).expect("mkdir -p");
            }
            fs::File::create(&abs).expect("touch");
        }
        dir
    }

    /// Sorted rel_paths of an enumeration result, so assertions don't depend
    /// on directory iteration order.
    fn rel_paths(found: &[(PathBuf, String)]) -> Vec<String> {
        let mut rels: Vec<String> = found.iter().map(|(_, rel)| rel.clone()).collect();
        rels.sort();
        rels
    }

    /// Set the mtime of an existing file to `mtime`.
    fn set_mtime(path: &Path, mtime: SystemTime) {
        fs::File::options()
            .write(true)
            .open(path)
            .expect("open for mtime update")
            .set_modified(mtime)
            .expect("set mtime");
    }

    #[test]
    fn excludes_eadir_subtree() {
        // The bug: Synology's @eaDir gets walked into and its
        // SYNOFILE_THUMB_*.jpg leaves end up in image_exif. With
        // filter_entry pruning, the subtree is never descended.
        let dir = make_tree(&[
            "vacation/IMG_0001.jpg",
            "vacation/@eaDir/IMG_0001.jpg/SYNOFILE_THUMB_S.jpg",
            "vacation/@eaDir/IMG_0001.jpg/SYNOFILE_THUMB_XL.jpg",
            "@eaDir/top_level_thumb.jpg",
        ]);
        let found = enumerate_indexable_files(dir.path(), &["@eaDir".to_string()], None);
        assert_eq!(rel_paths(&found), vec!["vacation/IMG_0001.jpg".to_string()]);
    }

    #[test]
    fn excludes_nested_pattern() {
        // .thumbnails as a component pattern (not an absolute dir).
        let dir = make_tree(&[
            "a/b/photo.jpg",
            "a/.thumbnails/cached.jpg",
            "a/b/.thumbnails/nested.jpg",
        ]);
        let found = enumerate_indexable_files(dir.path(), &[".thumbnails".to_string()], None);
        assert_eq!(rel_paths(&found), vec!["a/b/photo.jpg".to_string()]);
    }

    #[test]
    fn excludes_absolute_under_base() {
        // Leading-'/' entries are interpreted as paths under the library
        // root (see PathExcluder::new).
        let dir = make_tree(&["private/secret.jpg", "public/keep.jpg"]);
        let found = enumerate_indexable_files(dir.path(), &["/private".to_string()], None);
        assert_eq!(rel_paths(&found), vec!["public/keep.jpg".to_string()]);
    }

    #[test]
    fn filters_non_media() {
        let dir = make_tree(&[
            "a.jpg",
            "b.mp4",
            "c.txt",
            "d",         // no extension
            "e.jpg.bak", // wrong ext
        ]);
        let found = enumerate_indexable_files(dir.path(), &[], None);
        assert_eq!(
            rel_paths(&found),
            vec!["a.jpg".to_string(), "b.mp4".to_string()]
        );
    }

    #[test]
    fn modified_since_filters_old_files() {
        // Pin both mtimes explicitly, an hour either side of the cutoff, so
        // the outcome doesn't depend on sleeps, on how fast the test runs,
        // or on the filesystem's timestamp resolution.
        let dir = make_tree(&["old.jpg", "new.jpg"]);
        let hour = Duration::from_secs(60 * 60);
        let cutoff = SystemTime::now() - hour;
        set_mtime(&dir.path().join("old.jpg"), cutoff - hour);
        set_mtime(&dir.path().join("new.jpg"), cutoff + hour);

        let found = enumerate_indexable_files(dir.path(), &[], Some(cutoff));
        assert_eq!(rel_paths(&found), vec!["new.jpg".to_string()]);
    }

    #[test]
    fn modified_since_is_inclusive() {
        // "At or after": a file whose mtime equals the cutoff is kept. The
        // cutoff is read back from the file so any rounding the filesystem
        // applied to the timestamp is already baked in.
        let dir = make_tree(&["exact.jpg"]);
        let cutoff = fs::metadata(dir.path().join("exact.jpg"))
            .and_then(|meta| meta.modified())
            .expect("read mtime");

        let found = enumerate_indexable_files(dir.path(), &[], Some(cutoff));
        assert_eq!(rel_paths(&found), vec!["exact.jpg".to_string()]);
    }

    #[test]
    fn rel_path_is_forward_slash() {
        // Sanity on a nested path. On Unix this is already '/'; the
        // assertion guards a future Windows port from regressing.
        let dir = make_tree(&["a/b/c.jpg"]);
        let found = enumerate_indexable_files(dir.path(), &[], None);
        // Fail with a readable message instead of an index panic if the
        // file was not found at all.
        assert_eq!(found.len(), 1, "expected exactly one indexed file");
        let (_abs, rel) = &found[0];
        assert_eq!(rel, "a/b/c.jpg");
        assert!(!rel.contains('\\'));
    }
}

View File

@@ -14,6 +14,7 @@ pub mod error;
pub mod exif;
pub mod face_watch;
pub mod faces;
pub mod file_scan;
pub mod file_types;
pub mod files;
pub mod geo;

View File

@@ -1974,37 +1974,11 @@ fn process_new_files(
let thumbnail_directory = Path::new(&thumbs);
let base_path = Path::new(&library.root_path);
// Collect all image and video files, optionally filtered by modification time
let files: Vec<(PathBuf, String)> = WalkDir::new(base_path)
.into_iter()
.filter_map(|entry| entry.ok())
.filter(|entry| entry.file_type().is_file())
.filter(|entry| {
// Filter by modification time if specified
if let Some(since) = modified_since {
if let Ok(metadata) = entry.metadata()
&& let Ok(modified) = metadata.modified()
{
return modified >= since;
}
// If we can't get metadata, include the file to be safe
return true;
}
true
})
.filter(|entry| is_image(entry) || is_video(entry))
.filter_map(|entry| {
let file_path = entry.path().to_path_buf();
// Canonical rel_path is forward-slash regardless of OS so DB
// comparisons against the batch EXIF lookup line up.
let relative_path = file_path
.strip_prefix(base_path)
.ok()?
.to_str()?
.replace('\\', "/");
Some((file_path, relative_path))
})
.collect();
// Walk, prune EXCLUDED_DIRS subtrees, and apply image/video + modified_since
// filters. See `file_scan` for why exclusion has to happen at WalkDir
// time (filter_entry) rather than at face-detect time.
let files: Vec<(PathBuf, String)> =
image_api::file_scan::enumerate_indexable_files(base_path, excluded_dirs, modified_since);
if files.is_empty() {
debug!("No files to process");