indexer: prune EXCLUDED_DIRS at WalkDir time, extract enumerate_indexable_files
Synology drops `@eaDir/.../SYNOFILE_THUMB_*.jpg` files alongside every
photo. The face-detect pipeline already filters those out via
`face_watch::filter_excluded`, but the filter runs *after* the indexer
has already inserted rows into `image_exif`. Result: phantom rows whose
content_hash never matches a `face_detections` row, so the anti-join in
`list_unscanned_candidates` returns them every tick. They're filtered
out at runtime, no marker is written, and the cycle repeats forever —
log spam, wrong stats denominator, and on a real Synology library the
phantom rows balloon into the hundreds of thousands.
Move the exclusion to the WalkDir pass, where filter_entry can prune
whole subtrees instead of walking and discarding leaves. Extract the
pre-existing 30-line walker chain in main.rs::process_new_files into
`file_scan::enumerate_indexable_files` so it's testable in isolation.
Six tests cover the bug (eadir prune), nested patterns, absolute-under-base
syntax, non-media filtering, modified_since semantics, and forward-slash
rel_path normalization.
Out of scope (other WalkDir callers in main.rs that don't yet apply
EXCLUDED_DIRS — thumbnail gen at 1309, media scan at 1377, video
playlist scan at 1685, and two nested walks at 1709 / 1743): separate
audit PR.
Operator note: existing phantom rows still need a one-shot cleanup —
DELETE FROM face_detections WHERE content_hash IN (
SELECT content_hash FROM image_exif
WHERE rel_path LIKE '%/@eaDir/%' OR rel_path LIKE '@eaDir/%'
);
DELETE FROM image_exif WHERE rel_path LIKE '%/@eaDir/%' OR rel_path LIKE '@eaDir/%';
Run before attaching a fresh Synology-sourced library.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
200
src/file_scan.rs
Normal file
200
src/file_scan.rs
Normal file
@@ -0,0 +1,200 @@
|
||||
//! File enumeration for the indexer pass.
|
||||
//!
|
||||
//! Walks a library root and returns the `(absolute_path, forward_slash_rel_path)`
|
||||
//! pairs that belong in `image_exif`. Pruning `EXCLUDED_DIRS` happens here at
|
||||
//! WalkDir time via `filter_entry` so whole subtrees (Synology's `@eaDir`,
|
||||
//! `.thumbnails`, the operator's configured excludes) are never descended —
|
||||
//! vs walking the full tree and discarding leaves, which on a Synology mount
|
||||
//! with thousands of `@eaDir` subdirs is the difference between scanning N
|
||||
//! files and N×3.
|
||||
//!
|
||||
//! Previously inlined in `main.rs::process_new_files` without the exclusion
|
||||
//! filter — paths like `<lib>/@eaDir/.../SYNOFILE_THUMB_*.jpg` ended up in
|
||||
//! `image_exif` and looped through `face_watch::filter_excluded` every tick,
|
||||
//! since no `face_detections` row would ever be written for a path dropped
|
||||
//! at runtime.
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::SystemTime;
|
||||
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::file_types;
|
||||
use crate::memories::PathExcluder;
|
||||
|
||||
/// Walk `base_path`, prune `EXCLUDED_DIRS` subtrees, and return
|
||||
/// `(absolute_path, forward_slash_rel_path)` for every image / video file
|
||||
/// that should be indexed.
|
||||
///
|
||||
/// `modified_since` keeps only files modified at or after the instant —
|
||||
/// used by the watcher's quick-scan tick to skip the long tail. Files
|
||||
/// whose metadata can't be read are kept; the caller's batch EXIF lookup
|
||||
/// dedups against existing rows.
|
||||
pub fn enumerate_indexable_files(
|
||||
base_path: &Path,
|
||||
excluded_dirs: &[String],
|
||||
modified_since: Option<SystemTime>,
|
||||
) -> Vec<(PathBuf, String)> {
|
||||
let excluder = PathExcluder::new(base_path, excluded_dirs);
|
||||
|
||||
WalkDir::new(base_path)
|
||||
.into_iter()
|
||||
// Prune whole subtrees so WalkDir doesn't descend into excluded
|
||||
// dirs at all. Always allow depth 0 (the root itself); under a
|
||||
// pathological config that excludes the base, downstream filters
|
||||
// would still drop everything anyway.
|
||||
.filter_entry(|entry| entry.depth() == 0 || !excluder.is_excluded(entry.path()))
|
||||
.filter_map(|entry| entry.ok())
|
||||
.filter(|entry| entry.file_type().is_file())
|
||||
.filter(|entry| match modified_since {
|
||||
Some(since) => entry
|
||||
.metadata()
|
||||
.ok()
|
||||
.and_then(|m| m.modified().ok())
|
||||
.map(|m| m >= since)
|
||||
.unwrap_or(true),
|
||||
None => true,
|
||||
})
|
||||
.filter(|entry| {
|
||||
file_types::direntry_is_image(entry) || file_types::direntry_is_video(entry)
|
||||
})
|
||||
.filter_map(|entry| {
|
||||
let file_path = entry.path().to_path_buf();
|
||||
// Forward-slash rel_path regardless of OS so DB comparisons
|
||||
// against the batch EXIF lookup line up.
|
||||
let rel = file_path
|
||||
.strip_prefix(base_path)
|
||||
.ok()?
|
||||
.to_str()?
|
||||
.replace('\\', "/");
|
||||
Some((file_path, rel))
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::time::Duration;

    /// Build a tempdir with `paths` (relative). Each touched file is empty;
    /// directory components are created automatically.
    fn make_tree(paths: &[&str]) -> tempfile::TempDir {
        let dir = tempfile::tempdir().expect("tempdir");
        for p in paths {
            let abs = dir.path().join(p);
            if let Some(parent) = abs.parent() {
                fs::create_dir_all(parent).expect("mkdir -p");
            }
            fs::File::create(&abs).expect("touch");
        }
        dir
    }

    /// Sorted relative paths from an enumeration result, so assertions do
    /// not depend on directory iteration order.
    fn rel_paths(found: &[(PathBuf, String)]) -> Vec<String> {
        let mut v: Vec<String> = found.iter().map(|(_, r)| r.clone()).collect();
        v.sort();
        v
    }

    /// Set the modification time of an existing file to `when`.
    fn set_mtime(path: &Path, when: SystemTime) {
        fs::OpenOptions::new()
            .write(true)
            .open(path)
            .expect("open for mtime update")
            .set_modified(when)
            .expect("set mtime");
    }

    #[test]
    fn excludes_eadir_subtree() {
        // The bug: Synology's @eaDir gets walked into and its
        // SYNOFILE_THUMB_*.jpg leaves end up in image_exif. With
        // filter_entry pruning, the subtree is never descended.
        let dir = make_tree(&[
            "vacation/IMG_0001.jpg",
            "vacation/@eaDir/IMG_0001.jpg/SYNOFILE_THUMB_S.jpg",
            "vacation/@eaDir/IMG_0001.jpg/SYNOFILE_THUMB_XL.jpg",
            "@eaDir/top_level_thumb.jpg",
        ]);
        let found = enumerate_indexable_files(dir.path(), &["@eaDir".to_string()], None);
        assert_eq!(rel_paths(&found), vec!["vacation/IMG_0001.jpg".to_string()]);
    }

    #[test]
    fn excludes_nested_pattern() {
        // .thumbnails as a component pattern (not an absolute dir).
        let dir = make_tree(&[
            "a/b/photo.jpg",
            "a/.thumbnails/cached.jpg",
            "a/b/.thumbnails/nested.jpg",
        ]);
        let found = enumerate_indexable_files(dir.path(), &[".thumbnails".to_string()], None);
        assert_eq!(rel_paths(&found), vec!["a/b/photo.jpg".to_string()]);
    }

    #[test]
    fn excludes_absolute_under_base() {
        // Leading-'/' entries are interpreted as paths under the library
        // root (see PathExcluder::new).
        let dir = make_tree(&["private/secret.jpg", "public/keep.jpg"]);
        let found = enumerate_indexable_files(dir.path(), &["/private".to_string()], None);
        assert_eq!(rel_paths(&found), vec!["public/keep.jpg".to_string()]);
    }

    #[test]
    fn filters_non_media() {
        let dir = make_tree(&[
            "a.jpg",
            "b.mp4",
            "c.txt",
            "d",         // no extension
            "e.jpg.bak", // wrong ext
        ]);
        let found = enumerate_indexable_files(dir.path(), &[], None);
        assert_eq!(
            rel_paths(&found),
            vec!["a.jpg".to_string(), "b.mp4".to_string()]
        );
    }

    #[test]
    fn modified_since_filters_old_files() {
        let dir = make_tree(&["old.jpg", "exact.jpg", "new.jpg"]);

        // Pin every mtime explicitly instead of sleeping around
        // `SystemTime::now()`: the outcome then does not depend on the
        // filesystem's timestamp resolution or on scheduler timing. The
        // cutoff is a whole, even number of seconds so that coarse-grained
        // filesystems can store it exactly.
        let cutoff = SystemTime::UNIX_EPOCH + Duration::from_secs(1_700_000_000);
        let hour = Duration::from_secs(3600);
        set_mtime(&dir.path().join("old.jpg"), cutoff - hour);
        set_mtime(&dir.path().join("exact.jpg"), cutoff);
        set_mtime(&dir.path().join("new.jpg"), cutoff + hour);

        let found = enumerate_indexable_files(dir.path(), &[], Some(cutoff));

        // "At or after": the file stamped exactly at the cutoff is kept.
        assert_eq!(
            rel_paths(&found),
            vec!["exact.jpg".to_string(), "new.jpg".to_string()]
        );
    }

    #[test]
    fn rel_path_is_forward_slash() {
        // Sanity on a nested path. On Unix this is already '/'; the
        // assertion guards a future Windows port from regressing.
        let dir = make_tree(&["a/b/c.jpg"]);
        let found = enumerate_indexable_files(dir.path(), &[], None);
        // Fail with a readable message rather than an index panic.
        assert_eq!(found.len(), 1, "expected exactly one indexed file");
        let (_abs, rel) = &found[0];
        assert_eq!(rel, "a/b/c.jpg");
        assert!(!rel.contains('\\'));
    }
}
|
||||
Reference in New Issue
Block a user