indexer: prune EXCLUDED_DIRS at WalkDir time, extract enumerate_indexable_files #63
200
src/file_scan.rs
Normal file
200
src/file_scan.rs
Normal file
@@ -0,0 +1,200 @@
|
||||
//! File enumeration for the indexer pass.
|
||||
//!
|
||||
//! Walks a library root and returns the `(absolute_path, forward_slash_rel_path)`
|
||||
//! pairs that belong in `image_exif`. Pruning `EXCLUDED_DIRS` happens here at
|
||||
//! WalkDir time via `filter_entry` so whole subtrees (Synology's `@eaDir`,
|
||||
//! `.thumbnails`, the operator's configured excludes) are never descended —
|
||||
//! vs walking the full tree and discarding leaves, which on a Synology mount
|
||||
//! with thousands of `@eaDir` subdirs is the difference between scanning N
|
||||
//! files and N×3.
|
||||
//!
|
||||
//! Previously inlined in `main.rs::process_new_files` without the exclusion
|
||||
//! filter — paths like `<lib>/@eaDir/.../SYNOFILE_THUMB_*.jpg` ended up in
|
||||
//! `image_exif` and looped through `face_watch::filter_excluded` every tick,
|
||||
//! since no `face_detections` row would ever be written for a path dropped
|
||||
//! at runtime.
|
||||
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::time::SystemTime;
|
||||
|
||||
use walkdir::WalkDir;
|
||||
|
||||
use crate::file_types;
|
||||
use crate::memories::PathExcluder;
|
||||
|
||||
/// Walk `base_path`, prune `EXCLUDED_DIRS` subtrees, and return
|
||||
/// `(absolute_path, forward_slash_rel_path)` for every image / video file
|
||||
/// that should be indexed.
|
||||
///
|
||||
/// `modified_since` keeps only files modified at or after the instant —
|
||||
/// used by the watcher's quick-scan tick to skip the long tail. Files
|
||||
/// whose metadata can't be read are kept; the caller's batch EXIF lookup
|
||||
/// dedups against existing rows.
|
||||
pub fn enumerate_indexable_files(
|
||||
base_path: &Path,
|
||||
excluded_dirs: &[String],
|
||||
modified_since: Option<SystemTime>,
|
||||
) -> Vec<(PathBuf, String)> {
|
||||
let excluder = PathExcluder::new(base_path, excluded_dirs);
|
||||
|
||||
WalkDir::new(base_path)
|
||||
.into_iter()
|
||||
// Prune whole subtrees so WalkDir doesn't descend into excluded
|
||||
// dirs at all. Always allow depth 0 (the root itself); under a
|
||||
// pathological config that excludes the base, downstream filters
|
||||
// would still drop everything anyway.
|
||||
.filter_entry(|entry| entry.depth() == 0 || !excluder.is_excluded(entry.path()))
|
||||
.filter_map(|entry| entry.ok())
|
||||
.filter(|entry| entry.file_type().is_file())
|
||||
.filter(|entry| match modified_since {
|
||||
Some(since) => entry
|
||||
.metadata()
|
||||
.ok()
|
||||
.and_then(|m| m.modified().ok())
|
||||
.map(|m| m >= since)
|
||||
.unwrap_or(true),
|
||||
None => true,
|
||||
})
|
||||
.filter(|entry| {
|
||||
file_types::direntry_is_image(entry) || file_types::direntry_is_video(entry)
|
||||
})
|
||||
.filter_map(|entry| {
|
||||
let file_path = entry.path().to_path_buf();
|
||||
// Forward-slash rel_path regardless of OS so DB comparisons
|
||||
// against the batch EXIF lookup line up.
|
||||
let rel = file_path
|
||||
.strip_prefix(base_path)
|
||||
.ok()?
|
||||
.to_str()?
|
||||
.replace('\\', "/");
|
||||
Some((file_path, rel))
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::time::Duration;

    /// Build a tempdir containing `paths` (relative to the tempdir root).
    /// Each file is created empty; parent directories are created as needed.
    fn make_tree(paths: &[&str]) -> tempfile::TempDir {
        let dir = tempfile::tempdir().expect("tempdir");
        for p in paths {
            let abs = dir.path().join(p);
            if let Some(parent) = abs.parent() {
                fs::create_dir_all(parent).expect("mkdir -p");
            }
            fs::File::create(&abs).expect("touch");
        }
        dir
    }

    /// Extract just the relative paths from `found`, sorted so assertions
    /// don't depend on directory iteration order.
    fn rel_paths(found: &[(PathBuf, String)]) -> Vec<String> {
        let mut v: Vec<String> = found.iter().map(|(_, r)| r.clone()).collect();
        v.sort();
        v
    }

    #[test]
    fn excludes_eadir_subtree() {
        // The bug: Synology's @eaDir gets walked into and its
        // SYNOFILE_THUMB_*.jpg leaves end up in image_exif. With
        // filter_entry pruning, the subtree is never descended.
        let dir = make_tree(&[
            "vacation/IMG_0001.jpg",
            "vacation/@eaDir/IMG_0001.jpg/SYNOFILE_THUMB_S.jpg",
            "vacation/@eaDir/IMG_0001.jpg/SYNOFILE_THUMB_XL.jpg",
            "@eaDir/top_level_thumb.jpg",
        ]);
        let found = enumerate_indexable_files(dir.path(), &["@eaDir".to_string()], None);
        assert_eq!(rel_paths(&found), vec!["vacation/IMG_0001.jpg".to_string()]);
    }

    #[test]
    fn excludes_nested_pattern() {
        // .thumbnails as a component pattern (not an absolute dir).
        let dir = make_tree(&[
            "a/b/photo.jpg",
            "a/.thumbnails/cached.jpg",
            "a/b/.thumbnails/nested.jpg",
        ]);
        let found = enumerate_indexable_files(dir.path(), &[".thumbnails".to_string()], None);
        assert_eq!(rel_paths(&found), vec!["a/b/photo.jpg".to_string()]);
    }

    #[test]
    fn excludes_absolute_under_base() {
        // Leading-'/' entries are interpreted as paths under the library
        // root (see PathExcluder::new).
        let dir = make_tree(&["private/secret.jpg", "public/keep.jpg"]);
        let found = enumerate_indexable_files(dir.path(), &["/private".to_string()], None);
        assert_eq!(rel_paths(&found), vec!["public/keep.jpg".to_string()]);
    }

    #[test]
    fn filters_non_media() {
        let dir = make_tree(&[
            "a.jpg",
            "b.mp4",
            "c.txt",
            "d",         // no extension
            "e.jpg.bak", // wrong ext
        ]);
        let found = enumerate_indexable_files(dir.path(), &[], None);
        assert_eq!(rel_paths(&found), vec!["a.jpg".to_string(), "b.mp4".to_string()]);
    }

    #[test]
    fn modified_since_filters_old_files() {
        let dir = make_tree(&["old.jpg", "new.jpg"]);
        let now = SystemTime::now();

        // Backdate old.jpg explicitly instead of sleeping between writes:
        // with an hour between the two mtimes and the cutoff halfway, the
        // outcome doesn't depend on the filesystem's mtime resolution or on
        // scheduler timing.
        let old = fs::File::options()
            .write(true)
            .open(dir.path().join("old.jpg"))
            .expect("open old.jpg");
        old.set_modified(now - Duration::from_secs(3600))
            .expect("backdate old.jpg");
        drop(old);

        // new.jpg keeps its creation mtime (roughly `now`), which is well
        // after the cutoff; old.jpg is well before it.
        let cutoff = now - Duration::from_secs(1800);

        let found = enumerate_indexable_files(dir.path(), &[], Some(cutoff));
        assert_eq!(rel_paths(&found), vec!["new.jpg".to_string()]);
    }

    #[test]
    fn rel_path_is_forward_slash() {
        // Sanity on a nested path. On Unix this is already '/'; the
        // assertion guards a future Windows port from regressing.
        let dir = make_tree(&["a/b/c.jpg"]);
        let found = enumerate_indexable_files(dir.path(), &[], None);
        // Fail with a readable message rather than an index panic if the
        // walk unexpectedly returns nothing (or too much).
        assert_eq!(found.len(), 1, "expected exactly one file, got {found:?}");
        let (_abs, rel) = &found[0];
        assert_eq!(rel, "a/b/c.jpg");
        assert!(!rel.contains('\\'));
    }
}
|
||||
@@ -14,6 +14,7 @@ pub mod error;
|
||||
pub mod exif;
|
||||
pub mod face_watch;
|
||||
pub mod faces;
|
||||
pub mod file_scan;
|
||||
pub mod file_types;
|
||||
pub mod files;
|
||||
pub mod geo;
|
||||
|
||||
36
src/main.rs
36
src/main.rs
@@ -1974,37 +1974,11 @@ fn process_new_files(
|
||||
let thumbnail_directory = Path::new(&thumbs);
|
||||
let base_path = Path::new(&library.root_path);
|
||||
|
||||
// Collect all image and video files, optionally filtered by modification time
|
||||
let files: Vec<(PathBuf, String)> = WalkDir::new(base_path)
|
||||
.into_iter()
|
||||
.filter_map(|entry| entry.ok())
|
||||
.filter(|entry| entry.file_type().is_file())
|
||||
.filter(|entry| {
|
||||
// Filter by modification time if specified
|
||||
if let Some(since) = modified_since {
|
||||
if let Ok(metadata) = entry.metadata()
|
||||
&& let Ok(modified) = metadata.modified()
|
||||
{
|
||||
return modified >= since;
|
||||
}
|
||||
// If we can't get metadata, include the file to be safe
|
||||
return true;
|
||||
}
|
||||
true
|
||||
})
|
||||
.filter(|entry| is_image(entry) || is_video(entry))
|
||||
.filter_map(|entry| {
|
||||
let file_path = entry.path().to_path_buf();
|
||||
// Canonical rel_path is forward-slash regardless of OS so DB
|
||||
// comparisons against the batch EXIF lookup line up.
|
||||
let relative_path = file_path
|
||||
.strip_prefix(base_path)
|
||||
.ok()?
|
||||
.to_str()?
|
||||
.replace('\\', "/");
|
||||
Some((file_path, relative_path))
|
||||
})
|
||||
.collect();
|
||||
// Walk, prune EXCLUDED_DIRS subtrees, and apply image/video + modified_since
|
||||
// filters. See `file_scan` for why exclusion has to happen at WalkDir
|
||||
// time (filter_entry) rather than at face-detect time.
|
||||
let files: Vec<(PathBuf, String)> =
|
||||
image_api::file_scan::enumerate_indexable_files(base_path, excluded_dirs, modified_since);
|
||||
|
||||
if files.is_empty() {
|
||||
debug!("No files to process");
|
||||
|
||||
Reference in New Issue
Block a user