indexer: prune EXCLUDED_DIRS at WalkDir time, extract enumerate_indexable_files #63
200
src/file_scan.rs
Normal file
200
src/file_scan.rs
Normal file
@@ -0,0 +1,200 @@
|
|||||||
|
//! File enumeration for the indexer pass.
|
||||||
|
//!
|
||||||
|
//! Walks a library root and returns the `(absolute_path, forward_slash_rel_path)`
|
||||||
|
//! pairs that belong in `image_exif`. Pruning `EXCLUDED_DIRS` happens here at
|
||||||
|
//! WalkDir time via `filter_entry` so whole subtrees (Synology's `@eaDir`,
|
||||||
|
//! `.thumbnails`, the operator's configured excludes) are never descended —
|
||||||
|
//! vs walking the full tree and discarding leaves, which on a Synology mount
|
||||||
|
//! with thousands of `@eaDir` subdirs is the difference between scanning N
|
||||||
|
//! files and N×3.
|
||||||
|
//!
|
||||||
|
//! Previously inlined in `main.rs::process_new_files` without the exclusion
|
||||||
|
//! filter — paths like `<lib>/@eaDir/.../SYNOFILE_THUMB_*.jpg` ended up in
|
||||||
|
//! `image_exif` and looped through `face_watch::filter_excluded` every tick,
|
||||||
|
//! since no `face_detections` row would ever be written for a path dropped
|
||||||
|
//! at runtime.
|
||||||
|
|
||||||
|
use std::path::{Path, PathBuf};
|
||||||
|
use std::time::SystemTime;
|
||||||
|
|
||||||
|
use walkdir::WalkDir;
|
||||||
|
|
||||||
|
use crate::file_types;
|
||||||
|
use crate::memories::PathExcluder;
|
||||||
|
|
||||||
|
/// Walk `base_path`, prune `EXCLUDED_DIRS` subtrees, and return
|
||||||
|
/// `(absolute_path, forward_slash_rel_path)` for every image / video file
|
||||||
|
/// that should be indexed.
|
||||||
|
///
|
||||||
|
/// `modified_since` keeps only files modified at or after the instant —
|
||||||
|
/// used by the watcher's quick-scan tick to skip the long tail. Files
|
||||||
|
/// whose metadata can't be read are kept; the caller's batch EXIF lookup
|
||||||
|
/// dedups against existing rows.
|
||||||
|
pub fn enumerate_indexable_files(
|
||||||
|
base_path: &Path,
|
||||||
|
excluded_dirs: &[String],
|
||||||
|
modified_since: Option<SystemTime>,
|
||||||
|
) -> Vec<(PathBuf, String)> {
|
||||||
|
let excluder = PathExcluder::new(base_path, excluded_dirs);
|
||||||
|
|
||||||
|
WalkDir::new(base_path)
|
||||||
|
.into_iter()
|
||||||
|
// Prune whole subtrees so WalkDir doesn't descend into excluded
|
||||||
|
// dirs at all. Always allow depth 0 (the root itself); under a
|
||||||
|
// pathological config that excludes the base, downstream filters
|
||||||
|
// would still drop everything anyway.
|
||||||
|
.filter_entry(|entry| entry.depth() == 0 || !excluder.is_excluded(entry.path()))
|
||||||
|
.filter_map(|entry| entry.ok())
|
||||||
|
.filter(|entry| entry.file_type().is_file())
|
||||||
|
.filter(|entry| match modified_since {
|
||||||
|
Some(since) => entry
|
||||||
|
.metadata()
|
||||||
|
.ok()
|
||||||
|
.and_then(|m| m.modified().ok())
|
||||||
|
.map(|m| m >= since)
|
||||||
|
.unwrap_or(true),
|
||||||
|
None => true,
|
||||||
|
})
|
||||||
|
.filter(|entry| {
|
||||||
|
file_types::direntry_is_image(entry) || file_types::direntry_is_video(entry)
|
||||||
|
})
|
||||||
|
.filter_map(|entry| {
|
||||||
|
let file_path = entry.path().to_path_buf();
|
||||||
|
// Forward-slash rel_path regardless of OS so DB comparisons
|
||||||
|
// against the batch EXIF lookup line up.
|
||||||
|
let rel = file_path
|
||||||
|
.strip_prefix(base_path)
|
||||||
|
.ok()?
|
||||||
|
.to_str()?
|
||||||
|
.replace('\\', "/");
|
||||||
|
Some((file_path, rel))
|
||||||
|
})
|
||||||
|
.collect()
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::time::Duration;

    /// Build a tempdir with `paths` (relative). Each touched file is empty;
    /// directory components are created automatically.
    fn make_tree(paths: &[&str]) -> tempfile::TempDir {
        let dir = tempfile::tempdir().expect("tempdir");
        for p in paths {
            let abs = dir.path().join(p);
            if let Some(parent) = abs.parent() {
                fs::create_dir_all(parent).expect("mkdir -p");
            }
            fs::File::create(&abs).expect("touch");
        }
        dir
    }

    /// Sorted relative paths from a scan result, so assertions don't depend
    /// on directory iteration order.
    fn rel_paths(found: &[(PathBuf, String)]) -> Vec<String> {
        let mut v: Vec<String> = found.iter().map(|(_, r)| r.clone()).collect();
        v.sort();
        v
    }

    /// Pin `path`'s modification time to `mtime`.
    fn set_mtime(path: &Path, mtime: SystemTime) {
        fs::OpenOptions::new()
            .write(true)
            .open(path)
            .expect("open for mtime update")
            .set_modified(mtime)
            .expect("set mtime");
    }

    #[test]
    fn excludes_eadir_subtree() {
        // The bug: Synology's @eaDir gets walked into and its
        // SYNOFILE_THUMB_*.jpg leaves end up in image_exif. With
        // filter_entry pruning, the subtree is never descended.
        let dir = make_tree(&[
            "vacation/IMG_0001.jpg",
            "vacation/@eaDir/IMG_0001.jpg/SYNOFILE_THUMB_S.jpg",
            "vacation/@eaDir/IMG_0001.jpg/SYNOFILE_THUMB_XL.jpg",
            "@eaDir/top_level_thumb.jpg",
        ]);
        let found = enumerate_indexable_files(dir.path(), &["@eaDir".to_string()], None);
        assert_eq!(rel_paths(&found), vec!["vacation/IMG_0001.jpg".to_string()]);
    }

    #[test]
    fn excludes_nested_pattern() {
        // .thumbnails as a component pattern (not an absolute dir).
        let dir = make_tree(&[
            "a/b/photo.jpg",
            "a/.thumbnails/cached.jpg",
            "a/b/.thumbnails/nested.jpg",
        ]);
        let found = enumerate_indexable_files(dir.path(), &[".thumbnails".to_string()], None);
        assert_eq!(rel_paths(&found), vec!["a/b/photo.jpg".to_string()]);
    }

    #[test]
    fn excludes_absolute_under_base() {
        // Leading-'/' entries are interpreted as paths under the library
        // root (see PathExcluder::new).
        let dir = make_tree(&["private/secret.jpg", "public/keep.jpg"]);
        let found = enumerate_indexable_files(dir.path(), &["/private".to_string()], None);
        assert_eq!(rel_paths(&found), vec!["public/keep.jpg".to_string()]);
    }

    #[test]
    fn filters_non_media() {
        let dir = make_tree(&[
            "a.jpg",
            "b.mp4",
            "c.txt",
            "d",         // no extension
            "e.jpg.bak", // wrong ext
        ]);
        let found = enumerate_indexable_files(dir.path(), &[], None);
        assert_eq!(
            rel_paths(&found),
            vec!["a.jpg".to_string(), "b.mp4".to_string()]
        );
    }

    #[test]
    fn modified_since_filters_old_files() {
        let dir = make_tree(&["old.jpg", "new.jpg"]);
        // Pin both mtimes explicitly around a fixed, whole-second cutoff
        // rather than sleeping and relying on wall-clock ordering: a short
        // sleep is not enough on filesystems with coarse mtime resolution,
        // where the stored mtime can be truncated to before the cutoff.
        let cutoff = SystemTime::UNIX_EPOCH + Duration::from_secs(1_700_000_000);
        let hour = Duration::from_secs(3600);
        set_mtime(&dir.path().join("old.jpg"), cutoff - hour);
        set_mtime(&dir.path().join("new.jpg"), cutoff + hour);

        let found = enumerate_indexable_files(dir.path(), &[], Some(cutoff));
        assert_eq!(rel_paths(&found), vec!["new.jpg".to_string()]);
    }

    #[test]
    fn rel_path_is_forward_slash() {
        // Sanity on a nested path. On Unix this is already '/'; the
        // assertion guards a future Windows port from regressing.
        let dir = make_tree(&["a/b/c.jpg"]);
        let found = enumerate_indexable_files(dir.path(), &[], None);
        // Check the length first so a regression fails with a readable
        // message instead of an index-out-of-bounds panic.
        assert_eq!(found.len(), 1, "expected exactly one file, got {found:?}");
        let (abs, rel) = &found[0];
        assert_eq!(rel, "a/b/c.jpg");
        assert!(!rel.contains('\\'));
        // The first tuple element is the on-disk path under the root.
        assert_eq!(abs, &dir.path().join("a").join("b").join("c.jpg"));
    }
}
|
||||||
@@ -14,6 +14,7 @@ pub mod error;
|
|||||||
pub mod exif;
|
pub mod exif;
|
||||||
pub mod face_watch;
|
pub mod face_watch;
|
||||||
pub mod faces;
|
pub mod faces;
|
||||||
|
pub mod file_scan;
|
||||||
pub mod file_types;
|
pub mod file_types;
|
||||||
pub mod files;
|
pub mod files;
|
||||||
pub mod geo;
|
pub mod geo;
|
||||||
|
|||||||
36
src/main.rs
36
src/main.rs
@@ -1974,37 +1974,11 @@ fn process_new_files(
|
|||||||
let thumbnail_directory = Path::new(&thumbs);
|
let thumbnail_directory = Path::new(&thumbs);
|
||||||
let base_path = Path::new(&library.root_path);
|
let base_path = Path::new(&library.root_path);
|
||||||
|
|
||||||
// Collect all image and video files, optionally filtered by modification time
|
// Walk, prune EXCLUDED_DIRS subtrees, and apply image/video + modified_since
|
||||||
let files: Vec<(PathBuf, String)> = WalkDir::new(base_path)
|
// filters. See `file_scan` for why exclusion has to happen at WalkDir
|
||||||
.into_iter()
|
// time (filter_entry) rather than at face-detect time.
|
||||||
.filter_map(|entry| entry.ok())
|
let files: Vec<(PathBuf, String)> =
|
||||||
.filter(|entry| entry.file_type().is_file())
|
image_api::file_scan::enumerate_indexable_files(base_path, excluded_dirs, modified_since);
|
||||||
.filter(|entry| {
|
|
||||||
// Filter by modification time if specified
|
|
||||||
if let Some(since) = modified_since {
|
|
||||||
if let Ok(metadata) = entry.metadata()
|
|
||||||
&& let Ok(modified) = metadata.modified()
|
|
||||||
{
|
|
||||||
return modified >= since;
|
|
||||||
}
|
|
||||||
// If we can't get metadata, include the file to be safe
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
true
|
|
||||||
})
|
|
||||||
.filter(|entry| is_image(entry) || is_video(entry))
|
|
||||||
.filter_map(|entry| {
|
|
||||||
let file_path = entry.path().to_path_buf();
|
|
||||||
// Canonical rel_path is forward-slash regardless of OS so DB
|
|
||||||
// comparisons against the batch EXIF lookup line up.
|
|
||||||
let relative_path = file_path
|
|
||||||
.strip_prefix(base_path)
|
|
||||||
.ok()?
|
|
||||||
.to_str()?
|
|
||||||
.replace('\\', "/");
|
|
||||||
Some((file_path, relative_path))
|
|
||||||
})
|
|
||||||
.collect();
|
|
||||||
|
|
||||||
if files.is_empty() {
|
if files.is_empty() {
|
||||||
debug!("No files to process");
|
debug!("No files to process");
|
||||||
|
|||||||
Reference in New Issue
Block a user