indexer: prune EXCLUDED_DIRS at WalkDir time, extract enumerate_indexable_files #63

Merged
cameron merged 2 commits from feature/exclude-dirs-at-index-time into master 2026-04-30 20:24:19 +00:00
5 changed files with 293 additions and 63 deletions

235
src/file_scan.rs Normal file
View File

@@ -0,0 +1,235 @@
//! File enumeration for the indexer pass.
//!
//! Walks a library root and returns the `(absolute_path, forward_slash_rel_path)`
//! pairs that belong in `image_exif`. Pruning `EXCLUDED_DIRS` happens here at
//! WalkDir time via `filter_entry` so whole subtrees (Synology's `@eaDir`,
//! `.thumbnails`, the operator's configured excludes) are never descended.
//! The alternative — walking the full tree and discarding leaves afterwards —
//! visits roughly 3N files instead of N on a Synology mount with thousands of
//! `@eaDir` subdirs.
//!
//! Previously inlined in `main.rs::process_new_files` without the exclusion
//! filter — paths like `<lib>/@eaDir/.../SYNOFILE_THUMB_*.jpg` ended up in
//! `image_exif` and looped through `face_watch::filter_excluded` every tick,
//! since no `face_detections` row would ever be written for a path dropped
//! at runtime.
use std::path::{Path, PathBuf};
use std::time::SystemTime;
use walkdir::{DirEntry, WalkDir};
use crate::file_types;
use crate::memories::PathExcluder;
/// Collect every regular file under `base_path`, skipping any subtree that
/// matches `excluded_dirs`.
///
/// No extension filtering is applied here: this is the low-level walk that
/// thumbnail generation, media counts, orphan-playlist lookups and the
/// indexer all build on. Callers that only want media files (for example
/// `enumerate_indexable_files`) add their own media-type / mtime filters.
///
/// Exclusion is applied through WalkDir's `filter_entry`, so an excluded
/// directory is never opened — as opposed to being walked and having its
/// leaves thrown away. On a Synology mount with thousands of `@eaDir`
/// dirs, that's the difference between visiting N files and ~3N.
///
/// Entries the walker fails to read are skipped rather than reported.
pub fn walk_library_files(base_path: &Path, excluded_dirs: &[String]) -> Vec<DirEntry> {
    let excluder = PathExcluder::new(base_path, excluded_dirs);

    // The root (depth 0) is always let through. If a pathological config
    // excludes the base itself, downstream filters drop everything anyway,
    // but pruning the root here would silently yield nothing.
    let keep_subtree = move |candidate: &DirEntry| {
        if candidate.depth() == 0 {
            return true;
        }
        !excluder.is_excluded(candidate.path())
    };

    let mut files = Vec::new();
    for visited in WalkDir::new(base_path).into_iter().filter_entry(keep_subtree) {
        // Unreadable entries are dropped, not propagated.
        let Ok(entry) = visited else {
            continue;
        };
        if entry.file_type().is_file() {
            files.push(entry);
        }
    }
    files
}
/// Walk `base_path`, prune `EXCLUDED_DIRS` subtrees, and return
/// `(absolute_path, forward_slash_rel_path)` for every image / video file
/// that should be indexed.
///
/// # Parameters
/// - `base_path`: library root to walk; also the prefix stripped to build
///   the relative path.
/// - `excluded_dirs`: exclusion patterns forwarded to `walk_library_files`.
/// - `modified_since`: when `Some`, keeps only files modified at or after
///   the instant — used by the watcher's quick-scan tick to skip the long
///   tail. Files whose metadata can't be read are kept; the caller's batch
///   EXIF lookup dedups against existing rows.
///
/// # Returns
/// One `(path, rel_path)` pair per surviving file, where `rel_path` uses
/// `/` separators regardless of OS. Files whose path is not under
/// `base_path` or is not valid UTF-8 are silently skipped.
pub fn enumerate_indexable_files(
    base_path: &Path,
    excluded_dirs: &[String],
    modified_since: Option<SystemTime>,
) -> Vec<(PathBuf, String)> {
    walk_library_files(base_path, excluded_dirs)
        .into_iter()
        // Media-type check runs first so the mtime filter below only pays
        // for `metadata()` (which can cost a filesystem stat per call) on
        // files that could be indexed at all. Both filters are plain
        // predicates, so the order does not change which files survive.
        // NOTE(review): assumes the `file_types` helpers are side-effect
        // free — confirm against `file_types`.
        .filter(|entry| {
            file_types::direntry_is_image(entry) || file_types::direntry_is_video(entry)
        })
        .filter(|entry| {
            let Some(since) = modified_since else {
                return true;
            };
            entry
                .metadata()
                .ok()
                .and_then(|meta| meta.modified().ok())
                .map(|mtime| mtime >= since)
                // Unreadable metadata / mtime: keep the file to be safe.
                .unwrap_or(true)
        })
        .filter_map(|entry| {
            // Forward-slash rel_path regardless of OS so DB comparisons
            // against the batch EXIF lookup line up.
            let rel = entry
                .path()
                .strip_prefix(base_path)
                .ok()?
                .to_str()?
                .replace('\\', "/");
            // Move the path out of the entry instead of cloning it.
            Some((entry.into_path(), rel))
        })
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::time::Duration;

    /// Build a tempdir with `paths` (relative). Each touched file is empty;
    /// directory components are created automatically.
    fn make_tree(paths: &[&str]) -> tempfile::TempDir {
        let dir = tempfile::tempdir().expect("tempdir");
        for p in paths {
            let abs = dir.path().join(p);
            if let Some(parent) = abs.parent() {
                fs::create_dir_all(parent).expect("mkdir -p");
            }
            fs::File::create(&abs).expect("touch");
        }
        dir
    }

    /// Sorted relative paths from an `enumerate_indexable_files` result, so
    /// assertions don't depend on directory iteration order.
    fn rel_paths(found: &[(PathBuf, String)]) -> Vec<String> {
        let mut v: Vec<String> = found.iter().map(|(_, r)| r.clone()).collect();
        v.sort();
        v
    }

    #[test]
    fn excludes_eadir_subtree() {
        // The bug: Synology's @eaDir gets walked into and its
        // SYNOFILE_THUMB_*.jpg leaves end up in image_exif. With
        // filter_entry pruning, the subtree is never descended.
        let dir = make_tree(&[
            "vacation/IMG_0001.jpg",
            "vacation/@eaDir/IMG_0001.jpg/SYNOFILE_THUMB_S.jpg",
            "vacation/@eaDir/IMG_0001.jpg/SYNOFILE_THUMB_XL.jpg",
            "@eaDir/top_level_thumb.jpg",
        ]);
        let found = enumerate_indexable_files(dir.path(), &["@eaDir".to_string()], None);
        assert_eq!(rel_paths(&found), vec!["vacation/IMG_0001.jpg".to_string()]);
    }

    #[test]
    fn excludes_nested_pattern() {
        // .thumbnails as a component pattern (not an absolute dir).
        let dir = make_tree(&[
            "a/b/photo.jpg",
            "a/.thumbnails/cached.jpg",
            "a/b/.thumbnails/nested.jpg",
        ]);
        let found = enumerate_indexable_files(dir.path(), &[".thumbnails".to_string()], None);
        assert_eq!(rel_paths(&found), vec!["a/b/photo.jpg".to_string()]);
    }

    #[test]
    fn excludes_absolute_under_base() {
        // Leading-'/' entries are interpreted as paths under the library
        // root (see PathExcluder::new).
        let dir = make_tree(&["private/secret.jpg", "public/keep.jpg"]);
        let found = enumerate_indexable_files(dir.path(), &["/private".to_string()], None);
        assert_eq!(rel_paths(&found), vec!["public/keep.jpg".to_string()]);
    }

    #[test]
    fn filters_non_media() {
        let dir = make_tree(&[
            "a.jpg",
            "b.mp4",
            "c.txt",
            "d",         // no extension
            "e.jpg.bak", // wrong ext
        ]);
        let found = enumerate_indexable_files(dir.path(), &[], None);
        assert_eq!(
            rel_paths(&found),
            vec!["a.jpg".to_string(), "b.mp4".to_string()]
        );
    }

    #[test]
    fn modified_since_filters_old_files() {
        let dir = make_tree(&["old.jpg", "new.jpg"]);

        // Deterministic setup, no sleeps: put the cutoff 30 minutes in the
        // past and explicitly backdate old.jpg to 30 minutes before that.
        // new.jpg keeps its creation-time mtime (roughly "now"), which is
        // well after the cutoff even on filesystems with coarse mtime
        // resolution.
        let half_hour = Duration::from_secs(30 * 60);
        let cutoff = SystemTime::now() - half_hour;
        let old_file = fs::File::options()
            .write(true)
            .open(dir.path().join("old.jpg"))
            .expect("open old.jpg");
        old_file
            .set_modified(cutoff - half_hour)
            .expect("backdate old.jpg");
        drop(old_file);

        let found = enumerate_indexable_files(dir.path(), &[], Some(cutoff));
        assert_eq!(rel_paths(&found), vec!["new.jpg".to_string()]);
    }

    #[test]
    fn walk_library_files_excludes_subtrees_and_returns_all_extensions() {
        // The lower-level primitive: any extension survives, but excluded
        // subtrees are pruned. Used by thumbnail gen and media-count
        // gauges, which need non-media files too (e.g., walks through
        // sidecar XMPs alongside the photos).
        let dir = make_tree(&[
            "vacation/IMG_0001.jpg",
            "vacation/IMG_0001.xmp",
            "vacation/@eaDir/IMG_0001.jpg/SYNOFILE_THUMB_S.jpg",
            "notes.txt",
        ]);
        let mut got: Vec<String> = walk_library_files(dir.path(), &["@eaDir".to_string()])
            .into_iter()
            .map(|e| {
                e.path()
                    .strip_prefix(dir.path())
                    .unwrap()
                    .to_string_lossy()
                    .replace('\\', "/")
            })
            .collect();
        got.sort();
        assert_eq!(
            got,
            vec![
                "notes.txt".to_string(),
                "vacation/IMG_0001.jpg".to_string(),
                "vacation/IMG_0001.xmp".to_string(),
            ]
        );
    }

    #[test]
    fn rel_path_is_forward_slash() {
        // Sanity on a nested path. On Unix this is already '/'; the
        // assertion guards a future Windows port from regressing.
        let dir = make_tree(&["a/b/c.jpg"]);
        let found = enumerate_indexable_files(dir.path(), &[], None);
        // Check the length first so an empty result fails with a readable
        // message instead of an index-out-of-bounds panic.
        assert_eq!(found.len(), 1, "expected exactly one indexed file");
        let (_abs, rel) = &found[0];
        assert_eq!(rel, "a/b/c.jpg");
        assert!(!rel.contains('\\'));
    }
}

View File

@@ -1397,7 +1397,7 @@ impl Handler<RefreshThumbnailsMessage> for StreamActor {
// The stub in lib.rs is a no-op; the real generation is driven by
// the file watcher tick in main.rs, which has access to the
// configured libraries.
create_thumbnails(&[])
create_thumbnails(&[], &[])
}
}

View File

@@ -14,6 +14,7 @@ pub mod error;
pub mod exif;
pub mod face_watch;
pub mod faces;
pub mod file_scan;
pub mod file_types;
pub mod files;
pub mod geo;
@@ -39,11 +40,11 @@ pub use state::AppState;
use std::path::Path;
use walkdir::DirEntry;
pub fn create_thumbnails(_libs: &[libraries::Library]) {
pub fn create_thumbnails(_libs: &[libraries::Library], _excluded_dirs: &[String]) {
// Stub - implemented in main.rs
}
pub fn update_media_counts(_media_dir: &Path) {
pub fn update_media_counts(_media_dir: &Path, _excluded_dirs: &[String]) {
// Stub - implemented in main.rs
}

View File

@@ -1292,7 +1292,7 @@ fn generate_image_thumbnail(src: &Path, thumb_path: &Path) -> std::io::Result<()
Ok(())
}
fn create_thumbnails(libs: &[libraries::Library]) {
fn create_thumbnails(libs: &[libraries::Library], excluded_dirs: &[String]) {
let tracer = global_tracer();
let span = tracer.start("creating thumbnails");
@@ -1306,12 +1306,10 @@ fn create_thumbnails(libs: &[libraries::Library]) {
);
let images = PathBuf::from(&lib.root_path);
WalkDir::new(&images)
.into_iter()
.collect::<Vec<Result<_, _>>>()
// Prune EXCLUDED_DIRS so we don't generate thumbnails-of-thumbnails
// for Synology @eaDir trees. file_scan handles filter_entry pruning.
image_api::file_scan::walk_library_files(&images, excluded_dirs)
.into_par_iter()
.filter_map(|entry| entry.ok())
.filter(|entry| entry.file_type().is_file())
.for_each(|entry| {
let src = entry.path();
let Ok(relative_path) = src.strip_prefix(&images) else {
@@ -1367,17 +1365,17 @@ fn create_thumbnails(libs: &[libraries::Library]) {
debug!("Finished making thumbnails");
for lib in libs {
update_media_counts(Path::new(&lib.root_path));
update_media_counts(Path::new(&lib.root_path), excluded_dirs);
}
}
fn update_media_counts(media_dir: &Path) {
fn update_media_counts(media_dir: &Path, excluded_dirs: &[String]) {
let mut image_count = 0;
let mut video_count = 0;
for ref entry in WalkDir::new(media_dir).into_iter().filter_map(|e| e.ok()) {
if is_image(entry) {
for entry in image_api::file_scan::walk_library_files(media_dir, excluded_dirs) {
if is_image(&entry) {
image_count += 1;
} else if is_video(entry) {
} else if is_video(&entry) {
video_count += 1;
}
}
@@ -1426,8 +1424,9 @@ fn main() -> std::io::Result<()> {
// so missed files are filled in over successive scans.
{
let libs = app_data.libraries.clone();
let excluded = app_data.excluded_dirs.clone();
std::thread::spawn(move || {
create_thumbnails(&libs);
create_thumbnails(&libs, &excluded);
});
}
// generate_video_gifs().await;
@@ -1466,7 +1465,7 @@ fn main() -> std::io::Result<()> {
);
// Start orphaned playlist cleanup job
cleanup_orphaned_playlists();
cleanup_orphaned_playlists(app_state.excluded_dirs.clone());
// Spawn background job to generate daily conversation summaries
{
@@ -1658,8 +1657,8 @@ fn run_migrations(
}
/// Clean up orphaned HLS playlists and segments whose source videos no longer exist
fn cleanup_orphaned_playlists() {
std::thread::spawn(|| {
fn cleanup_orphaned_playlists(excluded_dirs: Vec<String>) {
std::thread::spawn(move || {
let video_path = dotenv::var("VIDEO_PATH").expect("VIDEO_PATH must be set");
let base_path = dotenv::var("BASE_PATH").expect("BASE_PATH must be set");
@@ -1704,13 +1703,14 @@ fn cleanup_orphaned_playlists() {
if let Some(filename) = playlist_path.file_stem() {
let video_filename = filename.to_string_lossy();
// Search for this video file in BASE_PATH
// Search for this video file in BASE_PATH, respecting
// EXCLUDED_DIRS so we don't false-resurrect playlists for
// videos that only exist inside an excluded subtree.
let mut video_exists = false;
for entry in WalkDir::new(&base_path)
.into_iter()
.filter_map(|e| e.ok())
.filter(|e| e.file_type().is_file())
{
for entry in image_api::file_scan::walk_library_files(
Path::new(&base_path),
&excluded_dirs,
) {
if let Some(entry_stem) = entry.path().file_stem()
&& entry_stem == filename
&& is_video_file(entry.path())
@@ -1922,7 +1922,7 @@ fn watch_files(
}
// Update media counts per library (metric aggregates across all)
update_media_counts(Path::new(&lib.root_path));
update_media_counts(Path::new(&lib.root_path), &excluded_dirs);
}
if is_full_scan {
@@ -1974,37 +1974,11 @@ fn process_new_files(
let thumbnail_directory = Path::new(&thumbs);
let base_path = Path::new(&library.root_path);
// Collect all image and video files, optionally filtered by modification time
let files: Vec<(PathBuf, String)> = WalkDir::new(base_path)
.into_iter()
.filter_map(|entry| entry.ok())
.filter(|entry| entry.file_type().is_file())
.filter(|entry| {
// Filter by modification time if specified
if let Some(since) = modified_since {
if let Ok(metadata) = entry.metadata()
&& let Ok(modified) = metadata.modified()
{
return modified >= since;
}
// If we can't get metadata, include the file to be safe
return true;
}
true
})
.filter(|entry| is_image(entry) || is_video(entry))
.filter_map(|entry| {
let file_path = entry.path().to_path_buf();
// Canonical rel_path is forward-slash regardless of OS so DB
// comparisons against the batch EXIF lookup line up.
let relative_path = file_path
.strip_prefix(base_path)
.ok()?
.to_str()?
.replace('\\', "/");
Some((file_path, relative_path))
})
.collect();
// Walk, prune EXCLUDED_DIRS subtrees, and apply image/video + modified_since
// filters. See `file_scan` for why exclusion has to happen at WalkDir
// time (filter_entry) rather than at face-detect time.
let files: Vec<(PathBuf, String)> =
image_api::file_scan::enumerate_indexable_files(base_path, excluded_dirs, modified_since);
if files.is_empty() {
debug!("No files to process");
@@ -2255,7 +2229,7 @@ fn process_new_files(
// Generate thumbnails for all files that need them
if new_files_found {
info!("Processing thumbnails for new files...");
create_thumbnails(std::slice::from_ref(library));
create_thumbnails(std::slice::from_ref(library), excluded_dirs);
}
// Reconciliation: on a full scan, prune image_exif rows whose rel_path no
@@ -2335,7 +2309,8 @@ fn backfill_unhashed_backlog(
// library's tick. Negligible cost given the cap.
let rows: Vec<(i32, String)> = {
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
dao.get_rows_missing_hash(context, cap + 1).unwrap_or_default()
dao.get_rows_missing_hash(context, cap + 1)
.unwrap_or_default()
};
if rows.is_empty() {
return 0;
@@ -2361,9 +2336,13 @@ fn backfill_unhashed_backlog(
match content_hash::compute(&abs) {
Ok(id) => {
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
if let Err(e) =
dao.backfill_content_hash(context, library.id, rel_path, &id.content_hash, id.size_bytes)
{
if let Err(e) = dao.backfill_content_hash(
context,
library.id,
rel_path,
&id.content_hash,
id.size_bytes,
) {
warn!(
"face_watch: backfill_content_hash failed for {}: {:?}",
rel_path, e
@@ -2374,7 +2353,11 @@ fn backfill_unhashed_backlog(
}
}
Err(e) => {
debug!("face_watch: hash compute failed for {} ({:?})", abs.display(), e);
debug!(
"face_watch: hash compute failed for {} ({:?})",
abs.display(),
e
);
errors += 1;
}
}

View File

@@ -24,6 +24,17 @@ pub async fn generate_video_gifs() {
fs::create_dir_all(gif_base_path).expect("There was an issue creating directory");
let files = PathBuf::from(dotenv::var("BASE_PATH").unwrap());
// EXCLUDED_DIRS read here for the update_media_counts call below.
// The WalkDir walk itself is left raw — this function is currently
// dead code (`#[allow(dead_code)]`) and not reachable from main.
// If revived, swap to file_scan::walk_library_files (dual lib/bin
// module path makes that non-trivial here).
let excluded_dirs: Vec<String> = std::env::var("EXCLUDED_DIRS")
.unwrap_or_default()
.split(',')
.filter(|s| !s.trim().is_empty())
.map(|s| s.trim().to_string())
.collect();
let ffmpeg = Ffmpeg;
for file in WalkDir::new(&files)
@@ -62,6 +73,6 @@ pub async fn generate_video_gifs() {
info!("Finished making video gifs in {:?}", start.elapsed());
update_media_counts(&files);
update_media_counts(&files, &excluded_dirs);
});
}