Files
ImageApi/src/file_scan.rs
T
Cameron Cordes 449ce1fda1 chore: resolve all clippy warnings and formatting
- Replace impl ToString with impl Display for InsightJobStatus and
  InsightGenerationType
- Rename from_str → parse to avoid confusion with std::str::FromStr
- Collapse nested if statements (handlers, insight_chat, insight_generator,
  image handlers)
- Use is_multiple_of() instead of manual modulo checks
- Suppress deprecated diesel::dsl::count_distinct (no drop-in replacement
  available in current Diesel version)
- Scope MutexGuard in synthesize_merge to drop before await
- Allow dead_code on generate_no_think, enumerate_indexable_files,
  total_deleted (intended for future use)
- Allow type_complexity on Diesel query result tuples

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-27 13:13:48 -04:00

237 lines
9.1 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! File enumeration for the indexer pass.
//!
//! Walks a library root and returns the `(absolute_path, forward_slash_rel_path)`
//! pairs that belong in `image_exif`. Pruning `EXCLUDED_DIRS` happens here at
//! WalkDir time via `filter_entry`, so whole subtrees (Synology's `@eaDir`,
//! `.thumbnails`, the operator's configured excludes) are never descended
//! into, instead of walking the full tree and discarding leaves afterwards.
//! On a Synology mount with thousands of `@eaDir` subdirs, that is the
//! difference between scanning N files and roughly 3N.
//!
//! Previously inlined in `main.rs::process_new_files` without the exclusion
//! filter — paths like `<lib>/@eaDir/.../SYNOFILE_THUMB_*.jpg` ended up in
//! `image_exif` and looped through `face_watch::filter_excluded` every tick,
//! since no `face_detections` row would ever be written for a path dropped
//! at runtime.
use std::path::{Path, PathBuf};
use std::time::SystemTime;
use walkdir::{DirEntry, WalkDir};
use crate::file_types;
use crate::memories::PathExcluder;
/// Collect every regular file beneath `base_path` (any extension), skipping
/// the subtrees ruled out by `excluded_dirs` (`EXCLUDED_DIRS`).
///
/// This is the shared primitive for code that walks a library root —
/// thumbnail generation, media counts, orphan-playlist reverse lookups, the
/// indexer happy-path, etc. Higher-level helpers such as
/// `enumerate_indexable_files` layer media-type / mtime filters on top.
///
/// Exclusion is applied through `filter_entry`, so an excluded directory is
/// pruned before it is descended rather than having its leaves discarded
/// afterwards. On a Synology mount with thousands of `@eaDir` dirs, that's
/// the difference between visiting N files and ~3N.
///
/// Entries the walker fails to read are skipped silently; directories and
/// other non-file entries are never returned.
pub fn walk_library_files(base_path: &Path, excluded_dirs: &[String]) -> Vec<DirEntry> {
    let excluder = PathExcluder::new(base_path, excluded_dirs);

    // The root (depth 0) is always admitted. Under a pathological config
    // that excludes the base itself, downstream filters drop everything
    // anyway — but yielding nothing here would also be silently wrong.
    let keep_entry = move |candidate: &DirEntry| {
        candidate.depth() == 0 || !excluder.is_excluded(candidate.path())
    };

    let mut files = Vec::new();
    for walked in WalkDir::new(base_path).into_iter().filter_entry(keep_entry) {
        // Best-effort walk: an entry that errored is dropped, not reported.
        let Ok(entry) = walked else {
            continue;
        };
        if entry.file_type().is_file() {
            files.push(entry);
        }
    }
    files
}
/// Walk `base_path`, prune `EXCLUDED_DIRS` subtrees, and return
/// `(path, forward_slash_rel_path)` for every image / video file that
/// should be indexed.
///
/// * `base_path` — library root to walk; also the prefix stripped to form
///   the relative path.
/// * `excluded_dirs` — exclusion patterns, forwarded unchanged to
///   `walk_library_files`.
/// * `modified_since` — when `Some`, keeps only files modified at or after
///   the instant; intended for the watcher's quick-scan tick to skip the
///   long tail. Files whose metadata (or mtime) can't be read are kept; the
///   caller's batch EXIF lookup dedups against existing rows. `None`
///   disables the mtime filter.
///
/// The first tuple element is the path as yielded by the walk (rooted at
/// `base_path`); the second is that path relative to `base_path` with every
/// `\` replaced by `/`. Files whose relative path is not valid UTF-8 are
/// skipped, as are any that do not sit under `base_path`.
// No caller yet; kept for future use, hence the dead_code allowance.
#[allow(dead_code)]
pub fn enumerate_indexable_files(
    base_path: &Path,
    excluded_dirs: &[String],
    modified_since: Option<SystemTime>,
) -> Vec<(PathBuf, String)> {
    walk_library_files(base_path, excluded_dirs)
        .into_iter()
        // Media-type filter runs first so the per-file metadata lookup in
        // the mtime filter below is only paid for files that could be
        // indexed at all. The two predicates are independent, so the order
        // does not affect which files survive.
        .filter(|entry| {
            file_types::direntry_is_image(entry) || file_types::direntry_is_video(entry)
        })
        .filter(|entry| {
            let Some(since) = modified_since else {
                return true;
            };
            match entry.metadata().ok().and_then(|meta| meta.modified().ok()) {
                Some(mtime) => mtime >= since,
                // Unreadable metadata / mtime: keep the file rather than
                // risk never indexing it.
                None => true,
            }
        })
        .filter_map(|entry| {
            let file_path = entry.path().to_path_buf();
            // Forward-slash rel_path regardless of OS so DB comparisons
            // against the batch EXIF lookup line up.
            let rel = file_path
                .strip_prefix(base_path)
                .ok()?
                .to_str()?
                .replace('\\', "/");
            Some((file_path, rel))
        })
        .collect()
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use std::time::Duration;

    /// Build a tempdir with `paths` (relative). Each touched file is empty;
    /// directory components are created automatically.
    fn make_tree(paths: &[&str]) -> tempfile::TempDir {
        let dir = tempfile::tempdir().expect("tempdir");
        for p in paths {
            let abs = dir.path().join(p);
            if let Some(parent) = abs.parent() {
                fs::create_dir_all(parent).expect("mkdir -p");
            }
            fs::File::create(&abs).expect("touch");
        }
        dir
    }

    /// Extract the relative-path half of each result, sorted so assertions
    /// do not depend on directory iteration order.
    fn rel_paths(found: &[(PathBuf, String)]) -> Vec<String> {
        let mut v: Vec<String> = found.iter().map(|(_, r)| r.clone()).collect();
        v.sort();
        v
    }

    #[test]
    fn excludes_eadir_subtree() {
        // The bug: Synology's @eaDir gets walked into and its
        // SYNOFILE_THUMB_*.jpg leaves end up in image_exif. With
        // filter_entry pruning, the subtree is never descended.
        let dir = make_tree(&[
            "vacation/IMG_0001.jpg",
            "vacation/@eaDir/IMG_0001.jpg/SYNOFILE_THUMB_S.jpg",
            "vacation/@eaDir/IMG_0001.jpg/SYNOFILE_THUMB_XL.jpg",
            "@eaDir/top_level_thumb.jpg",
        ]);
        let found = enumerate_indexable_files(dir.path(), &["@eaDir".to_string()], None);
        assert_eq!(rel_paths(&found), vec!["vacation/IMG_0001.jpg".to_string()]);
    }

    #[test]
    fn excludes_nested_pattern() {
        // .thumbnails as a component pattern (not an absolute dir).
        let dir = make_tree(&[
            "a/b/photo.jpg",
            "a/.thumbnails/cached.jpg",
            "a/b/.thumbnails/nested.jpg",
        ]);
        let found = enumerate_indexable_files(dir.path(), &[".thumbnails".to_string()], None);
        assert_eq!(rel_paths(&found), vec!["a/b/photo.jpg".to_string()]);
    }

    #[test]
    fn excludes_absolute_under_base() {
        // Leading-'/' entries are interpreted as paths under the library
        // root (see PathExcluder::new).
        let dir = make_tree(&["private/secret.jpg", "public/keep.jpg"]);
        let found = enumerate_indexable_files(dir.path(), &["/private".to_string()], None);
        assert_eq!(rel_paths(&found), vec!["public/keep.jpg".to_string()]);
    }

    #[test]
    fn filters_non_media() {
        let dir = make_tree(&[
            "a.jpg",
            "b.mp4",
            "c.txt",
            "d",         // no extension
            "e.jpg.bak", // wrong ext
        ]);
        let found = enumerate_indexable_files(dir.path(), &[], None);
        assert_eq!(
            rel_paths(&found),
            vec!["a.jpg".to_string(), "b.mp4".to_string()]
        );
    }

    #[test]
    fn modified_since_filters_old_files() {
        let dir = make_tree(&["old.jpg", "new.jpg"]);

        // Set the mtimes explicitly instead of sleeping around `now()`:
        // the cutoff sits an hour in the past and old.jpg is backdated an
        // hour before that, while new.jpg keeps the mtime it got when
        // make_tree created it. Hour-sized margins keep the ordering
        // unambiguous even on filesystems that store mtime at 1-2 second
        // granularity, and the test no longer depends on timing.
        let now = SystemTime::now();
        let cutoff = now - Duration::from_secs(60 * 60);
        let old_mtime = now - Duration::from_secs(2 * 60 * 60);
        fs::File::options()
            .write(true)
            .open(dir.path().join("old.jpg"))
            .expect("open old.jpg")
            .set_modified(old_mtime)
            .expect("backdate old.jpg");

        let found = enumerate_indexable_files(dir.path(), &[], Some(cutoff));
        assert_eq!(rel_paths(&found), vec!["new.jpg".to_string()]);
    }

    #[test]
    fn walk_library_files_excludes_subtrees_and_returns_all_extensions() {
        // The lower-level primitive: any extension survives, but excluded
        // subtrees are pruned. Used by thumbnail gen and media-count
        // gauges, which need non-media files too (e.g., walks through
        // sidecar XMPs alongside the photos).
        let dir = make_tree(&[
            "vacation/IMG_0001.jpg",
            "vacation/IMG_0001.xmp",
            "vacation/@eaDir/IMG_0001.jpg/SYNOFILE_THUMB_S.jpg",
            "notes.txt",
        ]);
        let mut got: Vec<String> = walk_library_files(dir.path(), &["@eaDir".to_string()])
            .into_iter()
            .map(|e| {
                e.path()
                    .strip_prefix(dir.path())
                    .unwrap()
                    .to_string_lossy()
                    .replace('\\', "/")
            })
            .collect();
        got.sort();
        assert_eq!(
            got,
            vec![
                "notes.txt".to_string(),
                "vacation/IMG_0001.jpg".to_string(),
                "vacation/IMG_0001.xmp".to_string(),
            ]
        );
    }

    #[test]
    fn rel_path_is_forward_slash() {
        // Sanity on a nested path. On Unix this is already '/'; the
        // assertion guards a future Windows port from regressing.
        let dir = make_tree(&["a/b/c.jpg"]);
        let found = enumerate_indexable_files(dir.path(), &[], None);
        // Fail with a readable message rather than an index-out-of-bounds
        // panic if the file was not found at all.
        assert_eq!(found.len(), 1, "expected exactly one indexed file");
        let (_abs, rel) = &found[0];
        assert_eq!(rel, "a/b/c.jpg");
        assert!(!rel.contains('\\'));
    }
}