//! Per-tick drains the watcher runs alongside ingest. //! //! These passes were previously inlined in `main.rs`; they exist because //! a quick scan only walks recently-modified files, so any backlog of //! rows missing a `content_hash` / `date_taken` / face detection //! wouldn't otherwise drain except during the once-an-hour full scan. //! Each function is bounded per call by a `*_PER_TICK` env-var cap. use std::collections::HashMap; use std::path::PathBuf; use std::sync::{Arc, Mutex}; use log::{debug, info, warn}; use crate::content_hash; use crate::database::ExifDao; use crate::date_resolver; use crate::face_watch; use crate::faces; use crate::file_types; use crate::libraries; use crate::tags; /// Compute and persist content_hash for image_exif rows where it's NULL. /// /// Bounded per call by `FACE_HASH_BACKFILL_MAX_PER_TICK` (default 2000) /// so a watcher tick on a large legacy library doesn't block for hours /// blake3-ing every photo at once. Subsequent scans pick up the rest. /// For 50k+ libraries the dedicated `cargo run --bin backfill_hashes` /// is still faster (it doesn't fight a watcher loop for the DAO mutex). /// /// Drains unhashed image_exif rows by querying them directly, independent /// of the filesystem walk. Quick scans only walk recently-modified files, /// so a backlog of pre-existing unhashed rows never enters /// `process_new_files`'s candidate set — left alone, it would only drain /// on full scans (default once an hour). Calling this every tick keeps /// the face-detection backlog moving regardless. /// /// Returns the number of rows successfully backfilled this pass. pub fn backfill_unhashed_backlog( context: &opentelemetry::Context, library: &libraries::Library, exif_dao: &Arc>>, ) -> usize { let cap: i64 = dotenv::var("FACE_HASH_BACKFILL_MAX_PER_TICK") .ok() .and_then(|s| s.parse().ok()) .filter(|n: &i64| *n > 0) .unwrap_or(2000); // Fetch up to cap+1 rows so we can tell "more remain" without a // separate count query. Across libraries — there's no per-library // filter on get_rows_missing_hash today — but we only ever update // rows whose library_id matches the caller's library, so other // libraries' rows just get skipped here and picked up on the next // library's tick. Negligible cost given the cap. let rows: Vec<(i32, String)> = { let mut dao = exif_dao.lock().expect("Unable to lock ExifDao"); dao.get_rows_missing_hash(context, cap + 1) .unwrap_or_default() }; if rows.is_empty() { return 0; } let more_than_cap = rows.len() as i64 > cap; let base_path = std::path::Path::new(&library.root_path); let mut backfilled = 0usize; let mut errors = 0usize; let mut skipped_other_lib = 0usize; for (lib_id, rel_path) in rows.iter().take(cap as usize) { if *lib_id != library.id { skipped_other_lib += 1; continue; } let abs = base_path.join(rel_path); if !abs.exists() { // File walked away — the watcher's reconciliation pass will // remove the orphan exif row eventually. continue; } match content_hash::compute(&abs) { Ok(id) => { let mut dao = exif_dao.lock().expect("Unable to lock ExifDao"); if let Err(e) = dao.backfill_content_hash( context, library.id, rel_path, &id.content_hash, id.size_bytes, ) { warn!( "face_watch: backfill_content_hash failed for {}: {:?}", rel_path, e ); errors += 1; } else { backfilled += 1; } } Err(e) => { debug!( "face_watch: hash compute failed for {} ({:?})", abs.display(), e ); errors += 1; } } } if backfilled > 0 || errors > 0 || more_than_cap { info!( "face_watch: backfill pass for library '{}': hashed {} ({} error(s), {} skipped to other libraries; {} cap, more_remain={})", library.name, backfilled, errors, skipped_other_lib, cap, more_than_cap ); } backfilled } /// Drain image_exif rows whose `date_taken` was never resolved or was /// resolved by the weakest fallback (`fs_time`). Runs the canonical-date /// waterfall — exiftool batch (one subprocess for the whole tick's /// rows) → filename regex → earliest_fs_time — and persists each /// resolution with its source tag. Capped per tick by /// `DATE_BACKFILL_MAX_PER_TICK` (default 500) so a 14k-row library /// drains over a few quick-scan ticks without blocking the watcher. /// /// kamadak-exif is intentionally skipped here: the row already has a /// NULL date_taken because the ingest path's kamadak-exif call returned /// nothing, and re-running it would just produce the same answer. /// exiftool is the meaningful new attempt — it handles videos and /// MakerNote-hosted dates kamadak can't reach. pub fn backfill_missing_date_taken( context: &opentelemetry::Context, library: &libraries::Library, exif_dao: &Arc>>, ) -> usize { let cap: i64 = dotenv::var("DATE_BACKFILL_MAX_PER_TICK") .ok() .and_then(|s| s.parse().ok()) .filter(|n: &i64| *n > 0) .unwrap_or(500); let rows: Vec<(i32, String)> = { let mut dao = exif_dao.lock().expect("Unable to lock ExifDao"); dao.get_rows_needing_date_backfill(context, library.id, cap + 1) .unwrap_or_default() }; if rows.is_empty() { return 0; } let more_than_cap = rows.len() as i64 > cap; let base_path = std::path::Path::new(&library.root_path); // Build absolute paths and drop rows whose files no longer exist — // the missing-file scan in library_maintenance retires deleted rows // separately. Without this filter, NULL-date rows for missing files // would loop through the drain forever (no source can resolve them). let mut existing: Vec<(String, PathBuf)> = Vec::with_capacity(rows.len()); for (_, rel_path) in rows.iter().take(cap as usize) { let abs = base_path.join(rel_path); if abs.exists() { existing.push((rel_path.clone(), abs)); } } if existing.is_empty() { return 0; } // One exiftool subprocess for the whole batch; the resolver falls // through to filename / fs_time per file when exiftool can't supply // a date (or isn't installed at all). let paths: Vec = existing.iter().map(|(_, p)| p.clone()).collect(); let resolved = date_resolver::resolve_dates_batch(&paths, &HashMap::new()); let mut backfilled = 0usize; let mut unresolved = 0usize; let mut by_source: HashMap<&'static str, usize> = HashMap::new(); { let mut dao = exif_dao.lock().expect("Unable to lock ExifDao"); for (rel_path, abs) in &existing { let Some(rd) = resolved.get(abs).copied() else { unresolved += 1; continue; }; match dao.backfill_date_taken( context, library.id, rel_path, rd.timestamp, rd.source.as_str(), ) { Ok(()) => { backfilled += 1; *by_source.entry(rd.source.as_str()).or_insert(0) += 1; } Err(e) => { warn!( "date_backfill: update failed for lib {} {}: {:?}", library.id, rel_path, e ); } } } } if backfilled > 0 || unresolved > 0 || more_than_cap { info!( "date_backfill: library '{}': resolved {} ({:?}), {} unresolved, cap={}, more_remain={}", library.name, backfilled, by_source, unresolved, cap, more_than_cap ); } backfilled } /// Per-tick face-detection drain. Pulls a capped batch of hashed-but- /// unscanned image_exif rows directly via the FaceDao anti-join and /// hands them to the existing detection pass. Runs on every tick (not /// just full scans) so the backlog moves at quick-scan cadence. /// Per-tick CLIP encoding drain. Mirrors `process_face_backlog`: pull /// up to `CLIP_BACKLOG_MAX_PER_TICK` candidates with a known /// `content_hash` but no `clip_embedding`, hand them to /// `clip_watch::run_clip_encoding_pass` for parallel fan-out, and let /// that module write the result back via `backfill_clip_embedding`. /// /// Idempotent — a row stays in the candidate set until its embedding /// lands, so a transient failure (Apollo unreachable, CUDA OOM) just /// defers to the next tick. Permanent failures (un-decodable bytes) /// retry every tick at this point; future Branch may add a status /// column like face_detections has. pub fn process_clip_backlog( context: &opentelemetry::Context, library: &libraries::Library, clip_client: &crate::ai::clip_client::ClipClient, exif_dao: &Arc>>, excluded_dirs: &[String], ) { if !clip_client.is_enabled() { return; } let cap: i64 = dotenv::var("CLIP_BACKLOG_MAX_PER_TICK") .ok() .and_then(|s| s.parse().ok()) .filter(|n: &i64| *n > 0) .unwrap_or(32); let rows: Vec<(String, String)> = { let mut dao = exif_dao.lock().expect("exif dao"); match dao.list_clip_unencoded_candidates(context, library.id, cap) { Ok(r) => r, Err(e) => { warn!( "clip_watch: list_clip_unencoded_candidates failed for library '{}': {:?}", library.name, e ); return; } } }; if rows.is_empty() { return; } info!( "clip_watch: backlog drain — encoding {} candidate(s) for library '{}' (cap={})", rows.len(), library.name, cap ); let candidates: Vec = rows .into_iter() .map( |(rel_path, content_hash)| crate::clip_watch::ClipCandidate { rel_path, content_hash, }, ) .collect(); crate::clip_watch::run_clip_encoding_pass( library, excluded_dirs, clip_client, Arc::clone(exif_dao), candidates, ); } pub fn process_face_backlog( context: &opentelemetry::Context, library: &libraries::Library, face_client: &crate::ai::face_client::FaceClient, face_dao: &Arc>>, tag_dao: &Arc>>, excluded_dirs: &[String], ) { let cap: i64 = dotenv::var("FACE_BACKLOG_MAX_PER_TICK") .ok() .and_then(|s| s.parse().ok()) .filter(|n: &i64| *n > 0) .unwrap_or(64); let rows: Vec<(String, String)> = { let mut dao = face_dao.lock().expect("face dao"); match dao.list_unscanned_candidates(context, library.id, cap) { Ok(r) => r, Err(e) => { warn!( "face_watch: list_unscanned_candidates failed for library '{}': {:?}", library.name, e ); return; } } }; if rows.is_empty() { return; } info!( "face_watch: backlog drain — running detection on {} candidate(s) for library '{}' (cap={})", rows.len(), library.name, cap ); let candidates: Vec = rows .into_iter() .map(|(rel_path, content_hash)| face_watch::FaceCandidate { rel_path, content_hash, }) .collect(); face_watch::run_face_detection_pass( library, excluded_dirs, face_client, Arc::clone(face_dao), Arc::clone(tag_dao), candidates, ); } /// Compute content_hash for any image rows the walker just touched /// whose stored EXIF row is still hash-less. Called from /// `process_new_files` so freshly-ingested files don't have to wait for /// the next standalone `backfill_unhashed_backlog` tick before face /// detection can key on their bytes. /// /// Cap is on **successes only**. An earlier version counted errors too, /// so a pocket of chronically-unhashable files at the front of the /// table (vanished mid-scan, permission denied, etc.) burned the budget /// every tick and the rest of the backlog never advanced. pub fn backfill_missing_content_hashes( context: &opentelemetry::Context, files: &[(PathBuf, String)], library: &libraries::Library, exif_dao: &Arc>>, ) { let image_paths: Vec = files .iter() .filter(|(p, _)| !file_types::is_video_file(p)) .map(|(_, rel)| rel.clone()) .collect(); if image_paths.is_empty() { return; } let exif_records = { let mut dao = exif_dao.lock().expect("Unable to lock ExifDao"); dao.get_exif_batch(context, Some(library.id), &image_paths) .unwrap_or_default() }; // Cheap lookup back from rel_path → absolute file_path so // content_hash::compute can read the bytes. let path_by_rel: HashMap = files.iter().map(|(p, rel)| (rel.clone(), p)).collect(); let cap: usize = dotenv::var("FACE_HASH_BACKFILL_MAX_PER_TICK") .ok() .and_then(|s| s.parse().ok()) .filter(|n: &usize| *n > 0) .unwrap_or(2000); // Count the unhashed backlog up front so we can surface "still needs // backfill: N" in the log — without it, a face-scan that's stuck at // 44% looks stalled when really it's chipping through hashes. let unhashed_total = exif_records .iter() .filter(|r| r.content_hash.is_none()) .count(); let mut backfilled = 0usize; let mut errors = 0usize; for record in &exif_records { if backfilled >= cap { break; } if record.content_hash.is_some() { continue; } let Some(file_path) = path_by_rel.get(&record.file_path) else { // Walked file went missing between the directory scan and now; // next tick will retry naturally. continue; }; match content_hash::compute(file_path) { Ok(id) => { let mut dao = exif_dao.lock().expect("Unable to lock ExifDao"); if let Err(e) = dao.backfill_content_hash( context, library.id, &record.file_path, &id.content_hash, id.size_bytes, ) { warn!( "face_watch: backfill_content_hash failed for {}: {:?}", record.file_path, e ); errors += 1; } else { backfilled += 1; } } Err(e) => { debug!( "face_watch: hash compute failed for {} ({:?})", file_path.display(), e ); errors += 1; } } } // Always log when there's an unhashed backlog so an operator // looking at "scan stuck at 44%" can see backfill is running and // how much remains. Quiet only when there's nothing to do. if unhashed_total > 0 || backfilled > 0 || errors > 0 { let remaining = unhashed_total.saturating_sub(backfilled); info!( "face_watch: backfilled {}/{} content_hash for library '{}' ({} error(s); {} still need backfill; cap={})", backfilled, unhashed_total, library.name, errors, remaining, cap ); } } /// Build the face-detection candidate list for a scan tick. /// /// Returns `(rel_path, content_hash)` for every image file that has a /// content_hash recorded in image_exif but no row in face_detections /// yet. Re-querying image_exif here picks up rows the EXIF write loop /// just inserted alongside any pre-existing rows the watcher walked /// over — covers both new uploads and the initial backlog scan. pub fn build_face_candidates( context: &opentelemetry::Context, library: &libraries::Library, files: &[(PathBuf, String)], exif_dao: &Arc>>, face_dao: &Arc>>, ) -> Vec { // Restrict to image files; videos aren't face-scanned in v1 (kamadak // doesn't even register them in image_exif). let image_paths: Vec = files .iter() .filter(|(p, _)| !file_types::is_video_file(p)) .map(|(_, rel)| rel.clone()) .collect(); if image_paths.is_empty() { return Vec::new(); } let exif_records = { let mut dao = exif_dao.lock().expect("Unable to lock ExifDao"); dao.get_exif_batch(context, Some(library.id), &image_paths) .unwrap_or_default() }; // rel_path → content_hash (only rows with a hash; without one we have // nothing to key face data against). let mut hash_by_path: HashMap = HashMap::with_capacity(exif_records.len()); for record in exif_records { if let Some(h) = record.content_hash { hash_by_path.insert(record.file_path, h); } } let mut candidates = Vec::new(); let mut dao = face_dao.lock().expect("face dao"); for rel_path in image_paths { let Some(hash) = hash_by_path.get(&rel_path) else { continue; }; match dao.already_scanned(context, hash) { Ok(true) => continue, Ok(false) => candidates.push(face_watch::FaceCandidate { rel_path, content_hash: hash.clone(), }), Err(e) => { warn!("face_watch: already_scanned errored for {}: {:?}", hash, e); } } } candidates } #[cfg(test)] mod tests { use super::*; use std::fs; use std::sync::{Arc, Mutex}; use diesel::prelude::*; use tempfile::TempDir; use crate::database::models::{InsertImageExif, InsertLibrary}; use crate::database::test::in_memory_db_connection; use crate::database::{ExifDao, SqliteExifDao, schema}; use crate::faces::{FaceDao, SqliteFaceDao}; use crate::libraries::Library; fn ctx() -> opentelemetry::Context { opentelemetry::Context::new() } /// Build a tempdir-backed library + DAOs sharing a single in-memory /// SQLite connection (so cross-table joins like /// `list_unscanned_candidates` see consistent state). fn setup() -> ( TempDir, Library, Arc>, Arc>>, Arc>>, ) { let tmp = TempDir::new().expect("tempdir"); let mut conn = in_memory_db_connection(); // Migration seeds library id=1 with a placeholder root; rewrite it // to point at the tempdir so `/` resolves to real // files this test creates. diesel::update(schema::libraries::table.filter(schema::libraries::id.eq(1))) .set(schema::libraries::root_path.eq(tmp.path().to_string_lossy().to_string())) .execute(&mut conn) .expect("rewrite library 1 root"); // Add a second library so cross-library skip cases have somewhere // to put their rows. diesel::insert_into(schema::libraries::table) .values(InsertLibrary { name: "other", root_path: "/tmp/other-test-lib", created_at: 0, enabled: true, excluded_dirs: None, }) .execute(&mut conn) .expect("seed second library"); let library = Library { id: 1, name: "main".to_string(), root_path: tmp.path().to_string_lossy().to_string(), enabled: true, excluded_dirs: Vec::new(), }; let shared = Arc::new(Mutex::new(conn)); let exif_dao: Arc>> = Arc::new(Mutex::new(Box::new( SqliteExifDao::from_shared(Arc::clone(&shared)), ))); let face_dao: Arc>> = Arc::new(Mutex::new(Box::new( SqliteFaceDao::from_connection(Arc::clone(&shared)), ))); (tmp, library, shared, exif_dao, face_dao) } fn insert_exif( exif_dao: &Arc>>, lib_id: i32, rel: &str, content_hash: Option<&str>, ) { let mut dao = exif_dao.lock().unwrap(); dao.store_exif( &ctx(), InsertImageExif { library_id: lib_id, file_path: rel.to_string(), camera_make: None, camera_model: None, lens_model: None, width: None, height: None, orientation: None, gps_latitude: None, gps_longitude: None, gps_altitude: None, focal_length: None, aperture: None, shutter_speed: None, iso: None, date_taken: None, created_time: 0, last_modified: 0, content_hash: content_hash.map(|s| s.to_string()), size_bytes: None, phash_64: None, dhash_64: None, date_taken_source: None, }, ) .expect("insert"); } fn write_image(root: &std::path::Path, rel: &str, bytes: &[u8]) { let abs = root.join(rel); if let Some(parent) = abs.parent() { fs::create_dir_all(parent).expect("mkdir"); } fs::write(abs, bytes).expect("write file"); } #[test] fn backfill_unhashed_backlog_hashes_missing_rows_in_this_library() { let (tmp, library, _conn, exif_dao, _face_dao) = setup(); write_image(tmp.path(), "a.jpg", b"alpha-bytes"); write_image(tmp.path(), "b.jpg", b"bravo-bytes"); insert_exif(&exif_dao, 1, "a.jpg", None); insert_exif(&exif_dao, 1, "b.jpg", None); let backfilled = backfill_unhashed_backlog(&ctx(), &library, &exif_dao); assert_eq!(backfilled, 2); let mut dao = exif_dao.lock().unwrap(); let rows = dao .get_exif_batch(&ctx(), Some(1), &["a.jpg".to_string(), "b.jpg".to_string()]) .unwrap(); assert_eq!(rows.len(), 2); for r in rows { assert!( r.content_hash.is_some(), "row {} should have a hash", r.file_path ); } } #[test] fn backfill_unhashed_backlog_skips_other_libraries_and_missing_files() { let (tmp, library, _conn, exif_dao, _face_dao) = setup(); write_image(tmp.path(), "exists.jpg", b"hello"); // Row for this library whose file is missing on disk: insert_exif(&exif_dao, 1, "ghost.jpg", None); insert_exif(&exif_dao, 1, "exists.jpg", None); // Row in the other library — must be skipped (different lib_id). insert_exif(&exif_dao, 2, "other.jpg", None); let backfilled = backfill_unhashed_backlog(&ctx(), &library, &exif_dao); assert_eq!(backfilled, 1, "only the existing in-library file hashes"); let mut dao = exif_dao.lock().unwrap(); let other = dao .get_exif_batch(&ctx(), Some(2), &["other.jpg".to_string()]) .unwrap(); assert_eq!(other.len(), 1); assert!( other[0].content_hash.is_none(), "other-library row must remain unhashed" ); let ghost = dao .get_exif_batch(&ctx(), Some(1), &["ghost.jpg".to_string()]) .unwrap(); assert_eq!(ghost.len(), 1); assert!( ghost[0].content_hash.is_none(), "missing-on-disk row stays unhashed (reconciliation removes it later)" ); } #[test] fn backfill_unhashed_backlog_respects_per_tick_cap() { // Env-var-driven cap; the function reads it on every call, so we // can set it just for this test and unset before returning. // Serial guard: tests in the same binary may share env, but each // backfill call re-reads — and we only care that the cap shape // (success count <= cap, more_remain logged) holds. unsafe { std::env::set_var("FACE_HASH_BACKFILL_MAX_PER_TICK", "2"); } let (tmp, library, _conn, exif_dao, _face_dao) = setup(); for i in 0..5 { let rel = format!("img_{}.jpg", i); write_image(tmp.path(), &rel, format!("bytes-{}", i).as_bytes()); insert_exif(&exif_dao, 1, &rel, None); } let backfilled = backfill_unhashed_backlog(&ctx(), &library, &exif_dao); assert_eq!(backfilled, 2, "cap=2 must bound the per-tick successes"); unsafe { std::env::remove_var("FACE_HASH_BACKFILL_MAX_PER_TICK"); } } #[test] fn backfill_missing_content_hashes_skips_videos_and_hashed_rows() { let (tmp, library, _conn, exif_dao, _face_dao) = setup(); // Two image rows (one already hashed, one not), one video. write_image(tmp.path(), "fresh.jpg", b"fresh-pixels"); write_image(tmp.path(), "already.jpg", b"already-pixels"); write_image(tmp.path(), "clip.mp4", b"video-bytes"); insert_exif(&exif_dao, 1, "fresh.jpg", None); insert_exif(&exif_dao, 1, "already.jpg", Some("pre-existing-hash")); insert_exif(&exif_dao, 1, "clip.mp4", None); let files: Vec<(PathBuf, String)> = vec![ (tmp.path().join("fresh.jpg"), "fresh.jpg".to_string()), (tmp.path().join("already.jpg"), "already.jpg".to_string()), (tmp.path().join("clip.mp4"), "clip.mp4".to_string()), ]; backfill_missing_content_hashes(&ctx(), &files, &library, &exif_dao); let mut dao = exif_dao.lock().unwrap(); let rows = dao .get_exif_batch( &ctx(), Some(1), &[ "fresh.jpg".to_string(), "already.jpg".to_string(), "clip.mp4".to_string(), ], ) .unwrap(); let by_path: HashMap> = rows .into_iter() .map(|r| (r.file_path, r.content_hash)) .collect(); assert!( by_path["fresh.jpg"].is_some(), "fresh image must get a hash" ); assert_eq!( by_path["already.jpg"].as_deref(), Some("pre-existing-hash"), "already-hashed image left untouched" ); assert!( by_path["clip.mp4"].is_none(), "video skipped (not face-scanned, no hash needed via this path)" ); } #[test] fn build_face_candidates_filters_videos_unhashed_and_already_scanned() { let (tmp, library, _conn, exif_dao, face_dao) = setup(); // Seed image_exif with: hashed unscanned, hashed scanned, unhashed, // and a video. Files don't need to exist on disk — the function // doesn't read them, only the DB rows. insert_exif(&exif_dao, 1, "fresh.jpg", Some("hash-fresh")); insert_exif(&exif_dao, 1, "scanned.jpg", Some("hash-scanned")); insert_exif(&exif_dao, 1, "unhashed.jpg", None); insert_exif(&exif_dao, 1, "clip.mp4", Some("hash-video")); // Mark `scanned.jpg`'s hash as already detected. { let mut dao = face_dao.lock().unwrap(); dao.mark_status(&ctx(), 1, "hash-scanned", "scanned.jpg", "no_faces", "test") .expect("mark scanned"); } let files: Vec<(PathBuf, String)> = vec![ (tmp.path().join("fresh.jpg"), "fresh.jpg".to_string()), (tmp.path().join("scanned.jpg"), "scanned.jpg".to_string()), (tmp.path().join("unhashed.jpg"), "unhashed.jpg".to_string()), (tmp.path().join("clip.mp4"), "clip.mp4".to_string()), ]; let candidates = build_face_candidates(&ctx(), &library, &files, &exif_dao, &face_dao); assert_eq!( candidates.len(), 1, "exactly fresh.jpg should be a candidate" ); assert_eq!(candidates[0].rel_path, "fresh.jpg"); assert_eq!(candidates[0].content_hash, "hash-fresh"); } }