diff --git a/src/main.rs b/src/main.rs
index 05959ae..4644a6d 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -2126,6 +2126,17 @@ fn process_new_files(
     // disabled (no Apollo integration configured) — Phase 3 wires this
     // up; the watcher remains usable on legacy deploys.
     if face_client.is_enabled() {
+        // Opportunistic content_hash backfill: photos indexed before
+        // content-hashing landed (or where the hash compute failed
+        // silently on insert) end up in image_exif with NULL
+        // content_hash. build_face_candidates keys on content_hash, so
+        // those files would never become candidates without backfill.
+        // Idempotent — subsequent scans see the populated hashes and
+        // no-op. The dedicated `backfill_hashes` binary is still the
+        // right tool for very large legacy libraries; this branch
+        // ensures small/medium deploys self-heal without operator
+        // action.
+        backfill_missing_content_hashes(&context, &files, library, &exif_dao);
         let candidates = build_face_candidates(&context, &files, &exif_dao, &face_dao);
         debug!(
             "face_watch: scan tick — {} image file(s) walked, {} candidate(s) (library '{}', modified_since={})",
@@ -2270,6 +2281,95 @@ fn process_new_files(
     }
 }
 
+/// Compute and persist content_hash for image_exif rows where it's NULL.
+///
+/// Bounded per call by `FACE_HASH_BACKFILL_MAX_PER_TICK` (default 500) so
+/// a watcher tick on a large legacy library doesn't block for hours
+/// blake3-ing every photo at once. Subsequent scans pick up the rest.
+/// For 50k+ libraries the dedicated `cargo run --bin backfill_hashes`
+/// is still faster (it doesn't fight the watcher loop for the DAO mutex).
+fn backfill_missing_content_hashes(
+    context: &opentelemetry::Context,
+    files: &[(PathBuf, String)],
+    library: &libraries::Library,
+    exif_dao: &Arc<Mutex<ExifDao>>,
+) {
+    let image_paths: Vec<String> = files
+        .iter()
+        .filter(|(p, _)| !is_video_file(p))
+        .map(|(_, rel)| rel.clone())
+        .collect();
+    if image_paths.is_empty() {
+        return;
+    }
+
+    let exif_records = {
+        let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
+        dao.get_exif_batch(context, &image_paths)
+            .unwrap_or_default()
+    };
+    // Cheap lookup back from rel_path → absolute file_path so
+    // content_hash::compute can read the bytes.
+    let path_by_rel: HashMap<String, &PathBuf> =
+        files.iter().map(|(p, rel)| (rel.clone(), p)).collect();
+
+    let cap: usize = dotenv::var("FACE_HASH_BACKFILL_MAX_PER_TICK")
+        .ok()
+        .and_then(|s| s.parse().ok())
+        .filter(|n: &usize| *n > 0)
+        .unwrap_or(500);
+
+    let mut backfilled = 0usize;
+    let mut errors = 0usize;
+    for record in &exif_records {
+        if backfilled + errors >= cap {
+            break;
+        }
+        if record.content_hash.is_some() {
+            continue;
+        }
+        let Some(file_path) = path_by_rel.get(&record.file_path) else {
+            // Walked file went missing between the directory scan and now;
+            // the next tick will retry naturally.
+            continue;
+        };
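+        // The blake3 hash below is computed without holding the DAO
+        // mutex; the lock is re-acquired only for the row update in the
+        // Ok arm, so hashing a large photo never stalls other DAO users.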
+        match content_hash::compute(file_path) {
+            Ok(id) => {
+                let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
+                if let Err(e) = dao.backfill_content_hash(
+                    context,
+                    library.id,
+                    &record.file_path,
+                    &id.content_hash,
+                    id.size_bytes,
+                ) {
+                    warn!(
+                        "face_watch: backfill_content_hash failed for {}: {:?}",
+                        record.file_path, e
+                    );
+                    errors += 1;
+                } else {
+                    backfilled += 1;
+                }
+            }
+            Err(e) => {
+                debug!(
+                    "face_watch: hash compute failed for {} ({:?})",
+                    file_path.display(),
+                    e
+                );
+                errors += 1;
+            }
+        }
+    }
+    if backfilled > 0 || errors > 0 {
+        info!(
+            "face_watch: backfilled content_hash for {} file(s) in library '{}' ({} error(s); cap={})",
+            backfilled, library.name, errors, cap
+        );
+    }
+}
+
 /// Build the face-detection candidate list for a scan tick.
 ///
 /// We need `(rel_path, content_hash)` for every image file that has a