faces: backfill missing content_hash from the file watcher
Photos indexed before content-hashing landed (or where the hash compute failed silently on insert) end up in image_exif with NULL content_hash. build_face_candidates keys on content_hash, so those rows would never become face candidates without backfill — symptom: face detection logs nothing despite photos being in the library and the watcher running. The dedicated `backfill_hashes` binary already handles this; this commit lets the watcher self-heal during full scans so the deploy 'just works' for face recognition without operator action. Idempotent — subsequent scans see populated hashes and no-op. Bounded per tick by FACE_HASH_BACKFILL_MAX_PER_TICK (default 500) so a watcher tick on a 50k-photo legacy library doesn't blake3 every file in one shot. For very large backlogs the dedicated binary is still faster (no DAO mutex contention with the watcher loop). Only runs when face_client.is_enabled(), so legacy deploys without APOLLO_FACE_API_BASE_URL keep the same behavior. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
100
src/main.rs
100
src/main.rs
@@ -2126,6 +2126,17 @@ fn process_new_files(
|
||||
// disabled (no Apollo integration configured) — Phase 3 wires this
|
||||
// up; the watcher remains usable on legacy deploys.
|
||||
if face_client.is_enabled() {
|
||||
// Opportunistic content_hash backfill: photos indexed before
|
||||
// content-hashing landed (or where the hash compute failed
|
||||
// silently on insert) end up in image_exif with NULL
|
||||
// content_hash. build_face_candidates keys on content_hash, so
|
||||
// those files would never become candidates without backfill.
|
||||
// Idempotent — subsequent scans see the populated hashes and
|
||||
// no-op. The dedicated `backfill_hashes` binary is still the
|
||||
// right tool for very large legacy libraries; this branch
|
||||
// ensures small/medium deploys self-heal without operator
|
||||
// action.
|
||||
backfill_missing_content_hashes(&context, &files, library, &exif_dao);
|
||||
let candidates = build_face_candidates(&context, &files, &exif_dao, &face_dao);
|
||||
debug!(
|
||||
"face_watch: scan tick — {} image file(s) walked, {} candidate(s) (library '{}', modified_since={})",
|
||||
@@ -2270,6 +2281,95 @@ fn process_new_files(
|
||||
}
|
||||
}
|
||||
|
||||
/// Compute and persist content_hash for image_exif rows where it's NULL.
|
||||
///
|
||||
/// Bounded per call by `FACE_HASH_BACKFILL_MAX_PER_TICK` (default 500) so
|
||||
/// a watcher tick on a large legacy library doesn't block for hours
|
||||
/// blake3-ing every photo at once. Subsequent scans pick up the rest.
|
||||
/// For 50k+ libraries the dedicated `cargo run --bin backfill_hashes`
|
||||
/// is still faster (it doesn't fight a watcher loop for the DAO mutex).
|
||||
fn backfill_missing_content_hashes(
|
||||
context: &opentelemetry::Context,
|
||||
files: &[(PathBuf, String)],
|
||||
library: &libraries::Library,
|
||||
exif_dao: &Arc<Mutex<Box<dyn ExifDao>>>,
|
||||
) {
|
||||
let image_paths: Vec<String> = files
|
||||
.iter()
|
||||
.filter(|(p, _)| !is_video_file(p))
|
||||
.map(|(_, rel)| rel.clone())
|
||||
.collect();
|
||||
if image_paths.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let exif_records = {
|
||||
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
|
||||
dao.get_exif_batch(context, &image_paths)
|
||||
.unwrap_or_default()
|
||||
};
|
||||
// Cheap lookup back from rel_path → absolute file_path so
|
||||
// content_hash::compute can read the bytes.
|
||||
let path_by_rel: HashMap<String, &PathBuf> =
|
||||
files.iter().map(|(p, rel)| (rel.clone(), p)).collect();
|
||||
|
||||
let cap: usize = dotenv::var("FACE_HASH_BACKFILL_MAX_PER_TICK")
|
||||
.ok()
|
||||
.and_then(|s| s.parse().ok())
|
||||
.filter(|n: &usize| *n > 0)
|
||||
.unwrap_or(500);
|
||||
|
||||
let mut backfilled = 0usize;
|
||||
let mut errors = 0usize;
|
||||
for record in &exif_records {
|
||||
if backfilled + errors >= cap {
|
||||
break;
|
||||
}
|
||||
if record.content_hash.is_some() {
|
||||
continue;
|
||||
}
|
||||
let Some(file_path) = path_by_rel.get(&record.file_path) else {
|
||||
// Walked file went missing between the directory scan and now;
|
||||
// next tick will retry naturally.
|
||||
continue;
|
||||
};
|
||||
match content_hash::compute(file_path) {
|
||||
Ok(id) => {
|
||||
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
|
||||
if let Err(e) = dao.backfill_content_hash(
|
||||
context,
|
||||
library.id,
|
||||
&record.file_path,
|
||||
&id.content_hash,
|
||||
id.size_bytes,
|
||||
) {
|
||||
warn!(
|
||||
"face_watch: backfill_content_hash failed for {}: {:?}",
|
||||
record.file_path, e
|
||||
);
|
||||
errors += 1;
|
||||
} else {
|
||||
backfilled += 1;
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
debug!(
|
||||
"face_watch: hash compute failed for {} ({:?})",
|
||||
file_path.display(),
|
||||
e
|
||||
);
|
||||
errors += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
if backfilled > 0 || errors > 0 {
|
||||
info!(
|
||||
"face_watch: backfilled content_hash for {} file(s) in library '{}' ({} error(s); cap={})",
|
||||
backfilled, library.name, errors, cap
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Build the face-detection candidate list for a scan tick.
|
||||
///
|
||||
/// We need `(rel_path, content_hash)` for every image file that has a
|
||||
|
||||
Reference in New Issue
Block a user