faces: drain backfill + detection backlog every tick, not just full scans

Symptom: ImageApi restart, then ~60 minutes of silence — no
face_watch lines at all. Cause: backfill + face-detection candidate
build were both gated inside process_new_files, which during quick
scans (every 60s) only walks files modified in the last interval.
The pre-existing unhashed / unscanned backlog never entered the
candidate set, so it only drained on the full-scan path (default
once per hour). Surfaced as "scan stuck at 1101/13118" — most of
those rows were waiting on the next full scan.

Two new per-tick passes that work directly off the DB:

(1) backfill_unhashed_backlog uses ExifDao::get_rows_missing_hash to
    pull unhashed rows in id order, capped (FACE_HASH_BACKFILL_MAX_PER_TICK
    default 2000), and writes content_hash for each. No filesystem
    walk — the walk was the gating filter that hid the backlog.

(2) process_face_backlog uses a new FaceDao::list_unscanned_candidates
    (LEFT-anti-join on content_hash via raw SQL, GROUP BY hash so
    duplicates fire one detect call) to pull a capped batch of
    hashed-but-unscanned rows (FACE_BACKLOG_MAX_PER_TICK default 64)
    and runs the existing face_watch detection pipeline on them.

Both run only when face_client.is_enabled(). The cap on (2) is small
because each candidate is a real Apollo round-trip — 64/tick at 60s
quick interval ≈ 64 detections/min, which paces an 8-core CPU
inference comfortably while keeping a steady flow visible in logs.
process_new_files's own backfill stays in place for the same-tick
flow (a brand-new upload gets hashed AND face-scanned in the tick
where it's discovered) but is now belt-and-suspenders.

Test backstop pinning the new DAO method's filter contract: only
hashed, unscanned, in-library rows are returned; scanned rows,
unhashed rows, and other-library rows are filtered out.
This commit is contained in:
Cameron Cordes
2026-04-30 01:46:49 +00:00
parent c2c1fe5b8b
commit 1971eeccd6
2 changed files with 270 additions and 0 deletions

View File

@@ -99,6 +99,17 @@ pub struct FaceDetectionRow {
pub created_at: i64,
}
/// Row shape for `list_unscanned_candidates`'s raw SQL. Diesel's
/// `sql_query` requires a `QueryableByName` row type with explicit
/// column SQL types; using a tuple isn't supported.
///
/// Field names must match the column names (or aliases) produced by
/// the query text: `QueryableByName` resolves result columns by name,
/// not by position, so renaming a field here silently breaks loading.
#[derive(diesel::QueryableByName, Debug)]
struct UnscannedRow {
// Image path relative to the library root.
#[diesel(sql_type = diesel::sql_types::Text)]
rel_path: String,
// Populated content hash (the query filters out NULL hashes, so a
// non-nullable Text mapping is safe here).
#[diesel(sql_type = diesel::sql_types::Text)]
content_hash: String,
}
#[derive(Insertable, Debug)]
#[diesel(table_name = face_detections)]
struct InsertFaceDetection {
@@ -354,6 +365,18 @@ pub trait FaceDao: Send + Sync {
ctx: &opentelemetry::Context,
content_hash: &str,
) -> anyhow::Result<bool>;
/// Find image_exif rows in `library_id` that have a populated
/// content_hash but no matching face_detections row yet. Used by
/// the watcher's quick-scan path to drain the backlog without
/// re-walking the filesystem. Returns `(rel_path, content_hash)`
/// pairs, capped at `limit`. Distinct on content_hash so the same
/// hash that lives at multiple rel_paths only fires one detection.
///
/// NOTE(review): when a hash lives at several rel_paths, which
/// rel_path accompanies it in the returned pair is an implementation
/// detail — callers must not depend on a particular one.
fn list_unscanned_candidates(
&mut self,
ctx: &opentelemetry::Context,
library_id: i32,
limit: i64,
) -> anyhow::Result<Vec<(String, String)>>;
fn store_detection(
&mut self,
ctx: &opentelemetry::Context,
@@ -565,6 +588,43 @@ impl FaceDao for SqliteFaceDao {
})
}
/// Impl note: the original query selected a bare `rel_path` under
/// `GROUP BY content_hash`, which SQLite resolves to an *arbitrary*
/// row of the group — despite the comment promising the smallest-id
/// rel_path. The `MIN(e.id)` aggregate below makes that promise real.
fn list_unscanned_candidates(
&mut self,
ctx: &opentelemetry::Context,
library_id: i32,
limit: i64,
) -> anyhow::Result<Vec<(String, String)>> {
let mut conn = self.connection.lock().expect("face dao lock");
trace_db_call(ctx, "query", "list_unscanned_candidates", |span| {
span.set_attribute(KeyValue::new("library_id", library_id as i64));
// The anti-join (NOT EXISTS) drains hashes that have no row in
// face_detections at all. GROUP BY collapses duplicates so one
// hash fires one detect call even if it lives at several
// rel_paths. MIN(e.id) is load-bearing: under SQLite's
// bare-column rule, the presence of a min()/max() aggregate
// makes the bare `rel_path` come from the min-id row, so the
// chosen rel_path is deterministic. ORDER BY min_id drains the
// backlog oldest-row-first and keeps the capped batch stable
// across ticks. QueryableByName matches columns by name, so the
// extra `min_id` column is simply ignored by UnscannedRow.
let rows: Vec<(String, String)> = diesel::sql_query(
"SELECT e.rel_path, e.content_hash, MIN(e.id) AS min_id \
FROM image_exif e \
WHERE e.library_id = ? \
AND e.content_hash IS NOT NULL \
AND NOT EXISTS ( \
SELECT 1 FROM face_detections f \
WHERE f.content_hash = e.content_hash \
) \
GROUP BY e.content_hash \
ORDER BY min_id \
LIMIT ?",
)
.bind::<diesel::sql_types::Integer, _>(library_id)
.bind::<diesel::sql_types::BigInt, _>(limit)
.load::<UnscannedRow>(conn.deref_mut())
.with_context(|| "list_unscanned_candidates")?
.into_iter()
.map(|r| (r.rel_path, r.content_hash))
.collect();
Ok(rows)
})
}
fn store_detection(
&mut self,
ctx: &opentelemetry::Context,
@@ -3291,4 +3351,53 @@ mod tests {
assert!(joined.person_name.is_none());
}
#[test]
fn list_unscanned_candidates_filters_to_hashed_unscanned_in_library() {
// The watcher's per-tick backlog drain depends on this query
// returning *only* image_exif rows with a populated
// content_hash and no matching face_detections row in the
// requested library. A regression here would either silently
// re-scan files (waste of inference) or skip files that need
// scanning (the symptom we just shipped a fix for).
let mut dao = fresh_dao();
diesel::sql_query(
"INSERT OR IGNORE INTO libraries (id, name, root_path, created_at) \
VALUES (1, 'main', '/tmp', 0), (2, 'other', '/tmp2', 0)",
)
.execute(dao.connection.lock().unwrap().deref_mut())
.expect("seed libraries");
// Seed image_exif: mix of hashed/unhashed/scanned/cross-library.
diesel::sql_query(
"INSERT INTO image_exif \
(library_id, rel_path, content_hash, created_time, last_modified) VALUES \
(1, 'a.jpg', 'h-a', 0, 0), \
(1, 'b.jpg', 'h-b', 0, 0), \
(1, 'c.jpg', NULL, 0, 0), \
(1, 'd.jpg', 'h-d', 0, 0), \
(2, 'e.jpg', 'h-e', 0, 0)",
)
.execute(dao.connection.lock().unwrap().deref_mut())
.expect("seed image_exif");
// 'b' has been scanned (no_faces marker) — expect it filtered out.
dao.mark_status(&ctx(), 1, "h-b", "b.jpg", "no_faces", "buffalo_l")
.expect("scanned marker");
let cands = dao
.list_unscanned_candidates(&ctx(), 1, 10)
.expect("list unscanned");
// Pin the full (rel_path, content_hash) pairing, not just the hash
// set: the watcher feeds rel_path to detection, so a candidate must
// carry the rel_path of the row its hash came from. Sorting makes
// the comparison independent of query result order. This equality
// also covers the length check and the presence of a/d implicitly,
// and excludes c (no hash) even though it has no hash to probe for.
let mut pairs: Vec<(String, String)> = cands.clone();
pairs.sort();
assert_eq!(
pairs,
vec![
("a.jpg".to_string(), "h-a".to_string()),
("d.jpg".to_string(), "h-d".to_string()),
],
"unexpected candidates: {:?}",
cands
);
let hashes: std::collections::HashSet<_> =
cands.iter().map(|(_, h)| h.clone()).collect();
// Should NOT contain b (scanned), c (no hash), e (other library).
assert!(!hashes.contains("h-b"), "expected h-b filtered (scanned)");
assert!(!hashes.contains("h-e"), "expected h-e filtered (other library)");
}
}