Face Recognition / People Integration #61
109
src/faces.rs
109
src/faces.rs
@@ -99,6 +99,17 @@ pub struct FaceDetectionRow {
|
|||||||
pub created_at: i64,
|
pub created_at: i64,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Row shape for `list_unscanned_candidates`'s raw SQL. Diesel's
|
||||||
|
/// `sql_query` requires a `QueryableByName` row type with explicit
|
||||||
|
/// column SQL types; using a tuple isn't supported.
|
||||||
|
#[derive(diesel::QueryableByName, Debug)]
|
||||||
|
struct UnscannedRow {
|
||||||
|
#[diesel(sql_type = diesel::sql_types::Text)]
|
||||||
|
rel_path: String,
|
||||||
|
#[diesel(sql_type = diesel::sql_types::Text)]
|
||||||
|
content_hash: String,
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Insertable, Debug)]
|
#[derive(Insertable, Debug)]
|
||||||
#[diesel(table_name = face_detections)]
|
#[diesel(table_name = face_detections)]
|
||||||
struct InsertFaceDetection {
|
struct InsertFaceDetection {
|
||||||
@@ -354,6 +365,18 @@ pub trait FaceDao: Send + Sync {
|
|||||||
ctx: &opentelemetry::Context,
|
ctx: &opentelemetry::Context,
|
||||||
content_hash: &str,
|
content_hash: &str,
|
||||||
) -> anyhow::Result<bool>;
|
) -> anyhow::Result<bool>;
|
||||||
|
/// Find image_exif rows in `library_id` that have a populated
|
||||||
|
/// content_hash but no matching face_detections row yet. Used by
|
||||||
|
/// the watcher's quick-scan path to drain the backlog without
|
||||||
|
/// re-walking the filesystem. Returns `(rel_path, content_hash)`
|
||||||
|
/// pairs, capped at `limit`. Distinct on content_hash so the same
|
||||||
|
/// hash that lives at multiple rel_paths only fires one detection.
|
||||||
|
fn list_unscanned_candidates(
|
||||||
|
&mut self,
|
||||||
|
ctx: &opentelemetry::Context,
|
||||||
|
library_id: i32,
|
||||||
|
limit: i64,
|
||||||
|
) -> anyhow::Result<Vec<(String, String)>>;
|
||||||
fn store_detection(
|
fn store_detection(
|
||||||
&mut self,
|
&mut self,
|
||||||
ctx: &opentelemetry::Context,
|
ctx: &opentelemetry::Context,
|
||||||
@@ -565,6 +588,43 @@ impl FaceDao for SqliteFaceDao {
|
|||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn list_unscanned_candidates(
|
||||||
|
&mut self,
|
||||||
|
ctx: &opentelemetry::Context,
|
||||||
|
library_id: i32,
|
||||||
|
limit: i64,
|
||||||
|
) -> anyhow::Result<Vec<(String, String)>> {
|
||||||
|
let mut conn = self.connection.lock().expect("face dao lock");
|
||||||
|
trace_db_call(ctx, "query", "list_unscanned_candidates", |span| {
|
||||||
|
span.set_attribute(KeyValue::new("library_id", library_id as i64));
|
||||||
|
// Pick the smallest-id rel_path per content_hash so we don't
|
||||||
|
// fire multiple detect calls for the same hash if it lives
|
||||||
|
// under several rel_paths in the same library. The
|
||||||
|
// anti-join (NOT EXISTS) drains hashes that have no row in
|
||||||
|
// face_detections at all.
|
||||||
|
let rows: Vec<(String, String)> = diesel::sql_query(
|
||||||
|
"SELECT rel_path, content_hash \
|
||||||
|
FROM image_exif e \
|
||||||
|
WHERE library_id = ? \
|
||||||
|
AND content_hash IS NOT NULL \
|
||||||
|
AND NOT EXISTS ( \
|
||||||
|
SELECT 1 FROM face_detections f \
|
||||||
|
WHERE f.content_hash = e.content_hash \
|
||||||
|
) \
|
||||||
|
GROUP BY content_hash \
|
||||||
|
LIMIT ?",
|
||||||
|
)
|
||||||
|
.bind::<diesel::sql_types::Integer, _>(library_id)
|
||||||
|
.bind::<diesel::sql_types::BigInt, _>(limit)
|
||||||
|
.load::<UnscannedRow>(conn.deref_mut())
|
||||||
|
.with_context(|| "list_unscanned_candidates")?
|
||||||
|
.into_iter()
|
||||||
|
.map(|r| (r.rel_path, r.content_hash))
|
||||||
|
.collect();
|
||||||
|
Ok(rows)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
fn store_detection(
|
fn store_detection(
|
||||||
&mut self,
|
&mut self,
|
||||||
ctx: &opentelemetry::Context,
|
ctx: &opentelemetry::Context,
|
||||||
@@ -3291,4 +3351,53 @@ mod tests {
|
|||||||
assert!(joined.person_name.is_none());
|
assert!(joined.person_name.is_none());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn list_unscanned_candidates_filters_to_hashed_unscanned_in_library() {
|
||||||
|
// The watcher's per-tick backlog drain depends on this query
|
||||||
|
// returning *only* image_exif rows with a populated
|
||||||
|
// content_hash and no matching face_detections row in the
|
||||||
|
// requested library. A regression here would either silently
|
||||||
|
// re-scan files (waste of inference) or skip files that need
|
||||||
|
// scanning (the symptom we just shipped a fix for).
|
||||||
|
let mut dao = fresh_dao();
|
||||||
|
diesel::sql_query(
|
||||||
|
"INSERT OR IGNORE INTO libraries (id, name, root_path, created_at) \
|
||||||
|
VALUES (1, 'main', '/tmp', 0), (2, 'other', '/tmp2', 0)",
|
||||||
|
)
|
||||||
|
.execute(dao.connection.lock().unwrap().deref_mut())
|
||||||
|
.expect("seed libraries");
|
||||||
|
|
||||||
|
// Seed image_exif: mix of hashed/unhashed/scanned/cross-library.
|
||||||
|
diesel::sql_query(
|
||||||
|
"INSERT INTO image_exif \
|
||||||
|
(library_id, rel_path, content_hash, created_time, last_modified) VALUES \
|
||||||
|
(1, 'a.jpg', 'h-a', 0, 0), \
|
||||||
|
(1, 'b.jpg', 'h-b', 0, 0), \
|
||||||
|
(1, 'c.jpg', NULL, 0, 0), \
|
||||||
|
(1, 'd.jpg', 'h-d', 0, 0), \
|
||||||
|
(2, 'e.jpg', 'h-e', 0, 0)",
|
||||||
|
)
|
||||||
|
.execute(dao.connection.lock().unwrap().deref_mut())
|
||||||
|
.expect("seed image_exif");
|
||||||
|
|
||||||
|
// 'b' has been scanned (no_faces marker) — expect it filtered out.
|
||||||
|
dao.mark_status(&ctx(), 1, "h-b", "b.jpg", "no_faces", "buffalo_l")
|
||||||
|
.expect("scanned marker");
|
||||||
|
|
||||||
|
let cands = dao
|
||||||
|
.list_unscanned_candidates(&ctx(), 1, 10)
|
||||||
|
.expect("list unscanned");
|
||||||
|
|
||||||
|
let hashes: std::collections::HashSet<_> =
|
||||||
|
cands.iter().map(|(_, h)| h.clone()).collect();
|
||||||
|
|
||||||
|
// Should contain a and d (hashed, unscanned, library 1).
|
||||||
|
assert!(hashes.contains("h-a"), "missing h-a: {:?}", hashes);
|
||||||
|
assert!(hashes.contains("h-d"), "missing h-d: {:?}", hashes);
|
||||||
|
// Should NOT contain b (scanned), c (no hash), e (other library).
|
||||||
|
assert!(!hashes.contains("h-b"), "expected h-b filtered (scanned)");
|
||||||
|
assert!(!hashes.contains("h-e"), "expected h-e filtered (other library)");
|
||||||
|
assert_eq!(cands.len(), 2, "unexpected candidates: {:?}", cands);
|
||||||
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|||||||
161
src/main.rs
161
src/main.rs
@@ -1861,6 +1861,26 @@ fn watch_files(
|
|||||||
let is_full_scan = since_last_full.as_secs() >= full_interval_secs;
|
let is_full_scan = since_last_full.as_secs() >= full_interval_secs;
|
||||||
|
|
||||||
for lib in &libs {
|
for lib in &libs {
|
||||||
|
// Drain the unhashed-hash backlog AND the face-detection
|
||||||
|
// backlog every tick, regardless of quick/full. Quick
|
||||||
|
// scans only walk recently-modified files, so the
|
||||||
|
// pre-Phase-3 backlog never enters their candidate set
|
||||||
|
// — without these standalone passes, backfill +
|
||||||
|
// detection only progressed during full scans
|
||||||
|
// (default once an hour).
|
||||||
|
if face_client.is_enabled() {
|
||||||
|
let context = opentelemetry::Context::new();
|
||||||
|
backfill_unhashed_backlog(&context, lib, &exif_dao);
|
||||||
|
process_face_backlog(
|
||||||
|
&context,
|
||||||
|
lib,
|
||||||
|
&face_client,
|
||||||
|
&face_dao,
|
||||||
|
&watcher_tag_dao,
|
||||||
|
&excluded_dirs,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
if is_full_scan {
|
if is_full_scan {
|
||||||
info!(
|
info!(
|
||||||
"Running full scan for library '{}' (scan #{})",
|
"Running full scan for library '{}' (scan #{})",
|
||||||
@@ -2288,6 +2308,147 @@ fn process_new_files(
|
|||||||
/// blake3-ing every photo at once. Subsequent scans pick up the rest.
|
/// blake3-ing every photo at once. Subsequent scans pick up the rest.
|
||||||
/// For 50k+ libraries the dedicated `cargo run --bin backfill_hashes`
|
/// For 50k+ libraries the dedicated `cargo run --bin backfill_hashes`
|
||||||
/// is still faster (it doesn't fight a watcher loop for the DAO mutex).
|
/// is still faster (it doesn't fight a watcher loop for the DAO mutex).
|
||||||
|
/// Drain unhashed image_exif rows by querying them directly, independent
|
||||||
|
/// of the filesystem walk. Quick scans only walk recently-modified
|
||||||
|
/// files, so a backlog of pre-existing unhashed rows never enters
|
||||||
|
/// `process_new_files`'s candidate set — left alone, it would only
|
||||||
|
/// drain on full scans (default once an hour). Calling this every tick
|
||||||
|
/// keeps the face-detection backlog moving regardless.
|
||||||
|
///
|
||||||
|
/// Returns the number of rows successfully backfilled this pass.
|
||||||
|
fn backfill_unhashed_backlog(
|
||||||
|
context: &opentelemetry::Context,
|
||||||
|
library: &libraries::Library,
|
||||||
|
exif_dao: &Arc<Mutex<Box<dyn ExifDao>>>,
|
||||||
|
) -> usize {
|
||||||
|
let cap: i64 = dotenv::var("FACE_HASH_BACKFILL_MAX_PER_TICK")
|
||||||
|
.ok()
|
||||||
|
.and_then(|s| s.parse().ok())
|
||||||
|
.filter(|n: &i64| *n > 0)
|
||||||
|
.unwrap_or(2000);
|
||||||
|
|
||||||
|
// Fetch up to cap+1 rows so we can tell "more remain" without a
|
||||||
|
// separate count query. Across libraries — there's no per-library
|
||||||
|
// filter on get_rows_missing_hash today — but we only ever update
|
||||||
|
// rows whose library_id matches the caller's library, so other
|
||||||
|
// libraries' rows just get skipped here and picked up on the next
|
||||||
|
// library's tick. Negligible cost given the cap.
|
||||||
|
let rows: Vec<(i32, String)> = {
|
||||||
|
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
|
||||||
|
dao.get_rows_missing_hash(context, cap + 1).unwrap_or_default()
|
||||||
|
};
|
||||||
|
if rows.is_empty() {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
let more_than_cap = rows.len() as i64 > cap;
|
||||||
|
let base_path = std::path::Path::new(&library.root_path);
|
||||||
|
|
||||||
|
let mut backfilled = 0usize;
|
||||||
|
let mut errors = 0usize;
|
||||||
|
let mut skipped_other_lib = 0usize;
|
||||||
|
for (lib_id, rel_path) in rows.iter().take(cap as usize) {
|
||||||
|
if *lib_id != library.id {
|
||||||
|
skipped_other_lib += 1;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let abs = base_path.join(rel_path);
|
||||||
|
if !abs.exists() {
|
||||||
|
// File walked away — the watcher's reconciliation pass will
|
||||||
|
// remove the orphan exif row eventually.
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
match content_hash::compute(&abs) {
|
||||||
|
Ok(id) => {
|
||||||
|
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
|
||||||
|
if let Err(e) =
|
||||||
|
dao.backfill_content_hash(context, library.id, rel_path, &id.content_hash, id.size_bytes)
|
||||||
|
{
|
||||||
|
warn!(
|
||||||
|
"face_watch: backfill_content_hash failed for {}: {:?}",
|
||||||
|
rel_path, e
|
||||||
|
);
|
||||||
|
errors += 1;
|
||||||
|
} else {
|
||||||
|
backfilled += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
debug!("face_watch: hash compute failed for {} ({:?})", abs.display(), e);
|
||||||
|
errors += 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if backfilled > 0 || errors > 0 || more_than_cap {
|
||||||
|
info!(
|
||||||
|
"face_watch: backfill pass for library '{}': hashed {} ({} error(s), {} skipped to other libraries; {} cap, more_remain={})",
|
||||||
|
library.name, backfilled, errors, skipped_other_lib, cap, more_than_cap
|
||||||
|
);
|
||||||
|
}
|
||||||
|
backfilled
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Per-tick face-detection drain. Pulls a capped batch of hashed-but-
|
||||||
|
/// unscanned image_exif rows directly via the FaceDao anti-join and
|
||||||
|
/// hands them to the existing detection pass. Runs on every tick (not
|
||||||
|
/// just full scans) so the backlog moves at quick-scan cadence.
|
||||||
|
fn process_face_backlog(
|
||||||
|
context: &opentelemetry::Context,
|
||||||
|
library: &libraries::Library,
|
||||||
|
face_client: &crate::ai::face_client::FaceClient,
|
||||||
|
face_dao: &Arc<Mutex<Box<dyn faces::FaceDao>>>,
|
||||||
|
tag_dao: &Arc<Mutex<Box<dyn tags::TagDao>>>,
|
||||||
|
excluded_dirs: &[String],
|
||||||
|
) {
|
||||||
|
let cap: i64 = dotenv::var("FACE_BACKLOG_MAX_PER_TICK")
|
||||||
|
.ok()
|
||||||
|
.and_then(|s| s.parse().ok())
|
||||||
|
.filter(|n: &i64| *n > 0)
|
||||||
|
.unwrap_or(64);
|
||||||
|
|
||||||
|
let rows: Vec<(String, String)> = {
|
||||||
|
let mut dao = face_dao.lock().expect("face dao");
|
||||||
|
match dao.list_unscanned_candidates(context, library.id, cap) {
|
||||||
|
Ok(r) => r,
|
||||||
|
Err(e) => {
|
||||||
|
warn!(
|
||||||
|
"face_watch: list_unscanned_candidates failed for library '{}': {:?}",
|
||||||
|
library.name, e
|
||||||
|
);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
|
if rows.is_empty() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
info!(
|
||||||
|
"face_watch: backlog drain — running detection on {} candidate(s) for library '{}' (cap={})",
|
||||||
|
rows.len(),
|
||||||
|
library.name,
|
||||||
|
cap
|
||||||
|
);
|
||||||
|
|
||||||
|
let candidates: Vec<face_watch::FaceCandidate> = rows
|
||||||
|
.into_iter()
|
||||||
|
.map(|(rel_path, content_hash)| face_watch::FaceCandidate {
|
||||||
|
rel_path,
|
||||||
|
content_hash,
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
face_watch::run_face_detection_pass(
|
||||||
|
library,
|
||||||
|
excluded_dirs,
|
||||||
|
face_client,
|
||||||
|
Arc::clone(face_dao),
|
||||||
|
Arc::clone(tag_dao),
|
||||||
|
candidates,
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
fn backfill_missing_content_hashes(
|
fn backfill_missing_content_hashes(
|
||||||
context: &opentelemetry::Context,
|
context: &opentelemetry::Context,
|
||||||
files: &[(PathBuf, String)],
|
files: &[(PathBuf, String)],
|
||||||
|
|||||||
Reference in New Issue
Block a user