From 0840d55c70d79ab92274cc0ca4772e35d2b0c2d3 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Thu, 30 Apr 2026 21:16:30 +0000 Subject: [PATCH] faces: exclude videos from backlog drain and SCANNED denominator MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit list_unscanned_candidates pulled every hashed image_exif row, including videos. filter_excluded then dropped them client-side without writing a marker, so the same set re-appeared every watcher tick — emitting the "backlog drain — running detection on N candidate(s)" log forever and producing no progress. face_stats.total_photos counted the same video rows in the denominator, so the SCANNED percentage was structurally capped below 100%. Add an image-extension SQL predicate (case-insensitive, sourced from file_types::IMAGE_EXTENSIONS) and apply it to both queries. Videos never enter the candidate set, total_photos counts only what can actually be scanned, and 100% becomes reachable. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/faces.rs | 160 ++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 125 insertions(+), 35 deletions(-) diff --git a/src/faces.rs b/src/faces.rs index 20fd700..72ce370 100644 --- a/src/faces.rs +++ b/src/faces.rs @@ -20,9 +20,10 @@ use crate::Claims; use crate::ai::face_client::{DetectMeta, FaceClient, FaceDetectError}; -use crate::exif; use crate::database::schema::{face_detections, image_exif, persons}; use crate::error::IntoHttpError; +use crate::exif; +use crate::file_types; use crate::libraries::{self, Library}; use crate::otel::{extract_context_from_request, global_tracer, trace_db_call}; use crate::state::AppState; @@ -99,9 +100,30 @@ pub struct FaceDetectionRow { pub created_at: i64, } +/// SQL fragment restricting an `image_exif.rel_path` (or `face_detections.rel_path`) +/// column to image extensions. Videos register in `image_exif` with a +/// populated `content_hash` but can never produce a `face_detections` row +/// — applying this filter at query time keeps videos out of the per-tick +/// backlog drain (which would otherwise loop forever — `filter_excluded` +/// drops them client-side without writing a marker) and out of the SCANNED +/// stat denominator (so 100% is reachable). +fn image_path_predicate(col: &str) -> String { + let clauses: Vec = file_types::IMAGE_EXTENSIONS + .iter() + .map(|ext| format!("lower({col}) LIKE '%.{ext}'")) + .collect(); + format!("({})", clauses.join(" OR ")) +} + /// Row shape for `list_unscanned_candidates`'s raw SQL. Diesel's /// `sql_query` requires a `QueryableByName` row type with explicit /// column SQL types; using a tuple isn't supported. +#[derive(diesel::QueryableByName, Debug)] +struct CountRow { + #[diesel(sql_type = diesel::sql_types::BigInt)] + count: i64, +} + #[derive(diesel::QueryableByName, Debug)] struct UnscannedRow { #[diesel(sql_type = diesel::sql_types::Text)] @@ -601,26 +623,32 @@ impl FaceDao for SqliteFaceDao { // fire multiple detect calls for the same hash if it lives // under several rel_paths in the same library. The // anti-join (NOT EXISTS) drains hashes that have no row in - // face_detections at all. - let rows: Vec<(String, String)> = diesel::sql_query( + // face_detections at all. The image-extension predicate + // keeps videos out of the candidate set; without it they'd + // be filtered client-side and re-pulled every tick forever + // because no marker row is written for excluded paths. + let ext_predicate = image_path_predicate("rel_path"); + let sql = format!( "SELECT rel_path, content_hash \ FROM image_exif e \ WHERE library_id = ? \ AND content_hash IS NOT NULL \ + AND {ext_predicate} \ AND NOT EXISTS ( \ SELECT 1 FROM face_detections f \ WHERE f.content_hash = e.content_hash \ ) \ GROUP BY content_hash \ - LIMIT ?", - ) - .bind::(library_id) - .bind::(limit) - .load::(conn.deref_mut()) - .with_context(|| "list_unscanned_candidates")? - .into_iter() - .map(|r| (r.rel_path, r.content_hash)) - .collect(); + LIMIT ?" + ); + let rows: Vec<(String, String)> = diesel::sql_query(sql) + .bind::(library_id) + .bind::(limit) + .load::(conn.deref_mut()) + .with_context(|| "list_unscanned_candidates")? + .into_iter() + .map(|r| (r.rel_path, r.content_hash)) + .collect(); Ok(rows) }) } @@ -1013,14 +1041,29 @@ impl FaceDao for SqliteFaceDao { .first(conn.deref_mut()) .with_context(|| "stats: failed")? }; + // Image-extension filter mirrors `list_unscanned_candidates` so + // SCANNED can actually reach 100%: videos sit in `image_exif` but + // never get a `face_detections` row, so counting them here + // permanently caps the percentage below 100%. let total_photos: i64 = { - let mut q = image_exif::table.into_boxed(); - if let Some(lib) = library_id { - q = q.filter(image_exif::library_id.eq(lib)); - } - q.select(diesel::dsl::count_star()) - .first(conn.deref_mut()) - .with_context(|| "stats: total_photos")? + let ext_predicate = image_path_predicate("rel_path"); + let row: CountRow = if let Some(lib) = library_id { + let sql = format!( + "SELECT COUNT(*) AS count FROM image_exif \ + WHERE library_id = ? AND {ext_predicate}" + ); + diesel::sql_query(sql) + .bind::(lib) + .get_result(conn.deref_mut()) + .with_context(|| "stats: total_photos")? + } else { + let sql = + format!("SELECT COUNT(*) AS count FROM image_exif WHERE {ext_predicate}"); + diesel::sql_query(sql) + .get_result(conn.deref_mut()) + .with_context(|| "stats: total_photos")? + }; + row.count }; let persons_count: i64 = persons::table .select(diesel::dsl::count_star()) @@ -2284,8 +2327,7 @@ async fn update_face_handler( "PATCH /image/faces/{}: crop failed for {:?}: {:?}", id, abs_path, e ); - return HttpResponse::BadRequest() - .body(format!("cannot crop new bbox: {}", e)); + return HttpResponse::BadRequest().body(format!("cannot crop new bbox: {}", e)); } }; let meta = DetectMeta { @@ -2335,8 +2377,7 @@ async fn update_face_handler( return HttpResponse::ServiceUnavailable().body(format!("{}", e)); } Err(FaceDetectError::Disabled) => { - return HttpResponse::ServiceUnavailable() - .body("face client disabled mid-flight"); + return HttpResponse::ServiceUnavailable().body("face client disabled mid-flight"); } } } @@ -3145,6 +3186,39 @@ mod tests { assert_eq!(stats.with_faces, 0); } + #[test] + fn stats_total_photos_excludes_videos() { + // SCANNED counts content_hashes in face_detections; total_photos + // must apply the same image-extension filter as the watcher + // backlog query so the percentage can reach 100%. Without this, + // videos sit in image_exif but never produce a face_detections + // row (Apollo decodes images only) and the bar caps below 100%. + let mut dao = fresh_dao(); + diesel::sql_query( + "INSERT OR IGNORE INTO libraries (id, name, root_path, created_at) \ + VALUES (1, 'main', '/tmp', 0)", + ) + .execute(dao.connection.lock().unwrap().deref_mut()) + .expect("seed libraries"); + + diesel::sql_query( + "INSERT INTO image_exif \ + (library_id, rel_path, content_hash, created_time, last_modified) VALUES \ + (1, 'a.jpg', 'h-a', 0, 0), \ + (1, 'b.JPEG', 'h-b', 0, 0), \ + (1, 'movie.mp4', 'h-mp4', 0, 0), \ + (1, 'clip.MOV', 'h-mov', 0, 0)", + ) + .execute(dao.connection.lock().unwrap().deref_mut()) + .expect("seed image_exif"); + + let stats = dao.stats(&ctx(), Some(1)).expect("stats"); + assert_eq!( + stats.total_photos, 2, + "videos should not count toward total" + ); + } + #[test] fn merge_persons_repoints_faces() { let mut dao = fresh_dao(); @@ -3325,8 +3399,7 @@ mod tests { ) .unwrap(); let row = seed_library_and_face(&mut dao, Some(p.id)); - let joined = - hydrate_face_with_person(&mut dao, &ctx(), row).expect("hydrate assigned"); + let joined = hydrate_face_with_person(&mut dao, &ctx(), row).expect("hydrate assigned"); assert_eq!(joined.person_id, Some(p.id)); assert_eq!(joined.person_name.as_deref(), Some("Alice")); // Bbox + confidence + source must round-trip — these are what @@ -3345,8 +3418,7 @@ mod tests { // previously-assigned row's serialization. let mut dao = fresh_dao(); let row = seed_library_and_face(&mut dao, None); - let joined = - hydrate_face_with_person(&mut dao, &ctx(), row).expect("hydrate unassigned"); + let joined = hydrate_face_with_person(&mut dao, &ctx(), row).expect("hydrate unassigned"); assert!(joined.person_id.is_none()); assert!(joined.person_name.is_none()); } @@ -3367,7 +3439,12 @@ mod tests { .execute(dao.connection.lock().unwrap().deref_mut()) .expect("seed libraries"); - // Seed image_exif: mix of hashed/unhashed/scanned/cross-library. + // Seed image_exif: mix of hashed/unhashed/scanned/cross-library, + // plus a video and a mixed-case image extension. Videos register + // in image_exif but can never produce a face_detections row, so + // the SQL must filter them out — otherwise the per-tick backlog + // drain re-pulls them every tick (no marker is ever written, so + // they loop forever) and the SCANNED stat is permanently capped. diesel::sql_query( "INSERT INTO image_exif \ (library_id, rel_path, content_hash, created_time, last_modified) VALUES \ @@ -3375,6 +3452,9 @@ mod tests { (1, 'b.jpg', 'h-b', 0, 0), \ (1, 'c.jpg', NULL, 0, 0), \ (1, 'd.jpg', 'h-d', 0, 0), \ + (1, 'movie.mp4', 'h-mp4', 0, 0), \ + (1, 'clip.MOV', 'h-mov', 0, 0), \ + (1, 'photo.JPG', 'h-jpg-upper', 0, 0), \ (2, 'e.jpg', 'h-e', 0, 0)", ) .execute(dao.connection.lock().unwrap().deref_mut()) @@ -3388,16 +3468,26 @@ mod tests { .list_unscanned_candidates(&ctx(), 1, 10) .expect("list unscanned"); - let hashes: std::collections::HashSet<_> = - cands.iter().map(|(_, h)| h.clone()).collect(); + let hashes: std::collections::HashSet<_> = cands.iter().map(|(_, h)| h.clone()).collect(); - // Should contain a and d (hashed, unscanned, library 1). + // Should contain a, d, and the upper-case .JPG (image-extension + // match is case-insensitive). assert!(hashes.contains("h-a"), "missing h-a: {:?}", hashes); assert!(hashes.contains("h-d"), "missing h-d: {:?}", hashes); - // Should NOT contain b (scanned), c (no hash), e (other library). + assert!( + hashes.contains("h-jpg-upper"), + "missing h-jpg-upper: {:?}", + hashes + ); + // Should NOT contain b (scanned), c (no hash), e (other library), + // or videos (mp4/mov are not image extensions). assert!(!hashes.contains("h-b"), "expected h-b filtered (scanned)"); - assert!(!hashes.contains("h-e"), "expected h-e filtered (other library)"); - assert_eq!(cands.len(), 2, "unexpected candidates: {:?}", cands); + assert!( + !hashes.contains("h-e"), + "expected h-e filtered (other library)" + ); + assert!(!hashes.contains("h-mp4"), "expected h-mp4 filtered (video)"); + assert!(!hashes.contains("h-mov"), "expected h-mov filtered (video)"); + assert_eq!(cands.len(), 3, "unexpected candidates: {:?}", cands); } - }