faces: exclude videos from backlog drain and SCANNED denominator

list_unscanned_candidates pulled every hashed image_exif row, including
videos. filter_excluded then dropped them client-side without writing a
marker, so the same set re-appeared every watcher tick — emitting the
"backlog drain — running detection on N candidate(s)" log forever and
producing no progress.

face_stats.total_photos counted the same video rows in the denominator,
so the SCANNED percentage was structurally capped below 100%.

Add an image-extension SQL predicate (case-insensitive, sourced from
file_types::IMAGE_EXTENSIONS) and apply it to both queries. Videos
never enter the candidate set, total_photos counts only what can
actually be scanned, and 100% becomes reachable.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Cameron Cordes
2026-04-30 21:16:30 +00:00
parent dbb046dfa8
commit 0840d55c70

View File

@@ -20,9 +20,10 @@
use crate::Claims; use crate::Claims;
use crate::ai::face_client::{DetectMeta, FaceClient, FaceDetectError}; use crate::ai::face_client::{DetectMeta, FaceClient, FaceDetectError};
use crate::exif;
use crate::database::schema::{face_detections, image_exif, persons}; use crate::database::schema::{face_detections, image_exif, persons};
use crate::error::IntoHttpError; use crate::error::IntoHttpError;
use crate::exif;
use crate::file_types;
use crate::libraries::{self, Library}; use crate::libraries::{self, Library};
use crate::otel::{extract_context_from_request, global_tracer, trace_db_call}; use crate::otel::{extract_context_from_request, global_tracer, trace_db_call};
use crate::state::AppState; use crate::state::AppState;
@@ -99,9 +100,30 @@ pub struct FaceDetectionRow {
pub created_at: i64, pub created_at: i64,
} }
/// SQL fragment restricting an `image_exif.rel_path` (or `face_detections.rel_path`)
/// column to image extensions. Videos register in `image_exif` with a
/// populated `content_hash` but can never produce a `face_detections` row
/// — applying this filter at query time keeps videos out of the per-tick
/// backlog drain (which would otherwise loop forever — `filter_excluded`
/// drops them client-side without writing a marker) and out of the SCANNED
/// stat denominator (so 100% is reachable).
fn image_path_predicate(col: &str) -> String {
let clauses: Vec<String> = file_types::IMAGE_EXTENSIONS
.iter()
.map(|ext| format!("lower({col}) LIKE '%.{ext}'"))
.collect();
format!("({})", clauses.join(" OR "))
}
/// Row shape for `list_unscanned_candidates`'s raw SQL. Diesel's /// Row shape for `list_unscanned_candidates`'s raw SQL. Diesel's
/// `sql_query` requires a `QueryableByName` row type with explicit /// `sql_query` requires a `QueryableByName` row type with explicit
/// column SQL types; using a tuple isn't supported. /// column SQL types; using a tuple isn't supported.
#[derive(diesel::QueryableByName, Debug)]
struct CountRow {
#[diesel(sql_type = diesel::sql_types::BigInt)]
count: i64,
}
#[derive(diesel::QueryableByName, Debug)] #[derive(diesel::QueryableByName, Debug)]
struct UnscannedRow { struct UnscannedRow {
#[diesel(sql_type = diesel::sql_types::Text)] #[diesel(sql_type = diesel::sql_types::Text)]
@@ -601,26 +623,32 @@ impl FaceDao for SqliteFaceDao {
// fire multiple detect calls for the same hash if it lives // fire multiple detect calls for the same hash if it lives
// under several rel_paths in the same library. The // under several rel_paths in the same library. The
// anti-join (NOT EXISTS) drains hashes that have no row in // anti-join (NOT EXISTS) drains hashes that have no row in
// face_detections at all. // face_detections at all. The image-extension predicate
let rows: Vec<(String, String)> = diesel::sql_query( // keeps videos out of the candidate set; without it they'd
// be filtered client-side and re-pulled every tick forever
// because no marker row is written for excluded paths.
let ext_predicate = image_path_predicate("rel_path");
let sql = format!(
"SELECT rel_path, content_hash \ "SELECT rel_path, content_hash \
FROM image_exif e \ FROM image_exif e \
WHERE library_id = ? \ WHERE library_id = ? \
AND content_hash IS NOT NULL \ AND content_hash IS NOT NULL \
AND {ext_predicate} \
AND NOT EXISTS ( \ AND NOT EXISTS ( \
SELECT 1 FROM face_detections f \ SELECT 1 FROM face_detections f \
WHERE f.content_hash = e.content_hash \ WHERE f.content_hash = e.content_hash \
) \ ) \
GROUP BY content_hash \ GROUP BY content_hash \
LIMIT ?", LIMIT ?"
) );
.bind::<diesel::sql_types::Integer, _>(library_id) let rows: Vec<(String, String)> = diesel::sql_query(sql)
.bind::<diesel::sql_types::BigInt, _>(limit) .bind::<diesel::sql_types::Integer, _>(library_id)
.load::<UnscannedRow>(conn.deref_mut()) .bind::<diesel::sql_types::BigInt, _>(limit)
.with_context(|| "list_unscanned_candidates")? .load::<UnscannedRow>(conn.deref_mut())
.into_iter() .with_context(|| "list_unscanned_candidates")?
.map(|r| (r.rel_path, r.content_hash)) .into_iter()
.collect(); .map(|r| (r.rel_path, r.content_hash))
.collect();
Ok(rows) Ok(rows)
}) })
} }
@@ -1013,14 +1041,29 @@ impl FaceDao for SqliteFaceDao {
.first(conn.deref_mut()) .first(conn.deref_mut())
.with_context(|| "stats: failed")? .with_context(|| "stats: failed")?
}; };
// Image-extension filter mirrors `list_unscanned_candidates` so
// SCANNED can actually reach 100%: videos sit in `image_exif` but
// never get a `face_detections` row, so counting them here
// permanently caps the percentage below 100%.
let total_photos: i64 = { let total_photos: i64 = {
let mut q = image_exif::table.into_boxed(); let ext_predicate = image_path_predicate("rel_path");
if let Some(lib) = library_id { let row: CountRow = if let Some(lib) = library_id {
q = q.filter(image_exif::library_id.eq(lib)); let sql = format!(
} "SELECT COUNT(*) AS count FROM image_exif \
q.select(diesel::dsl::count_star()) WHERE library_id = ? AND {ext_predicate}"
.first(conn.deref_mut()) );
.with_context(|| "stats: total_photos")? diesel::sql_query(sql)
.bind::<diesel::sql_types::Integer, _>(lib)
.get_result(conn.deref_mut())
.with_context(|| "stats: total_photos")?
} else {
let sql =
format!("SELECT COUNT(*) AS count FROM image_exif WHERE {ext_predicate}");
diesel::sql_query(sql)
.get_result(conn.deref_mut())
.with_context(|| "stats: total_photos")?
};
row.count
}; };
let persons_count: i64 = persons::table let persons_count: i64 = persons::table
.select(diesel::dsl::count_star()) .select(diesel::dsl::count_star())
@@ -2284,8 +2327,7 @@ async fn update_face_handler<D: FaceDao>(
"PATCH /image/faces/{}: crop failed for {:?}: {:?}", "PATCH /image/faces/{}: crop failed for {:?}: {:?}",
id, abs_path, e id, abs_path, e
); );
return HttpResponse::BadRequest() return HttpResponse::BadRequest().body(format!("cannot crop new bbox: {}", e));
.body(format!("cannot crop new bbox: {}", e));
} }
}; };
let meta = DetectMeta { let meta = DetectMeta {
@@ -2335,8 +2377,7 @@ async fn update_face_handler<D: FaceDao>(
return HttpResponse::ServiceUnavailable().body(format!("{}", e)); return HttpResponse::ServiceUnavailable().body(format!("{}", e));
} }
Err(FaceDetectError::Disabled) => { Err(FaceDetectError::Disabled) => {
return HttpResponse::ServiceUnavailable() return HttpResponse::ServiceUnavailable().body("face client disabled mid-flight");
.body("face client disabled mid-flight");
} }
} }
} }
@@ -3145,6 +3186,39 @@ mod tests {
assert_eq!(stats.with_faces, 0); assert_eq!(stats.with_faces, 0);
} }
#[test]
fn stats_total_photos_excludes_videos() {
// SCANNED counts content_hashes in face_detections; total_photos
// must apply the same image-extension filter as the watcher
// backlog query so the percentage can reach 100%. Without this,
// videos sit in image_exif but never produce a face_detections
// row (Apollo decodes images only) and the bar caps below 100%.
let mut dao = fresh_dao();
diesel::sql_query(
"INSERT OR IGNORE INTO libraries (id, name, root_path, created_at) \
VALUES (1, 'main', '/tmp', 0)",
)
.execute(dao.connection.lock().unwrap().deref_mut())
.expect("seed libraries");
diesel::sql_query(
"INSERT INTO image_exif \
(library_id, rel_path, content_hash, created_time, last_modified) VALUES \
(1, 'a.jpg', 'h-a', 0, 0), \
(1, 'b.JPEG', 'h-b', 0, 0), \
(1, 'movie.mp4', 'h-mp4', 0, 0), \
(1, 'clip.MOV', 'h-mov', 0, 0)",
)
.execute(dao.connection.lock().unwrap().deref_mut())
.expect("seed image_exif");
let stats = dao.stats(&ctx(), Some(1)).expect("stats");
assert_eq!(
stats.total_photos, 2,
"videos should not count toward total"
);
}
#[test] #[test]
fn merge_persons_repoints_faces() { fn merge_persons_repoints_faces() {
let mut dao = fresh_dao(); let mut dao = fresh_dao();
@@ -3325,8 +3399,7 @@ mod tests {
) )
.unwrap(); .unwrap();
let row = seed_library_and_face(&mut dao, Some(p.id)); let row = seed_library_and_face(&mut dao, Some(p.id));
let joined = let joined = hydrate_face_with_person(&mut dao, &ctx(), row).expect("hydrate assigned");
hydrate_face_with_person(&mut dao, &ctx(), row).expect("hydrate assigned");
assert_eq!(joined.person_id, Some(p.id)); assert_eq!(joined.person_id, Some(p.id));
assert_eq!(joined.person_name.as_deref(), Some("Alice")); assert_eq!(joined.person_name.as_deref(), Some("Alice"));
// Bbox + confidence + source must round-trip — these are what // Bbox + confidence + source must round-trip — these are what
@@ -3345,8 +3418,7 @@ mod tests {
// previously-assigned row's serialization. // previously-assigned row's serialization.
let mut dao = fresh_dao(); let mut dao = fresh_dao();
let row = seed_library_and_face(&mut dao, None); let row = seed_library_and_face(&mut dao, None);
let joined = let joined = hydrate_face_with_person(&mut dao, &ctx(), row).expect("hydrate unassigned");
hydrate_face_with_person(&mut dao, &ctx(), row).expect("hydrate unassigned");
assert!(joined.person_id.is_none()); assert!(joined.person_id.is_none());
assert!(joined.person_name.is_none()); assert!(joined.person_name.is_none());
} }
@@ -3367,7 +3439,12 @@ mod tests {
.execute(dao.connection.lock().unwrap().deref_mut()) .execute(dao.connection.lock().unwrap().deref_mut())
.expect("seed libraries"); .expect("seed libraries");
// Seed image_exif: mix of hashed/unhashed/scanned/cross-library. // Seed image_exif: mix of hashed/unhashed/scanned/cross-library,
// plus a video and a mixed-case image extension. Videos register
// in image_exif but can never produce a face_detections row, so
// the SQL must filter them out — otherwise the per-tick backlog
// drain re-pulls them every tick (no marker is ever written, so
// they loop forever) and the SCANNED stat is permanently capped.
diesel::sql_query( diesel::sql_query(
"INSERT INTO image_exif \ "INSERT INTO image_exif \
(library_id, rel_path, content_hash, created_time, last_modified) VALUES \ (library_id, rel_path, content_hash, created_time, last_modified) VALUES \
@@ -3375,6 +3452,9 @@ mod tests {
(1, 'b.jpg', 'h-b', 0, 0), \ (1, 'b.jpg', 'h-b', 0, 0), \
(1, 'c.jpg', NULL, 0, 0), \ (1, 'c.jpg', NULL, 0, 0), \
(1, 'd.jpg', 'h-d', 0, 0), \ (1, 'd.jpg', 'h-d', 0, 0), \
(1, 'movie.mp4', 'h-mp4', 0, 0), \
(1, 'clip.MOV', 'h-mov', 0, 0), \
(1, 'photo.JPG', 'h-jpg-upper', 0, 0), \
(2, 'e.jpg', 'h-e', 0, 0)", (2, 'e.jpg', 'h-e', 0, 0)",
) )
.execute(dao.connection.lock().unwrap().deref_mut()) .execute(dao.connection.lock().unwrap().deref_mut())
@@ -3388,16 +3468,26 @@ mod tests {
.list_unscanned_candidates(&ctx(), 1, 10) .list_unscanned_candidates(&ctx(), 1, 10)
.expect("list unscanned"); .expect("list unscanned");
let hashes: std::collections::HashSet<_> = let hashes: std::collections::HashSet<_> = cands.iter().map(|(_, h)| h.clone()).collect();
cands.iter().map(|(_, h)| h.clone()).collect();
// Should contain a and d (hashed, unscanned, library 1). // Should contain a, d, and the upper-case .JPG (image-extension
// match is case-insensitive).
assert!(hashes.contains("h-a"), "missing h-a: {:?}", hashes); assert!(hashes.contains("h-a"), "missing h-a: {:?}", hashes);
assert!(hashes.contains("h-d"), "missing h-d: {:?}", hashes); assert!(hashes.contains("h-d"), "missing h-d: {:?}", hashes);
// Should NOT contain b (scanned), c (no hash), e (other library). assert!(
hashes.contains("h-jpg-upper"),
"missing h-jpg-upper: {:?}",
hashes
);
// Should NOT contain b (scanned), c (no hash), e (other library),
// or videos (mp4/mov are not image extensions).
assert!(!hashes.contains("h-b"), "expected h-b filtered (scanned)"); assert!(!hashes.contains("h-b"), "expected h-b filtered (scanned)");
assert!(!hashes.contains("h-e"), "expected h-e filtered (other library)"); assert!(
assert_eq!(cands.len(), 2, "unexpected candidates: {:?}", cands); !hashes.contains("h-e"),
"expected h-e filtered (other library)"
);
assert!(!hashes.contains("h-mp4"), "expected h-mp4 filtered (video)");
assert!(!hashes.contains("h-mov"), "expected h-mov filtered (video)");
assert_eq!(cands.len(), 3, "unexpected candidates: {:?}", cands);
} }
} }