faces: exclude videos from backlog drain and SCANNED denominator
list_unscanned_candidates pulled every hashed image_exif row, including videos. filter_excluded then dropped them client-side without writing a marker, so the same set re-appeared every watcher tick — emitting the "backlog drain — running detection on N candidate(s)" log forever and producing no progress. face_stats.total_photos counted the same video rows in the denominator, so the SCANNED percentage was structurally capped below 100%. Add an image-extension SQL predicate (case-insensitive, sourced from file_types::IMAGE_EXTENSIONS) and apply it to both queries. Videos never enter the candidate set, total_photos counts only what can actually be scanned, and 100% becomes reachable. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
144
src/faces.rs
144
src/faces.rs
@@ -20,9 +20,10 @@
|
||||
|
||||
use crate::Claims;
|
||||
use crate::ai::face_client::{DetectMeta, FaceClient, FaceDetectError};
|
||||
use crate::exif;
|
||||
use crate::database::schema::{face_detections, image_exif, persons};
|
||||
use crate::error::IntoHttpError;
|
||||
use crate::exif;
|
||||
use crate::file_types;
|
||||
use crate::libraries::{self, Library};
|
||||
use crate::otel::{extract_context_from_request, global_tracer, trace_db_call};
|
||||
use crate::state::AppState;
|
||||
@@ -99,9 +100,30 @@ pub struct FaceDetectionRow {
|
||||
pub created_at: i64,
|
||||
}
|
||||
|
||||
/// SQL fragment restricting an `image_exif.rel_path` (or `face_detections.rel_path`)
|
||||
/// column to image extensions. Videos register in `image_exif` with a
|
||||
/// populated `content_hash` but can never produce a `face_detections` row
|
||||
/// — applying this filter at query time keeps videos out of the per-tick
|
||||
/// backlog drain (which would otherwise loop forever — `filter_excluded`
|
||||
/// drops them client-side without writing a marker) and out of the SCANNED
|
||||
/// stat denominator (so 100% is reachable).
|
||||
fn image_path_predicate(col: &str) -> String {
|
||||
let clauses: Vec<String> = file_types::IMAGE_EXTENSIONS
|
||||
.iter()
|
||||
.map(|ext| format!("lower({col}) LIKE '%.{ext}'"))
|
||||
.collect();
|
||||
format!("({})", clauses.join(" OR "))
|
||||
}
|
||||
|
||||
/// Row shape for `list_unscanned_candidates`'s raw SQL. Diesel's
|
||||
/// `sql_query` requires a `QueryableByName` row type with explicit
|
||||
/// column SQL types; using a tuple isn't supported.
|
||||
#[derive(diesel::QueryableByName, Debug)]
|
||||
struct CountRow {
|
||||
#[diesel(sql_type = diesel::sql_types::BigInt)]
|
||||
count: i64,
|
||||
}
|
||||
|
||||
#[derive(diesel::QueryableByName, Debug)]
|
||||
struct UnscannedRow {
|
||||
#[diesel(sql_type = diesel::sql_types::Text)]
|
||||
@@ -601,19 +623,25 @@ impl FaceDao for SqliteFaceDao {
|
||||
// fire multiple detect calls for the same hash if it lives
|
||||
// under several rel_paths in the same library. The
|
||||
// anti-join (NOT EXISTS) drains hashes that have no row in
|
||||
// face_detections at all.
|
||||
let rows: Vec<(String, String)> = diesel::sql_query(
|
||||
// face_detections at all. The image-extension predicate
|
||||
// keeps videos out of the candidate set; without it they'd
|
||||
// be filtered client-side and re-pulled every tick forever
|
||||
// because no marker row is written for excluded paths.
|
||||
let ext_predicate = image_path_predicate("rel_path");
|
||||
let sql = format!(
|
||||
"SELECT rel_path, content_hash \
|
||||
FROM image_exif e \
|
||||
WHERE library_id = ? \
|
||||
AND content_hash IS NOT NULL \
|
||||
AND {ext_predicate} \
|
||||
AND NOT EXISTS ( \
|
||||
SELECT 1 FROM face_detections f \
|
||||
WHERE f.content_hash = e.content_hash \
|
||||
) \
|
||||
GROUP BY content_hash \
|
||||
LIMIT ?",
|
||||
)
|
||||
LIMIT ?"
|
||||
);
|
||||
let rows: Vec<(String, String)> = diesel::sql_query(sql)
|
||||
.bind::<diesel::sql_types::Integer, _>(library_id)
|
||||
.bind::<diesel::sql_types::BigInt, _>(limit)
|
||||
.load::<UnscannedRow>(conn.deref_mut())
|
||||
@@ -1013,14 +1041,29 @@ impl FaceDao for SqliteFaceDao {
|
||||
.first(conn.deref_mut())
|
||||
.with_context(|| "stats: failed")?
|
||||
};
|
||||
// Image-extension filter mirrors `list_unscanned_candidates` so
|
||||
// SCANNED can actually reach 100%: videos sit in `image_exif` but
|
||||
// never get a `face_detections` row, so counting them here
|
||||
// permanently caps the percentage below 100%.
|
||||
let total_photos: i64 = {
|
||||
let mut q = image_exif::table.into_boxed();
|
||||
if let Some(lib) = library_id {
|
||||
q = q.filter(image_exif::library_id.eq(lib));
|
||||
}
|
||||
q.select(diesel::dsl::count_star())
|
||||
.first(conn.deref_mut())
|
||||
let ext_predicate = image_path_predicate("rel_path");
|
||||
let row: CountRow = if let Some(lib) = library_id {
|
||||
let sql = format!(
|
||||
"SELECT COUNT(*) AS count FROM image_exif \
|
||||
WHERE library_id = ? AND {ext_predicate}"
|
||||
);
|
||||
diesel::sql_query(sql)
|
||||
.bind::<diesel::sql_types::Integer, _>(lib)
|
||||
.get_result(conn.deref_mut())
|
||||
.with_context(|| "stats: total_photos")?
|
||||
} else {
|
||||
let sql =
|
||||
format!("SELECT COUNT(*) AS count FROM image_exif WHERE {ext_predicate}");
|
||||
diesel::sql_query(sql)
|
||||
.get_result(conn.deref_mut())
|
||||
.with_context(|| "stats: total_photos")?
|
||||
};
|
||||
row.count
|
||||
};
|
||||
let persons_count: i64 = persons::table
|
||||
.select(diesel::dsl::count_star())
|
||||
@@ -2284,8 +2327,7 @@ async fn update_face_handler<D: FaceDao>(
|
||||
"PATCH /image/faces/{}: crop failed for {:?}: {:?}",
|
||||
id, abs_path, e
|
||||
);
|
||||
return HttpResponse::BadRequest()
|
||||
.body(format!("cannot crop new bbox: {}", e));
|
||||
return HttpResponse::BadRequest().body(format!("cannot crop new bbox: {}", e));
|
||||
}
|
||||
};
|
||||
let meta = DetectMeta {
|
||||
@@ -2335,8 +2377,7 @@ async fn update_face_handler<D: FaceDao>(
|
||||
return HttpResponse::ServiceUnavailable().body(format!("{}", e));
|
||||
}
|
||||
Err(FaceDetectError::Disabled) => {
|
||||
return HttpResponse::ServiceUnavailable()
|
||||
.body("face client disabled mid-flight");
|
||||
return HttpResponse::ServiceUnavailable().body("face client disabled mid-flight");
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -3145,6 +3186,39 @@ mod tests {
|
||||
assert_eq!(stats.with_faces, 0);
|
||||
}
|
||||
|
||||
#[test]
fn stats_total_photos_excludes_videos() {
    // total_photos is the SCANNED denominator, so it must apply the same
    // image-extension filter as the watcher backlog query: videos land in
    // image_exif but never yield a face_detections row (Apollo decodes
    // images only), and counting them would permanently cap the
    // percentage below 100%.
    let mut dao = fresh_dao();

    let seed_library =
        "INSERT OR IGNORE INTO libraries (id, name, root_path, created_at) \
         VALUES (1, 'main', '/tmp', 0)";
    // Two images (one with an upper-case extension) plus two videos: only
    // the images may count toward the total.
    let seed_exif = "INSERT INTO image_exif \
         (library_id, rel_path, content_hash, created_time, last_modified) VALUES \
         (1, 'a.jpg', 'h-a', 0, 0), \
         (1, 'b.JPEG', 'h-b', 0, 0), \
         (1, 'movie.mp4', 'h-mp4', 0, 0), \
         (1, 'clip.MOV', 'h-mov', 0, 0)";

    for (sql, label) in [(seed_library, "seed libraries"), (seed_exif, "seed image_exif")] {
        diesel::sql_query(sql)
            .execute(dao.connection.lock().unwrap().deref_mut())
            .expect(label);
    }

    let stats = dao.stats(&ctx(), Some(1)).expect("stats");
    assert_eq!(stats.total_photos, 2, "videos should not count toward total");
}
|
||||
|
||||
#[test]
|
||||
fn merge_persons_repoints_faces() {
|
||||
let mut dao = fresh_dao();
|
||||
@@ -3325,8 +3399,7 @@ mod tests {
|
||||
)
|
||||
.unwrap();
|
||||
let row = seed_library_and_face(&mut dao, Some(p.id));
|
||||
let joined =
|
||||
hydrate_face_with_person(&mut dao, &ctx(), row).expect("hydrate assigned");
|
||||
let joined = hydrate_face_with_person(&mut dao, &ctx(), row).expect("hydrate assigned");
|
||||
assert_eq!(joined.person_id, Some(p.id));
|
||||
assert_eq!(joined.person_name.as_deref(), Some("Alice"));
|
||||
// Bbox + confidence + source must round-trip — these are what
|
||||
@@ -3345,8 +3418,7 @@ mod tests {
|
||||
// previously-assigned row's serialization.
|
||||
let mut dao = fresh_dao();
|
||||
let row = seed_library_and_face(&mut dao, None);
|
||||
let joined =
|
||||
hydrate_face_with_person(&mut dao, &ctx(), row).expect("hydrate unassigned");
|
||||
let joined = hydrate_face_with_person(&mut dao, &ctx(), row).expect("hydrate unassigned");
|
||||
assert!(joined.person_id.is_none());
|
||||
assert!(joined.person_name.is_none());
|
||||
}
|
||||
@@ -3367,7 +3439,12 @@ mod tests {
|
||||
.execute(dao.connection.lock().unwrap().deref_mut())
|
||||
.expect("seed libraries");
|
||||
|
||||
// Seed image_exif: mix of hashed/unhashed/scanned/cross-library.
|
||||
// Seed image_exif: mix of hashed/unhashed/scanned/cross-library,
|
||||
// plus a video and a mixed-case image extension. Videos register
|
||||
// in image_exif but can never produce a face_detections row, so
|
||||
// the SQL must filter them out — otherwise the per-tick backlog
|
||||
// drain re-pulls them every tick (no marker is ever written, so
|
||||
// they loop forever) and the SCANNED stat is permanently capped.
|
||||
diesel::sql_query(
|
||||
"INSERT INTO image_exif \
|
||||
(library_id, rel_path, content_hash, created_time, last_modified) VALUES \
|
||||
@@ -3375,6 +3452,9 @@ mod tests {
|
||||
(1, 'b.jpg', 'h-b', 0, 0), \
|
||||
(1, 'c.jpg', NULL, 0, 0), \
|
||||
(1, 'd.jpg', 'h-d', 0, 0), \
|
||||
(1, 'movie.mp4', 'h-mp4', 0, 0), \
|
||||
(1, 'clip.MOV', 'h-mov', 0, 0), \
|
||||
(1, 'photo.JPG', 'h-jpg-upper', 0, 0), \
|
||||
(2, 'e.jpg', 'h-e', 0, 0)",
|
||||
)
|
||||
.execute(dao.connection.lock().unwrap().deref_mut())
|
||||
@@ -3388,16 +3468,26 @@ mod tests {
|
||||
.list_unscanned_candidates(&ctx(), 1, 10)
|
||||
.expect("list unscanned");
|
||||
|
||||
let hashes: std::collections::HashSet<_> =
|
||||
cands.iter().map(|(_, h)| h.clone()).collect();
|
||||
let hashes: std::collections::HashSet<_> = cands.iter().map(|(_, h)| h.clone()).collect();
|
||||
|
||||
// Should contain a and d (hashed, unscanned, library 1).
|
||||
// Should contain a, d, and the upper-case .JPG (image-extension
|
||||
// match is case-insensitive).
|
||||
assert!(hashes.contains("h-a"), "missing h-a: {:?}", hashes);
|
||||
assert!(hashes.contains("h-d"), "missing h-d: {:?}", hashes);
|
||||
// Should NOT contain b (scanned), c (no hash), e (other library).
|
||||
assert!(
|
||||
hashes.contains("h-jpg-upper"),
|
||||
"missing h-jpg-upper: {:?}",
|
||||
hashes
|
||||
);
|
||||
// Should NOT contain b (scanned), c (no hash), e (other library),
|
||||
// or videos (mp4/mov are not image extensions).
|
||||
assert!(!hashes.contains("h-b"), "expected h-b filtered (scanned)");
|
||||
assert!(!hashes.contains("h-e"), "expected h-e filtered (other library)");
|
||||
assert_eq!(cands.len(), 2, "unexpected candidates: {:?}", cands);
|
||||
assert!(
|
||||
!hashes.contains("h-e"),
|
||||
"expected h-e filtered (other library)"
|
||||
);
|
||||
assert!(!hashes.contains("h-mp4"), "expected h-mp4 filtered (video)");
|
||||
assert!(!hashes.contains("h-mov"), "expected h-mov filtered (video)");
|
||||
assert_eq!(cands.len(), 3, "unexpected candidates: {:?}", cands);
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user