From 323097c650bf000f04e8cf9b06731c4fff856d94 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Thu, 30 Apr 2026 22:41:20 +0000 Subject: [PATCH] faces: count distinct content_hash in stats total_photos MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit face_detections is keyed on content_hash (one row per unique bytes, shared across libraries / duplicate paths) but total_photos was COUNT(*) over image_exif rows. A file present at multiple rel_paths or across libraries inflated the denominator without inflating the numerator, leaving a permanent gap (e.g. 1101/1103 with nothing actually pending detection). Switch total_photos to COUNT(DISTINCT content_hash) so numerator and denominator live in the same domain. Exclude rows with NULL content_hash from the count — they're held in the hash-backfill backlog, not the detection backlog, and counting them pins the bar below 100% for the duration of that pass. CLAUDE.md: document the stats domain rule next to the rest of the face-detection notes. Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 2 ++ src/faces.rs | 21 +++++++++++++++++---- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 6d4a751..ee00921 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -233,6 +233,8 @@ ImageApi owns the face data; Apollo (sibling repo) hosts the insightface inferen **Rerun preserves manual rows** (`POST /image/faces/{id}/rerun`): only `source='auto'` rows are deleted before re-running detection. `already_scanned` returns true on ANY row, so a photo whose only faces are manually drawn never auto-redetects. +**Stats domain — content_hash, not file rows** (`FaceDao::stats` in `src/faces.rs`): `total_photos` counts `DISTINCT content_hash` over `image_exif` (filtered to image extensions, `content_hash IS NOT NULL`), and so do `scanned` / `with_faces` / `no_faces` / `failed` over `face_detections`. 
Numerator and denominator must live in the same domain — `face_detections` is keyed on content_hash, so the same JPEG present at two rel_paths or in two libraries scans once. Counting `image_exif` rows in the denominator inflated total by one per duplicate file and produced a permanent gap (e.g. 1101/1103 with nothing actually pending). Hash-less rows are excluded from total_photos while they sit in the `backfill_unhashed_backlog` queue; otherwise the bar pins below 100% for the duration of that backfill even though those rows aren't pending detection yet — they're pending hashing. + Module map: - `src/faces.rs` — `FaceDao` trait + `SqliteFaceDao` impl, route handlers for `/faces/*`, `/image/faces/*`, `/persons/*`. Mirror of `tags.rs` layout. - `src/face_watch.rs` — Tokio orchestration for the file-watch detect pass; `filter_excluded` (PathExcluder + image-extension filter), `read_image_bytes_for_detect` (RAW preview fallback). diff --git a/src/faces.rs b/src/faces.rs index 72ce370..0c8ea16 100644 --- a/src/faces.rs +++ b/src/faces.rs @@ -1045,20 +1045,33 @@ impl FaceDao for SqliteFaceDao { // SCANNED can actually reach 100%: videos sit in `image_exif` but // never get a `face_detections` row, so counting them here // permanently caps the percentage below 100%. + // + // Count DISTINCT content_hash (not rows) so the numerator + // (`scanned`, also distinct-content_hash) and denominator live + // in the same domain. Without this, a file present at multiple + // rel_paths or across libraries inflates total_photos by one + // per duplicate row while face_detections — keyed on + // content_hash — counts the bytes once, leaving a permanent + // gap (e.g. 1101/1103 with nothing actually pending). Rows + // with NULL content_hash are excluded; they're held in the + // hash-backfill backlog and counting them would pin the bar + // below 100% for the duration of that backfill. 
let total_photos: i64 = { let ext_predicate = image_path_predicate("rel_path"); let row: CountRow = if let Some(lib) = library_id { let sql = format!( - "SELECT COUNT(*) AS count FROM image_exif \ - WHERE library_id = ? AND {ext_predicate}" + "SELECT COUNT(DISTINCT content_hash) AS count FROM image_exif \ + WHERE library_id = ? AND content_hash IS NOT NULL AND {ext_predicate}" ); diesel::sql_query(sql) .bind::<diesel::sql_types::Integer, _>(lib) .get_result(conn.deref_mut()) .with_context(|| "stats: total_photos")? } else { - let sql = - format!("SELECT COUNT(*) AS count FROM image_exif WHERE {ext_predicate}"); + let sql = format!( + "SELECT COUNT(DISTINCT content_hash) AS count FROM image_exif \ + WHERE content_hash IS NOT NULL AND {ext_predicate}" + ); diesel::sql_query(sql) .get_result(conn.deref_mut()) .with_context(|| "stats: total_photos")? -- 2.49.1