diff --git a/src/bin/backfill_perceptual_hash.rs b/src/bin/backfill_perceptual_hash.rs index 2f8c0ad..ebf02c3 100644 --- a/src/bin/backfill_perceptual_hash.rs +++ b/src/bin/backfill_perceptual_hash.rs @@ -108,11 +108,7 @@ fn main() -> anyhow::Result<()> { // releases the GIL-equivalent. rayon's default thread pool // matches the host's logical-core count which is the right // ceiling for image_hasher's DCT pass. - let results: Vec<( - i32, - String, - FilePerceptualResult, - )> = rows + let results: Vec<(i32, String, FilePerceptualResult)> = rows .into_par_iter() .map(|(library_id, rel_path)| { let abs = libs_by_id @@ -158,13 +154,39 @@ fn main() -> anyhow::Result<()> { } } FilePerceptualResult::DecodeFailed => { - // Mark as "we tried" so the next run doesn't keep - // hammering this file. We persist NULL/NULL — - // unfortunately that leaves it eligible for the - // next run. The honest fix is a separate "perceptual - // hash attempted" timestamp; for now we accept the - // re-attempt cost since decode-failure rate is low. - total_decode_failures += 1; + // Persist phash_64=0/dhash_64=0 as a "tried, + // unhashable" sentinel so this row leaves the + // `phash_64 IS NULL` candidate set and the + // backfill doesn't infinite-loop on a queue of + // undecodable formats (HEIC, RAW, CMYK JPEGs, + // truncated bytes). The all-zero hash is + // explicitly excluded from clustering by + // is_informative_hash in duplicates.rs, so it + // won't pollute group output — it just becomes + // invisible to the duplicate finder. 
+ log::debug!( + "perceptual decode failed for {} (lib {}); marking unhashable", + rel_path, + library_id + ); + match guard.backfill_perceptual_hash( + &ctx, + *library_id, + rel_path, + Some(0), + Some(0), + ) { + Ok(_) => { + total_decode_failures += 1; + } + Err(e) => { + pb.println(format!( + "persist error (decode-fail sentinel) for {}: {:?}", + rel_path, e + )); + total_errors += 1; + } + } } FilePerceptualResult::MissingOnDisk => { total_missing += 1; diff --git a/src/duplicates.rs b/src/duplicates.rs index 94093f5..d6b70b8 100644 --- a/src/duplicates.rs +++ b/src/duplicates.rs @@ -196,18 +196,16 @@ async fn list_perceptual_handler( let include_resolved = query.include_resolved.unwrap_or(false); // Cache hit? - if let Ok(guard) = PERCEPTUAL_CACHE.lock() { - if let Some(entry) = guard.as_ref() { - if entry.library_id == library_id - && entry.threshold == threshold - && entry.include_resolved == include_resolved - && entry.computed_at.elapsed() < PERCEPTUAL_CACHE_TTL - { - return HttpResponse::Ok().json(GroupsResponse { - groups: entry.groups.clone(), - }); - } - } + if let Ok(guard) = PERCEPTUAL_CACHE.lock() + && let Some(entry) = guard.as_ref() + && entry.library_id == library_id + && entry.threshold == threshold + && entry.include_resolved == include_resolved + && entry.computed_at.elapsed() < PERCEPTUAL_CACHE_TTL + { + return HttpResponse::Ok().json(GroupsResponse { + groups: entry.groups.clone(), + }); } let rows = { @@ -302,23 +300,23 @@ async fn resolve_handler( // demoted's tag set onto the survivor before flipping the // soft-mark. For exact dups (same content_hash), tags are // already shared at the bytes layer — the union is a no-op. 
- if demoted.content_hash != survivor.content_hash { - if let Err(e) = dao.union_perceptual_tags( + if demoted.content_hash != survivor.content_hash + && let Err(e) = dao.union_perceptual_tags( &span_context, &survivor.content_hash, &demoted.content_hash, &survivor.rel_path, - ) { - log::warn!( - "duplicates.resolve: tag union failed for {}: {:?}", - demoted.rel_path, - e - ); - // Continue with the soft-mark anyway — losing tag - // continuity is recoverable (unresolve restores the - // demoted row's grid presence, and the original tags - // never moved off the demoted hash). - } + ) + { + log::warn!( + "duplicates.resolve: tag union failed for {}: {:?}", + demoted.rel_path, + e + ); + // Continue with the soft-mark anyway — losing tag + // continuity is recoverable (unresolve restores the + // demoted row's grid presence, and the original tags + // never moved off the demoted hash). } if let Err(e) = dao.set_duplicate_of( @@ -395,17 +393,43 @@ fn group_exact(rows: Vec) -> Vec { groups } +/// Bits set in a "useful" perceptual hash. Below this many or above +/// (64 - this many), the image is too uniform to compare meaningfully +/// — pHash collapses to all-zeros for solid colors, all-ones for +/// inverted-uniform, and a few in-between for low-frequency content. +/// Without this filter, every flat sky / black frame / monochrome +/// scan ends up Hamming-distance-zero from every other one, producing +/// a single mega-cluster of hundreds of unrelated photos. +const MIN_INFORMATIVE_POPCOUNT: u32 = 8; +const MAX_INFORMATIVE_POPCOUNT: u32 = 64 - MIN_INFORMATIVE_POPCOUNT; + +#[inline] +fn is_informative_hash(h: i64) -> bool { + let pop = (h as u64).count_ones(); + (MIN_INFORMATIVE_POPCOUNT..=MAX_INFORMATIVE_POPCOUNT).contains(&pop) +} + /// Single-link cluster the input rows by Hamming distance over their /// pHash, with `threshold` as the maximum distance for an edge. 
Rows -/// without a pHash are skipped (we already filter at the SQL layer but -/// the type carries an Option for safety). +/// without a pHash, or with a degenerate (low-entropy) pHash, are +/// excluded — they'd chain together unrelated images. +/// +/// Two-signal validation: the BK-tree gives candidate pairs cheaply, +/// then we additionally require dHash agreement before unioning. pHash +/// alone is too permissive; pairing it with dHash collapses the false- +/// positive cluster significantly (different DCT vs gradient +/// signatures on real near-dups still both stay close, but spurious +/// pHash collisions on uniform images don't survive the dHash check). /// /// Implementation: BK-tree neighbourhood lookup per row, union-find -/// over the resulting edges. O(N log N) instead of the O(N²) naive +/// over the validated edges. O(N log N) instead of the O(N²) naive /// pairwise scan; on a 1.26M-row library that's the difference between /// "responds in 1.5 s" and "responds in 25 minutes". fn cluster_perceptual(rows: Vec, threshold: u32) -> Vec { - let candidates: Vec = rows.into_iter().filter(|r| r.phash_64.is_some()).collect(); + let candidates: Vec = rows + .into_iter() + .filter(|r| r.phash_64.is_some_and(is_informative_hash)) + .collect(); if candidates.len() < 2 { return Vec::new(); } @@ -421,7 +445,13 @@ fn cluster_perceptual(rows: Vec, threshold: u32) -> Vec, threshold: u32) -> Vec { + (a as u64 ^ b as u64).count_ones() <= threshold + && is_informative_hash(a) + && is_informative_hash(b) + } + // Missing dHash on either side: trust pHash alone + // rather than dropping the candidate, so partial + // backfills don't silently disappear. 
+ _ => true, + }; + if dhash_ok { uf.union(idx, neighbour.idx); } } @@ -524,7 +570,12 @@ impl UnionFind { pub fn add_duplicate_services(app: App) -> App where - T: ServiceFactory, + T: ServiceFactory< + actix_web::dev::ServiceRequest, + Config = (), + Error = actix_web::Error, + InitError = (), + >, { app.service(web::resource("/duplicates/exact").route(web::get().to(list_exact_handler))) .service( @@ -570,14 +621,61 @@ mod tests { assert_eq!(groups[0].members.len(), 3); } + /// All hashes used below have popcount in the "informative" + /// 8..=56 band so they survive the entropy filter that keeps + /// solid-colour images out of the cluster graph. + const INFORMATIVE_BASE: i64 = 0x55AA_55AA_55AA_55AA; // popcount = 32 + const INFORMATIVE_NEAR: i64 = 0x55AA_55AA_55AA_55AB; // 1-bit away from BASE + const INFORMATIVE_FAR: i64 = 0x6996_6996_6996_6996; // 32-bits away from BASE + + fn row_with_dhash( + library_id: i32, + rel: &str, + hash: &str, + phash: Option, + dhash: Option, + ) -> DuplicateRow { + DuplicateRow { + library_id, + rel_path: rel.into(), + content_hash: hash.into(), + size_bytes: Some(1000), + date_taken: None, + width: None, + height: None, + phash_64: phash, + dhash_64: dhash, + duplicate_of_hash: None, + duplicate_decided_at: None, + } + } + #[test] fn cluster_perceptual_unites_close_hashes() { - // Three rows: two near each other (phash differs by 1 bit), - // one far away. Threshold 4 should merge the close pair. + // Two rows near each other on both pHash and dHash; one far + // on pHash. Threshold 4 should merge the close pair. 
let rows = vec![ - row(1, "a.jpg", "h1", Some(0b0000)), - row(1, "b.jpg", "h2", Some(0b0001)), - row(1, "c.jpg", "h3", Some(i64::MAX)), + row_with_dhash( + 1, + "a.jpg", + "h1", + Some(INFORMATIVE_BASE), + Some(INFORMATIVE_BASE), + ), + row_with_dhash( + 1, + "b.jpg", + "h2", + Some(INFORMATIVE_NEAR), + Some(INFORMATIVE_NEAR), + ), + row_with_dhash( + 1, + "c.jpg", + "h3", + Some(INFORMATIVE_FAR), + Some(INFORMATIVE_FAR), + ), ]; let groups = cluster_perceptual(rows, 4); assert_eq!(groups.len(), 1); @@ -594,8 +692,20 @@ mod tests { #[test] fn cluster_perceptual_threshold_zero_drops_distinct() { let rows = vec![ - row(1, "a.jpg", "h1", Some(0b0000)), - row(1, "b.jpg", "h2", Some(0b0001)), + row_with_dhash( + 1, + "a.jpg", + "h1", + Some(INFORMATIVE_BASE), + Some(INFORMATIVE_BASE), + ), + row_with_dhash( + 1, + "b.jpg", + "h2", + Some(INFORMATIVE_NEAR), + Some(INFORMATIVE_NEAR), + ), ]; let groups = cluster_perceptual(rows, 0); assert!(groups.is_empty()); @@ -603,17 +713,63 @@ mod tests { #[test] fn cluster_perceptual_skips_singletons() { - let rows = vec![row(1, "alone.jpg", "h1", Some(0))]; + let rows = vec![row(1, "alone.jpg", "h1", Some(INFORMATIVE_BASE))]; assert!(cluster_perceptual(rows, 8).is_empty()); } + #[test] + fn cluster_perceptual_filters_low_entropy_hashes() { + // Both 0 (popcount 0) and i64::MAX (popcount 63) fall outside + // the informative band. A pair of these would trivially match + // (Hamming distance to each other small or zero) without the + // entropy filter — that's exactly the regression that was + // producing a giant first cluster of solid-colour images. 
+ let rows = vec![ + row(1, "blank-a.jpg", "h1", Some(0)), + row(1, "blank-b.jpg", "h2", Some(0)), + row(1, "white-a.jpg", "h3", Some(i64::MAX)), + row(1, "white-b.jpg", "h4", Some(i64::MAX)), + ]; + assert!(cluster_perceptual(rows, 8).is_empty()); + } + + #[test] + fn cluster_perceptual_requires_dhash_agreement() { + // pHash within threshold but dHash far apart — the candidate + // edge from the BK-tree must be rejected. Without the dHash + // double-check this would form a 2-member cluster. + let rows = vec![ + row_with_dhash( + 1, + "a.jpg", + "h1", + Some(INFORMATIVE_BASE), + Some(INFORMATIVE_BASE), + ), + row_with_dhash( + 1, + "b.jpg", + "h2", + Some(INFORMATIVE_NEAR), + Some(INFORMATIVE_FAR), + ), + ]; + assert!(cluster_perceptual(rows, 4).is_empty()); + } + /// Sanity-check the BK-tree's metric, which is what the duplicates /// path actually clusters on. #[test] fn hamming_metric_is_symmetric() { let m = HammingMetric; - let a = HashKey { phash: 0b1010, idx: 0 }; - let b = HashKey { phash: 0b0101, idx: 1 }; + let a = HashKey { + phash: 0b1010, + idx: 0, + }; + let b = HashKey { + phash: 0b0101, + idx: 1, + }; let d1 = m.distance(&a, &b); let d2 = m.distance(&b, &a); assert_eq!(d1, d2); diff --git a/src/lib.rs b/src/lib.rs index eb3c252..c110d8e 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -8,10 +8,9 @@ pub mod auth; pub mod bin_progress; pub mod cleanup; pub mod content_hash; -pub mod perceptual_hash; -pub mod duplicates; pub mod data; pub mod database; +pub mod duplicates; pub mod error; pub mod exif; pub mod face_watch; @@ -25,6 +24,7 @@ pub mod library_maintenance; pub mod memories; pub mod otel; pub mod parsers; +pub mod perceptual_hash; pub mod service; pub mod state; pub mod tags; diff --git a/src/main.rs b/src/main.rs index 54504ad..2d598ca 100644 --- a/src/main.rs +++ b/src/main.rs @@ -62,10 +62,9 @@ use opentelemetry::{KeyValue, global}; mod ai; mod auth; mod content_hash; -mod perceptual_hash; -mod duplicates; mod data; mod database; +mod 
duplicates; mod error; mod exif; mod face_watch; @@ -75,6 +74,7 @@ mod files; mod geo; mod libraries; mod library_maintenance; +mod perceptual_hash; mod state; mod tags; mod utils; @@ -671,24 +671,22 @@ async fn upload_image( .to_string(); { let mut dao = exif_dao.lock().expect("Unable to lock ExifDao"); - if let Ok(Some(existing)) = - dao.find_by_content_hash(&span_context, &upload_hash) + if let Ok(Some(existing)) = dao.find_by_content_hash(&span_context, &upload_hash) + && existing.duplicate_of_hash.is_none() { - if existing.duplicate_of_hash.is_none() { - let library_name = libraries::load_all(&mut crate::database::connect()) - .into_iter() - .find(|l| l.id == existing.library_id) - .map(|l| l.name); - span.set_status(Status::Ok); - return HttpResponse::Conflict().json(serde_json::json!({ - "duplicate_of": { - "library_id": existing.library_id, - "rel_path": existing.file_path, - }, - "content_hash": upload_hash, - "library_name": library_name, - })); - } + let library_name = libraries::load_all(&mut crate::database::connect()) + .into_iter() + .find(|l| l.id == existing.library_id) + .map(|l| l.name); + span.set_status(Status::Ok); + return HttpResponse::Conflict().json(serde_json::json!({ + "duplicate_of": { + "library_id": existing.library_id, + "rel_path": existing.file_path, + }, + "content_hash": upload_hash, + "library_name": library_name, + })); } }