From 98057c98a1d2848a890c21f150149262e6f8a380 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Sun, 3 May 2026 18:19:48 -0400 Subject: [PATCH] =?UTF-8?q?duplicates:=20tighten=20perceptual=20cluster=20?= =?UTF-8?q?=E2=80=94=20entropy=20band,=20asymmetric=20dHash,=20medoid=20pr?= =?UTF-8?q?une?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three changes against "still too loose at lowest sensitivity": - Popcount entropy band tightened from [8, 56] to [16, 48]. The wider band let too much low-frequency content through (skies, scans, faded film) where pHash collapses to near-uniform values that match Hamming-trivially across hundreds of unrelated images. - dHash check now uses an asymmetric stricter threshold (dhash_threshold = max(2, threshold/2)). pHash is the candidate- discovery signal; dHash is validation. Splitting the budget means a real near-dup survives both while incidental pHash collisions on uniform content get vetoed. Missing dHash on either side now rejects the edge (was: trust pHash alone). - Single-link union-find can chain weakly-similar images via transitive edges. Added a medoid-validation pass: per cluster, pick the member with smallest summed distance to others, then drop any whose distance to it exceeds threshold. Two new tests pin both invariants. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/duplicates.rs | 159 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 137 insertions(+), 22 deletions(-) diff --git a/src/duplicates.rs b/src/duplicates.rs index d6b70b8..d7b6921 100644 --- a/src/duplicates.rs +++ b/src/duplicates.rs @@ -393,14 +393,14 @@ fn group_exact(rows: Vec) -> Vec { groups } -/// Bits set in a "useful" perceptual hash. Below this many or above -/// (64 - this many), the image is too uniform to compare meaningfully -/// — pHash collapses to all-zeros for solid colors, all-ones for -/// inverted-uniform, and a few in-between for low-frequency content. 
-/// Without this filter, every flat sky / black frame / monochrome -/// scan ends up Hamming-distance-zero from every other one, producing -/// a single mega-cluster of hundreds of unrelated photos. -const MIN_INFORMATIVE_POPCOUNT: u32 = 8; +/// Bits set in a "useful" perceptual hash. Real photographic content +/// produces ~50/50 bit distributions; anything outside the [16, 48] +/// band is low-entropy structure (uniform skies, black frames, +/// monochrome scans, faded film) where pHash collapses to near- +/// uniform values that match Hamming-trivially across hundreds of +/// unrelated images. The 8/56 band that shipped first was too permissive — +/// even at threshold=4 the false-positive cluster persisted. +const MIN_INFORMATIVE_POPCOUNT: u32 = 16; const MAX_INFORMATIVE_POPCOUNT: u32 = 64 - MIN_INFORMATIVE_POPCOUNT; #[inline] @@ -409,6 +409,21 @@ fn is_informative_hash(h: i64) -> bool { (MIN_INFORMATIVE_POPCOUNT..=MAX_INFORMATIVE_POPCOUNT).contains(&pop) } +/// dHash gets a stricter threshold than pHash. pHash is the +/// candidate-discovery signal (BK-tree neighbourhood lookup); dHash +/// is the validation signal that has to actively agree before we +/// union. Splitting the budget asymmetrically means a real near-dup +/// (which scores well on both) survives while an incidental pHash +/// collision (uniform-content false positive) gets vetoed. +/// +/// Floor of 2 so pHash thresholds below 4 still allow up to 2 bits of dHash jitter — +/// genuine resampling can flip a low-frequency gradient bit even +/// when the visual content is identical. +#[inline] +fn dhash_threshold(phash_threshold: u32) -> u32 { + (phash_threshold / 2).max(2) +} + /// Single-link cluster the input rows by Hamming distance over their /// pHash, with `threshold` as the maximum distance for an edge. 
Rows /// without a pHash, or with a degenerate (low-entropy) pHash, are @@ -446,12 +461,15 @@ fn cluster_perceptual(rows: Vec, threshold: u32) -> Vec, threshold: u32) -> Vec { - (a as u64 ^ b as u64).count_ones() <= threshold + (a as u64 ^ b as u64).count_ones() <= dhash_max && is_informative_hash(a) && is_informative_hash(b) } - // Missing dHash on either side: trust pHash alone - // rather than dropping the candidate, so partial - // backfills don't silently disappear. - _ => true, + _ => false, }; if dhash_ok { uf.union(idx, neighbour.idx); @@ -489,9 +503,16 @@ fn cluster_perceptual(rows: Vec, threshold: u32) -> Vec = by_root + // Medoid-validate each cluster to break single-link chains. + // Single-link unions any pair within threshold; that means a chain + // A↔B↔C can collapse into one cluster even when A and C aren't + // similar. The medoid pass picks the cluster's most-central member + // and drops any other whose distance to it exceeds threshold — + // chains lose their tail, dense real-near-dup clusters keep all + // members. Discard clusters that drop below 2 after refinement. + let groups: Vec = by_root .into_values() - .filter(|cluster| cluster.len() > 1) + .filter_map(|cluster| refine_cluster(cluster, threshold, dhash_max)) .map(|cluster| { let representative_hash = cluster[0].content_hash.clone(); DuplicateGroup { @@ -501,6 +522,7 @@ fn cluster_perceptual(rows: Vec, threshold: u32) -> Vec, threshold: u32) -> Vec, + phash_max: u32, + dhash_max: u32, +) -> Option> { + if cluster.len() < 2 { + return None; + } + if cluster.len() == 2 { + // No chain can exist with only two members; the union-find + // already guaranteed both signals validated when joining. + return Some(cluster); + } + + // Pick the medoid: member whose summed pHash+dHash distance to the + // rest of the cluster is smallest. Stable-deterministic via the + // first-best-wins tie break (lower content_hash wins via natural + // iteration order from the BK-tree input ordering). 
+ let phashes: Vec = cluster + .iter() + .map(|r| r.phash_64.unwrap_or(0) as u64) + .collect(); + let dhashes: Vec = cluster + .iter() + .map(|r| r.dhash_64.unwrap_or(0) as u64) + .collect(); + + let mut best_idx = 0usize; + let mut best_score = u32::MAX; + for i in 0..cluster.len() { + let mut score: u32 = 0; + for j in 0..cluster.len() { + if i == j { + continue; + } + score = score.saturating_add((phashes[i] ^ phashes[j]).count_ones()); + score = score.saturating_add((dhashes[i] ^ dhashes[j]).count_ones()); + } + if score < best_score { + best_score = score; + best_idx = i; + } + } + + let medoid_phash = phashes[best_idx]; + let medoid_dhash = dhashes[best_idx]; + + let kept: Vec = cluster + .into_iter() + .enumerate() + .filter(|(i, _)| { + *i == best_idx + || ((phashes[*i] ^ medoid_phash).count_ones() <= phash_max + && (dhashes[*i] ^ medoid_dhash).count_ones() <= dhash_max) + }) + .map(|(_, r)| r) + .collect(); + + if kept.len() < 2 { None } else { Some(kept) } +} + #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] struct HashKey { phash: u64, @@ -757,6 +842,36 @@ mod tests { assert!(cluster_perceptual(rows, 4).is_empty()); } + #[test] + fn cluster_perceptual_breaks_long_chain_at_medoid() { + // 4-node chain at threshold=2 with pairwise distances chosen + // so single-link unions all four but the endpoints sit past + // the medoid's neighbourhood. Bit positions hop by exactly 2 + // bits per step, in non-overlapping bit pairs, so consecutive + // hops compose into wider distant-pair distances: + // A↔B = 2, B↔C = 2, C↔D = 2, + // A↔C = 4, B↔D = 4, A↔D = 6. + // Medoid (B or C) keeps Δ ≤ 2 of itself; the far endpoint + // gets chopped, leaving exactly 3 members. 
+ const A: i64 = 0x55AA_55AA_55AA_55AA; + const B: i64 = 0x55AA_55AA_55AA_55A9; // ^0x03 last byte + const C: i64 = 0x55AA_55AA_55AA_55A5; // ^0x0C from B + const D: i64 = 0x55AA_55AA_55AA_5595; // ^0x30 from C + let rows = vec![ + row_with_dhash(1, "a.jpg", "h1", Some(A), Some(A)), + row_with_dhash(1, "b.jpg", "h2", Some(B), Some(B)), + row_with_dhash(1, "c.jpg", "h3", Some(C), Some(C)), + row_with_dhash(1, "d.jpg", "h4", Some(D), Some(D)), + ]; + let groups = cluster_perceptual(rows, 2); + assert_eq!(groups.len(), 1); + assert_eq!( + groups[0].members.len(), + 3, + "medoid pass should chop one chain endpoint past Δ=2" + ); + } + /// Sanity-check the BK-tree's metric, which is what the duplicates /// path actually clusters on. #[test]