feature/duplicate-detection #73
@@ -393,14 +393,14 @@ fn group_exact(rows: Vec<DuplicateRow>) -> Vec<DuplicateGroup> {
|
||||
groups
|
||||
}
|
||||
|
||||
/// Bits set in a "useful" perceptual hash. Below this many or above
|
||||
/// (64 - this many), the image is too uniform to compare meaningfully
|
||||
/// — pHash collapses to all-zeros for solid colors, all-ones for
|
||||
/// inverted-uniform, and a few in-between for low-frequency content.
|
||||
/// Without this filter, every flat sky / black frame / monochrome
|
||||
/// scan ends up Hamming-distance-zero from every other one, producing
|
||||
/// a single mega-cluster of hundreds of unrelated photos.
|
||||
const MIN_INFORMATIVE_POPCOUNT: u32 = 8;
|
||||
/// Bits set in a "useful" perceptual hash. Real photographic content
|
||||
/// produces ~50/50 bit distributions; anything outside the [16, 48]
|
||||
/// band is low-entropy structure (uniform skies, black frames,
|
||||
/// monochrome scans, faded film) where pHash collapses to near-
|
||||
/// uniform values that trivially Hamming-match across hundreds of unrelated
|
||||
/// images. The 8/56 band that shipped first was too permissive —
|
||||
/// even at threshold=4 the false-positive cluster persisted.
|
||||
const MIN_INFORMATIVE_POPCOUNT: u32 = 16;
|
||||
const MAX_INFORMATIVE_POPCOUNT: u32 = 64 - MIN_INFORMATIVE_POPCOUNT;
|
||||
|
||||
#[inline]
|
||||
@@ -409,6 +409,21 @@ fn is_informative_hash(h: i64) -> bool {
|
||||
(MIN_INFORMATIVE_POPCOUNT..=MAX_INFORMATIVE_POPCOUNT).contains(&pop)
|
||||
}
|
||||
|
||||
/// Derive the dHash distance budget from the pHash distance budget.
///
/// pHash is the candidate-discovery signal (BK-tree neighbourhood
/// lookup); dHash is the validation signal that has to agree before two
/// rows are unioned. dHash is granted half of the pHash budget (integer
/// division), so a real near-dup, which scores well on both hashes,
/// survives while an incidental pHash collision on uniform content is
/// vetoed.
///
/// The result never drops below 2: genuine resampling can flip a
/// low-frequency gradient bit even when the visual content is identical,
/// so a little dHash jitter is always tolerated. The floor decides the
/// result for every `phash_threshold` below 4; note that for thresholds
/// 0 and 1 this makes the dHash budget *larger* than the pHash budget.
#[inline]
fn dhash_threshold(phash_threshold: u32) -> u32 {
    const DHASH_FLOOR: u32 = 2;

    let halved = phash_threshold / 2;
    if halved < DHASH_FLOOR {
        DHASH_FLOOR
    } else {
        halved
    }
}
|
||||
|
||||
/// Single-link cluster the input rows by Hamming distance over their
|
||||
/// pHash, with `threshold` as the maximum distance for an edge. Rows
|
||||
/// without a pHash, or with a degenerate (low-entropy) pHash, are
|
||||
@@ -446,12 +461,15 @@ fn cluster_perceptual(rows: Vec<DuplicateRow>, threshold: u32) -> Vec<DuplicateG
|
||||
}
|
||||
|
||||
// Union-find over edges within `threshold`. For a candidate pair
|
||||
// surfaced by the pHash BK-tree, ALSO require dHash within the
|
||||
// same threshold when both rows have one — pHash agreement on
|
||||
// low-entropy structure can be incidental, but pHash AND dHash
|
||||
// both agreeing is a strong near-dup signal. When dHash is
|
||||
// missing on either side we fall back to pHash-only (decode-
|
||||
// failure parity behavior; these rows are rare).
|
||||
// surfaced by the pHash BK-tree, require dHash within a *stricter*
|
||||
// threshold (`dhash_threshold(threshold)`) before unioning. pHash
|
||||
// agreement on low-entropy structure can be incidental; pHash
|
||||
// agreement AND dHash within roughly half that distance is a
|
||||
// strong near-dup signal. dHash on either side missing → reject
|
||||
// (was: trust pHash alone). Missing dHash means we can't validate
|
||||
// the candidate, and the false-positive cost outweighs the rare
|
||||
// case of a partial backfill.
|
||||
let dhash_max = dhash_threshold(threshold);
|
||||
let mut uf = UnionFind::new(candidates.len());
|
||||
for (idx, row) in candidates.iter().enumerate() {
|
||||
let Some(p) = row.phash_64 else { continue };
|
||||
@@ -464,17 +482,13 @@ fn cluster_perceptual(rows: Vec<DuplicateRow>, threshold: u32) -> Vec<DuplicateG
|
||||
continue;
|
||||
}
|
||||
let other = &candidates[neighbour.idx];
|
||||
// dHash double-check.
|
||||
let dhash_ok = match (row.dhash_64, other.dhash_64) {
|
||||
(Some(a), Some(b)) => {
|
||||
(a as u64 ^ b as u64).count_ones() <= threshold
|
||||
(a as u64 ^ b as u64).count_ones() <= dhash_max
|
||||
&& is_informative_hash(a)
|
||||
&& is_informative_hash(b)
|
||||
}
|
||||
// Missing dHash on either side: trust pHash alone
|
||||
// rather than dropping the candidate, so partial
|
||||
// backfills don't silently disappear.
|
||||
_ => true,
|
||||
_ => false,
|
||||
};
|
||||
if dhash_ok {
|
||||
uf.union(idx, neighbour.idx);
|
||||
@@ -489,9 +503,16 @@ fn cluster_perceptual(rows: Vec<DuplicateRow>, threshold: u32) -> Vec<DuplicateG
|
||||
by_root.entry(root).or_default().push(row);
|
||||
}
|
||||
|
||||
let mut groups: Vec<DuplicateGroup> = by_root
|
||||
// Medoid-validate each cluster to break single-link chains.
|
||||
// Single-link unions any pair within threshold; that means a chain
|
||||
// A↔B↔C can collapse into one cluster even when A and C aren't
|
||||
// similar. The medoid pass picks the cluster's most-central member
|
||||
// and drops any other whose distance to it exceeds threshold —
|
||||
// chains lose their tail, dense real-near-dup clusters keep all
|
||||
// members. Discard clusters that drop below 2 after refinement.
|
||||
let groups: Vec<DuplicateGroup> = by_root
|
||||
.into_values()
|
||||
.filter(|cluster| cluster.len() > 1)
|
||||
.filter_map(|cluster| refine_cluster(cluster, threshold, dhash_max))
|
||||
.map(|cluster| {
|
||||
let representative_hash = cluster[0].content_hash.clone();
|
||||
DuplicateGroup {
|
||||
@@ -501,6 +522,7 @@ fn cluster_perceptual(rows: Vec<DuplicateRow>, threshold: u32) -> Vec<DuplicateG
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
let mut groups = groups;
|
||||
groups.sort_by(|a, b| {
|
||||
b.members
|
||||
.len()
|
||||
@@ -510,6 +532,69 @@ fn cluster_perceptual(rows: Vec<DuplicateRow>, threshold: u32) -> Vec<DuplicateG
|
||||
groups
|
||||
}
|
||||
|
||||
/// Tighten a single-link cluster to its medoid neighbourhood. Returns
|
||||
/// `None` when fewer than 2 members survive — caller drops the cluster.
|
||||
fn refine_cluster(
|
||||
cluster: Vec<DuplicateRow>,
|
||||
phash_max: u32,
|
||||
dhash_max: u32,
|
||||
) -> Option<Vec<DuplicateRow>> {
|
||||
if cluster.len() < 2 {
|
||||
return None;
|
||||
}
|
||||
if cluster.len() == 2 {
|
||||
// No chain can exist with only two members; the union-find
|
||||
// already guaranteed both signals validated when joining.
|
||||
return Some(cluster);
|
||||
}
|
||||
|
||||
// Pick the medoid: member whose summed pHash+dHash distance to the
|
||||
// rest of the cluster is smallest. Stable-deterministic via the
|
||||
// first-best-wins tie break (lower content_hash wins via natural
|
||||
// iteration order from the BK-tree input ordering).
|
||||
let phashes: Vec<u64> = cluster
|
||||
.iter()
|
||||
.map(|r| r.phash_64.unwrap_or(0) as u64)
|
||||
.collect();
|
||||
let dhashes: Vec<u64> = cluster
|
||||
.iter()
|
||||
.map(|r| r.dhash_64.unwrap_or(0) as u64)
|
||||
.collect();
|
||||
|
||||
let mut best_idx = 0usize;
|
||||
let mut best_score = u32::MAX;
|
||||
for i in 0..cluster.len() {
|
||||
let mut score: u32 = 0;
|
||||
for j in 0..cluster.len() {
|
||||
if i == j {
|
||||
continue;
|
||||
}
|
||||
score = score.saturating_add((phashes[i] ^ phashes[j]).count_ones());
|
||||
score = score.saturating_add((dhashes[i] ^ dhashes[j]).count_ones());
|
||||
}
|
||||
if score < best_score {
|
||||
best_score = score;
|
||||
best_idx = i;
|
||||
}
|
||||
}
|
||||
|
||||
let medoid_phash = phashes[best_idx];
|
||||
let medoid_dhash = dhashes[best_idx];
|
||||
|
||||
let kept: Vec<DuplicateRow> = cluster
|
||||
.into_iter()
|
||||
.enumerate()
|
||||
.filter(|(i, _)| {
|
||||
*i == best_idx
|
||||
|| ((phashes[*i] ^ medoid_phash).count_ones() <= phash_max
|
||||
&& (dhashes[*i] ^ medoid_dhash).count_ones() <= dhash_max)
|
||||
})
|
||||
.map(|(_, r)| r)
|
||||
.collect();
|
||||
|
||||
if kept.len() < 2 { None } else { Some(kept) }
|
||||
}
|
||||
|
||||
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
|
||||
struct HashKey {
|
||||
phash: u64,
|
||||
@@ -757,6 +842,36 @@ mod tests {
|
||||
assert!(cluster_perceptual(rows, 4).is_empty());
|
||||
}
|
||||
|
||||
#[test]
fn cluster_perceptual_breaks_long_chain_at_medoid() {
    // Four-member chain clustered at threshold=2. Each hop flips two
    // fresh bits in the low byte, so neighbouring members sit exactly at
    // the threshold while non-neighbours drift further apart:
    //   A↔B = 2, B↔C = 2, C↔D = 2,
    //   A↔C = 4, B↔D = 4, A↔D = 6.
    // Single-link unions all four; the medoid (B or C) only keeps
    // members within Δ ≤ 2 of itself, so the far endpoint is chopped and
    // exactly 3 members remain.
    const A: i64 = 0x55AA_55AA_55AA_55AA;
    const B: i64 = 0x55AA_55AA_55AA_55A9; // A ^ 0x03
    const C: i64 = 0x55AA_55AA_55AA_55A5; // B ^ 0x0C
    const D: i64 = 0x55AA_55AA_55AA_5595; // C ^ 0x30

    // Every row uses the same bit pattern for its pHash and its dHash.
    let rows: Vec<_> = [
        ("a.jpg", "h1", A),
        ("b.jpg", "h2", B),
        ("c.jpg", "h3", C),
        ("d.jpg", "h4", D),
    ]
    .iter()
    .map(|&(path, hash, bits)| row_with_dhash(1, path, hash, Some(bits), Some(bits)))
    .collect();

    let groups = cluster_perceptual(rows, 2);

    assert_eq!(groups.len(), 1);
    let surviving_members = groups[0].members.len();
    assert_eq!(
        surviving_members, 3,
        "medoid pass should chop one chain endpoint past Δ=2"
    );
}
|
||||
|
||||
/// Sanity-check the BK-tree's metric, which is what the duplicates
|
||||
/// path actually clusters on.
|
||||
#[test]
|
||||
|
||||
Reference in New Issue
Block a user