feature/duplicate-detection #73

Merged
cameron merged 4 commits from feature/duplicate-detection into master 2026-05-03 22:34:50 +00:00
Showing only changes of commit 98057c98a1 - Show all commits

View File

@@ -393,14 +393,14 @@ fn group_exact(rows: Vec<DuplicateRow>) -> Vec<DuplicateGroup> {
groups
}
/// Bits set in a "useful" perceptual hash. Below this many or above
/// (64 - this many), the image is too uniform to compare meaningfully
/// — pHash collapses to all-zeros for solid colors, all-ones for
/// inverted-uniform, and a few in-between for low-frequency content.
/// Without this filter, every flat sky / black frame / monochrome
/// scan ends up Hamming-distance-zero from every other one, producing
/// a single mega-cluster of hundreds of unrelated photos.
const MIN_INFORMATIVE_POPCOUNT: u32 = 8;
/// Bits set in a "useful" perceptual hash. Real photographic content
/// produces ~50/50 bit distributions; anything outside the [16, 48]
/// band is low-entropy structure (uniform skies, black frames,
/// monochrome scans, faded film) where pHash collapses to near-
/// uniform values that match at trivially small Hamming distances
/// across hundreds of unrelated images. The 8/56 band that shipped
/// first was too permissive — even at threshold=4 the false-positive
/// cluster persisted.
const MIN_INFORMATIVE_POPCOUNT: u32 = 16;
/// Upper bound of the informative band, symmetric around 32 (half of
/// the 64 hash bits): derived so the two constants can never drift.
const MAX_INFORMATIVE_POPCOUNT: u32 = 64 - MIN_INFORMATIVE_POPCOUNT;
#[inline]
@@ -409,6 +409,21 @@ fn is_informative_hash(h: i64) -> bool {
(MIN_INFORMATIVE_POPCOUNT..=MAX_INFORMATIVE_POPCOUNT).contains(&pop)
}
/// Derive the dHash validation budget from the pHash discovery
/// budget. pHash drives candidate discovery (BK-tree neighbourhood
/// lookup); dHash must independently agree before two rows are
/// unioned. Giving dHash roughly half the distance budget lets a
/// real near-dup (strong on both signals) through while vetoing an
/// incidental pHash collision on uniform content.
///
/// Never returns less than 2: even at phash threshold 4, genuine
/// resampling can flip a low-frequency gradient bit in dHash when
/// the visual content is identical, so a 1-bit jitter must pass.
#[inline]
fn dhash_threshold(phash_threshold: u32) -> u32 {
    let half = phash_threshold / 2;
    if half < 2 { 2 } else { half }
}
/// Single-link cluster the input rows by Hamming distance over their
/// pHash, with `threshold` as the maximum distance for an edge. Rows
/// without a pHash, or with a degenerate (low-entropy) pHash, are
@@ -446,12 +461,15 @@ fn cluster_perceptual(rows: Vec<DuplicateRow>, threshold: u32) -> Vec<DuplicateG
}
// Union-find over edges within `threshold`. For a candidate pair
// surfaced by the pHash BK-tree, ALSO require dHash within the
// same threshold when both rows have one — pHash agreement on
// low-entropy structure can be incidental, but pHash AND dHash
// both agreeing is a strong near-dup signal. When dHash is
// missing on either side we fall back to pHash-only (decode-
// failure parity behavior; these rows are rare).
// surfaced by the pHash BK-tree, require dHash within a *stricter*
// threshold (`dhash_threshold(threshold)`) before unioning. pHash
// agreement on low-entropy structure can be incidental; pHash
// agreement AND dHash within roughly half that distance is a
// strong near-dup signal. dHash on either side missing → reject
// (was: trust pHash alone). Missing dHash means we can't validate
// the candidate, and the false-positive cost outweighs the rare
// case of a partial backfill.
let dhash_max = dhash_threshold(threshold);
let mut uf = UnionFind::new(candidates.len());
for (idx, row) in candidates.iter().enumerate() {
let Some(p) = row.phash_64 else { continue };
@@ -464,17 +482,13 @@ fn cluster_perceptual(rows: Vec<DuplicateRow>, threshold: u32) -> Vec<DuplicateG
continue;
}
let other = &candidates[neighbour.idx];
// dHash double-check.
let dhash_ok = match (row.dhash_64, other.dhash_64) {
(Some(a), Some(b)) => {
(a as u64 ^ b as u64).count_ones() <= threshold
(a as u64 ^ b as u64).count_ones() <= dhash_max
&& is_informative_hash(a)
&& is_informative_hash(b)
}
// Missing dHash on either side: trust pHash alone
// rather than dropping the candidate, so partial
// backfills don't silently disappear.
_ => true,
_ => false,
};
if dhash_ok {
uf.union(idx, neighbour.idx);
@@ -489,9 +503,16 @@ fn cluster_perceptual(rows: Vec<DuplicateRow>, threshold: u32) -> Vec<DuplicateG
by_root.entry(root).or_default().push(row);
}
let mut groups: Vec<DuplicateGroup> = by_root
// Medoid-validate each cluster to break single-link chains.
// Single-link unions any pair within threshold; that means a chain
// A↔B↔C can collapse into one cluster even when A and C aren't
// similar. The medoid pass picks the cluster's most-central member
// and drops any other whose distance to it exceeds threshold —
// chains lose their tail, dense real-near-dup clusters keep all
// members. Discard clusters that drop below 2 after refinement.
let groups: Vec<DuplicateGroup> = by_root
.into_values()
.filter(|cluster| cluster.len() > 1)
.filter_map(|cluster| refine_cluster(cluster, threshold, dhash_max))
.map(|cluster| {
let representative_hash = cluster[0].content_hash.clone();
DuplicateGroup {
@@ -501,6 +522,7 @@ fn cluster_perceptual(rows: Vec<DuplicateRow>, threshold: u32) -> Vec<DuplicateG
}
})
.collect();
let mut groups = groups;
groups.sort_by(|a, b| {
b.members
.len()
@@ -510,6 +532,69 @@ fn cluster_perceptual(rows: Vec<DuplicateRow>, threshold: u32) -> Vec<DuplicateG
groups
}
/// Tighten a single-link cluster to its medoid neighbourhood. Returns
/// `None` when fewer than 2 members survive — caller drops the cluster.
///
/// Single-link union-find can chain A↔B↔C into one cluster even when
/// A and C are dissimilar; this pass picks the most-central member
/// (smallest summed pHash+dHash distance to the rest) and drops every
/// other member outside `phash_max`/`dhash_max` of it.
fn refine_cluster(
    cluster: Vec<DuplicateRow>,
    phash_max: u32,
    dhash_max: u32,
) -> Option<Vec<DuplicateRow>> {
    if cluster.len() < 2 {
        return None;
    }
    if cluster.len() == 2 {
        // A two-member cluster cannot contain a chain; the union step
        // already validated both signals when it joined the pair.
        return Some(cluster);
    }
    // Snapshot both hashes per member. Missing hashes map to 0; in
    // practice multi-member clusters only contain fully-hashed rows,
    // since the union step rejects candidates lacking either hash.
    let pairs: Vec<(u64, u64)> = cluster
        .iter()
        .map(|r| (r.phash_64.unwrap_or(0) as u64, r.dhash_64.unwrap_or(0) as u64))
        .collect();
    // Medoid selection: minimise the summed Hamming distance over
    // both hash signals. Strict `<` makes ties resolve to the
    // earliest index, so the result is deterministic for a given
    // input ordering.
    let mut medoid = 0usize;
    let mut lowest = u32::MAX;
    for (i, &(pi, di)) in pairs.iter().enumerate() {
        let total = pairs
            .iter()
            .enumerate()
            .filter(|&(j, _)| j != i)
            .fold(0u32, |acc, (_, &(pj, dj))| {
                acc.saturating_add((pi ^ pj).count_ones())
                    .saturating_add((di ^ dj).count_ones())
            });
        if total < lowest {
            lowest = total;
            medoid = i;
        }
    }
    let (medoid_phash, medoid_dhash) = pairs[medoid];
    // Keep the medoid plus everyone inside its neighbourhood; chain
    // tails past the distance budget fall away here.
    let survivors: Vec<DuplicateRow> = cluster
        .into_iter()
        .zip(pairs)
        .enumerate()
        .filter_map(|(i, (row, (p, d)))| {
            let close = (p ^ medoid_phash).count_ones() <= phash_max
                && (d ^ medoid_dhash).count_ones() <= dhash_max;
            (i == medoid || close).then_some(row)
        })
        .collect();
    if survivors.len() < 2 { None } else { Some(survivors) }
}
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
struct HashKey {
phash: u64,
@@ -757,6 +842,36 @@ mod tests {
assert!(cluster_perceptual(rows, 4).is_empty());
}
#[test]
fn cluster_perceptual_breaks_long_chain_at_medoid() {
    // Build a 4-link chain at threshold=2. Each hop flips exactly 2
    // bits, in non-overlapping nibbles, so hops compose additively
    // for the distant pairs:
    //     A↔B = 2, B↔C = 2, C↔D = 2,
    //     A↔C = 4, B↔D = 4, A↔D = 6.
    // Single-link unions all four, but the medoid (B or C) only keeps
    // members within Δ ≤ 2 of itself, so the far endpoint is chopped
    // and exactly 3 members remain.
    const A: i64 = 0x55AA_55AA_55AA_55AA;
    const B: i64 = 0x55AA_55AA_55AA_55A9; // ^0x03 last byte
    const C: i64 = 0x55AA_55AA_55AA_55A5; // ^0x0C from B
    const D: i64 = 0x55AA_55AA_55AA_5595; // ^0x30 from C
    let rows: Vec<_> = [("a.jpg", "h1", A), ("b.jpg", "h2", B), ("c.jpg", "h3", C), ("d.jpg", "h4", D)]
        .into_iter()
        .map(|(path, hash, bits)| row_with_dhash(1, path, hash, Some(bits), Some(bits)))
        .collect();
    let groups = cluster_perceptual(rows, 2);
    assert_eq!(groups.len(), 1);
    assert_eq!(
        groups[0].members.len(),
        3,
        "medoid pass should chop one chain endpoint past Δ=2"
    );
}
/// Sanity-check the BK-tree's metric, which is what the duplicates
/// path actually clusters on.
#[test]