feature/duplicate-detection #73

Merged
cameron merged 4 commits from feature/duplicate-detection into master 2026-05-03 22:34:50 +00:00
4 changed files with 251 additions and 75 deletions
Showing only changes of commit 7ca888e95d - Show all commits

View File

@@ -108,11 +108,7 @@ fn main() -> anyhow::Result<()> {
// releases the GIL-equivalent. rayon's default thread pool // releases the GIL-equivalent. rayon's default thread pool
// matches the host's logical-core count which is the right // matches the host's logical-core count which is the right
// ceiling for image_hasher's DCT pass. // ceiling for image_hasher's DCT pass.
let results: Vec<( let results: Vec<(i32, String, FilePerceptualResult)> = rows
i32,
String,
FilePerceptualResult,
)> = rows
.into_par_iter() .into_par_iter()
.map(|(library_id, rel_path)| { .map(|(library_id, rel_path)| {
let abs = libs_by_id let abs = libs_by_id
@@ -158,14 +154,40 @@ fn main() -> anyhow::Result<()> {
} }
} }
FilePerceptualResult::DecodeFailed => { FilePerceptualResult::DecodeFailed => {
// Mark as "we tried" so the next run doesn't keep // Persist phash_64=0/dhash_64=0 as a "tried,
// hammering this file. We persist NULL/NULL — // unhashable" sentinel so this row leaves the
// unfortunately that leaves it eligible for the // `phash_64 IS NULL` candidate set and the
// next run. The honest fix is a separate "perceptual // backfill doesn't infinite-loop on a queue of
// hash attempted" timestamp; for now we accept the // unbreakable formats (HEIC, RAW, CMYK JPEGs,
// re-attempt cost since decode-failure rate is low. // truncated bytes). The all-zero hash is
// explicitly excluded from clustering by
// is_informative_hash in duplicates.rs, so it
// won't pollute group output — it just becomes
// invisible to the duplicate finder.
log::debug!(
"perceptual decode failed for {} (lib {}); marking unhashable",
rel_path,
library_id
);
match guard.backfill_perceptual_hash(
&ctx,
*library_id,
rel_path,
Some(0),
Some(0),
) {
Ok(_) => {
total_decode_failures += 1; total_decode_failures += 1;
} }
Err(e) => {
pb.println(format!(
"persist error (decode-fail sentinel) for {}: {:?}",
rel_path, e
));
total_errors += 1;
}
}
}
FilePerceptualResult::MissingOnDisk => { FilePerceptualResult::MissingOnDisk => {
total_missing += 1; total_missing += 1;
} }

View File

@@ -196,9 +196,9 @@ async fn list_perceptual_handler(
let include_resolved = query.include_resolved.unwrap_or(false); let include_resolved = query.include_resolved.unwrap_or(false);
// Cache hit? // Cache hit?
if let Ok(guard) = PERCEPTUAL_CACHE.lock() { if let Ok(guard) = PERCEPTUAL_CACHE.lock()
if let Some(entry) = guard.as_ref() { && let Some(entry) = guard.as_ref()
if entry.library_id == library_id && entry.library_id == library_id
&& entry.threshold == threshold && entry.threshold == threshold
&& entry.include_resolved == include_resolved && entry.include_resolved == include_resolved
&& entry.computed_at.elapsed() < PERCEPTUAL_CACHE_TTL && entry.computed_at.elapsed() < PERCEPTUAL_CACHE_TTL
@@ -207,8 +207,6 @@ async fn list_perceptual_handler(
groups: entry.groups.clone(), groups: entry.groups.clone(),
}); });
} }
}
}
let rows = { let rows = {
let mut dao = exif_dao.lock().expect("exif dao lock"); let mut dao = exif_dao.lock().expect("exif dao lock");
@@ -302,13 +300,14 @@ async fn resolve_handler(
// demoted's tag set onto the survivor before flipping the // demoted's tag set onto the survivor before flipping the
// soft-mark. For exact dups (same content_hash), tags are // soft-mark. For exact dups (same content_hash), tags are
// already shared at the bytes layer — the union is a no-op. // already shared at the bytes layer — the union is a no-op.
if demoted.content_hash != survivor.content_hash { if demoted.content_hash != survivor.content_hash
if let Err(e) = dao.union_perceptual_tags( && let Err(e) = dao.union_perceptual_tags(
&span_context, &span_context,
&survivor.content_hash, &survivor.content_hash,
&demoted.content_hash, &demoted.content_hash,
&survivor.rel_path, &survivor.rel_path,
) { )
{
log::warn!( log::warn!(
"duplicates.resolve: tag union failed for {}: {:?}", "duplicates.resolve: tag union failed for {}: {:?}",
demoted.rel_path, demoted.rel_path,
@@ -319,7 +318,6 @@ async fn resolve_handler(
// demoted row's grid presence, and the original tags // demoted row's grid presence, and the original tags
// never moved off the demoted hash). // never moved off the demoted hash).
} }
}
if let Err(e) = dao.set_duplicate_of( if let Err(e) = dao.set_duplicate_of(
&span_context, &span_context,
@@ -395,17 +393,43 @@ fn group_exact(rows: Vec<DuplicateRow>) -> Vec<DuplicateGroup> {
groups groups
} }
/// Popcount band for a "useful" perceptual hash. Outside
/// [MIN_INFORMATIVE_POPCOUNT, 64 - MIN_INFORMATIVE_POPCOUNT] the image
/// is too uniform to compare meaningfully — pHash degenerates to
/// all-zeros for solid colours, all-ones for inverted-uniform, and a
/// handful of set bits for low-frequency content. Skipping this filter
/// lets every flat sky / black frame / monochrome scan sit at Hamming
/// distance zero from every other one, fusing hundreds of unrelated
/// photos into a single mega-cluster.
const MIN_INFORMATIVE_POPCOUNT: u32 = 8;
const MAX_INFORMATIVE_POPCOUNT: u32 = 64 - MIN_INFORMATIVE_POPCOUNT;
/// True when `h` carries enough set-bit entropy to participate in
/// perceptual clustering (popcount within the informative band).
#[inline]
fn is_informative_hash(h: i64) -> bool {
    // i64::count_ones counts bits of the two's-complement
    // representation, so no u64 cast is needed.
    let set_bits = h.count_ones();
    MIN_INFORMATIVE_POPCOUNT <= set_bits && set_bits <= MAX_INFORMATIVE_POPCOUNT
}
/// Single-link cluster the input rows by Hamming distance over their /// Single-link cluster the input rows by Hamming distance over their
/// pHash, with `threshold` as the maximum distance for an edge. Rows /// pHash, with `threshold` as the maximum distance for an edge. Rows
/// without a pHash are skipped (we already filter at the SQL layer but /// without a pHash, or with a degenerate (low-entropy) pHash, are
/// the type carries an Option for safety). /// excluded — they'd chain together unrelated images.
///
/// Two-signal validation: the BK-tree gives candidate pairs cheaply,
/// then we additionally require dHash agreement before unioning. pHash
/// alone is too permissive; pairing it with dHash collapses the false-
/// positive cluster significantly (different DCT vs gradient
/// signatures on real near-dups still both stay close, but spurious
/// pHash collisions on uniform images don't survive the dHash check).
/// ///
/// Implementation: BK-tree neighbourhood lookup per row, union-find /// Implementation: BK-tree neighbourhood lookup per row, union-find
/// over the resulting edges. O(N log N) instead of the O(N²) naive /// over the validated edges. O(N log N) instead of the O(N²) naive
/// pairwise scan; on a 1.26M-row library that's the difference between /// pairwise scan; on a 1.26M-row library that's the difference between
/// "responds in 1.5 s" and "responds in 25 minutes". /// "responds in 1.5 s" and "responds in 25 minutes".
fn cluster_perceptual(rows: Vec<DuplicateRow>, threshold: u32) -> Vec<DuplicateGroup> { fn cluster_perceptual(rows: Vec<DuplicateRow>, threshold: u32) -> Vec<DuplicateGroup> {
let candidates: Vec<DuplicateRow> = rows.into_iter().filter(|r| r.phash_64.is_some()).collect(); let candidates: Vec<DuplicateRow> = rows
.into_iter()
.filter(|r| r.phash_64.is_some_and(is_informative_hash))
.collect();
if candidates.len() < 2 { if candidates.len() < 2 {
return Vec::new(); return Vec::new();
} }
@@ -421,7 +445,13 @@ fn cluster_perceptual(rows: Vec<DuplicateRow>, threshold: u32) -> Vec<DuplicateG
} }
} }
// Union-find over edges within `threshold`. // Union-find over edges within `threshold`. For a candidate pair
// surfaced by the pHash BK-tree, ALSO require dHash within the
// same threshold when both rows have one — pHash agreement on
// low-entropy structure can be incidental, but pHash AND dHash
// both agreeing is a strong near-dup signal. When dHash is
// missing on either side we fall back to pHash-only (decode-
// failure parity behavior; these rows are rare).
let mut uf = UnionFind::new(candidates.len()); let mut uf = UnionFind::new(candidates.len());
for (idx, row) in candidates.iter().enumerate() { for (idx, row) in candidates.iter().enumerate() {
let Some(p) = row.phash_64 else { continue }; let Some(p) = row.phash_64 else { continue };
@@ -430,7 +460,23 @@ fn cluster_perceptual(rows: Vec<DuplicateRow>, threshold: u32) -> Vec<DuplicateG
idx, idx,
}; };
for (_, neighbour) in tree.find(&key, threshold) { for (_, neighbour) in tree.find(&key, threshold) {
if neighbour.idx != idx { if neighbour.idx == idx {
continue;
}
let other = &candidates[neighbour.idx];
// dHash double-check.
let dhash_ok = match (row.dhash_64, other.dhash_64) {
(Some(a), Some(b)) => {
(a as u64 ^ b as u64).count_ones() <= threshold
&& is_informative_hash(a)
&& is_informative_hash(b)
}
// Missing dHash on either side: trust pHash alone
// rather than dropping the candidate, so partial
// backfills don't silently disappear.
_ => true,
};
if dhash_ok {
uf.union(idx, neighbour.idx); uf.union(idx, neighbour.idx);
} }
} }
@@ -524,7 +570,12 @@ impl UnionFind {
pub fn add_duplicate_services<T>(app: App<T>) -> App<T> pub fn add_duplicate_services<T>(app: App<T>) -> App<T>
where where
T: ServiceFactory<actix_web::dev::ServiceRequest, Config = (), Error = actix_web::Error, InitError = ()>, T: ServiceFactory<
actix_web::dev::ServiceRequest,
Config = (),
Error = actix_web::Error,
InitError = (),
>,
{ {
app.service(web::resource("/duplicates/exact").route(web::get().to(list_exact_handler))) app.service(web::resource("/duplicates/exact").route(web::get().to(list_exact_handler)))
.service( .service(
@@ -570,14 +621,61 @@ mod tests {
assert_eq!(groups[0].members.len(), 3); assert_eq!(groups[0].members.len(), 3);
} }
/// Hash fixtures for the clustering tests. Every value has popcount
/// inside the "informative" 8..=56 band, so none of them is discarded
/// by the entropy filter that keeps solid-colour images out of the
/// cluster graph.
const INFORMATIVE_BASE: i64 = 0x55AA_55AA_55AA_55AA; // popcount = 32
const INFORMATIVE_NEAR: i64 = INFORMATIVE_BASE ^ 1; // Hamming distance 1 from BASE
const INFORMATIVE_FAR: i64 = 0x6996_6996_6996_6996; // Hamming distance 32 from BASE
/// Build a test `DuplicateRow` carrying both perceptual hashes so the
/// dHash-agreement path in `cluster_perceptual` can be exercised.
/// Fields irrelevant to clustering get inert placeholder values.
fn row_with_dhash(
    library_id: i32,
    rel: &str,
    hash: &str,
    phash: Option<i64>,
    dhash: Option<i64>,
) -> DuplicateRow {
    DuplicateRow {
        library_id,
        rel_path: rel.to_string(),
        content_hash: hash.to_string(),
        size_bytes: Some(1000),
        width: None,
        height: None,
        date_taken: None,
        phash_64: phash,
        dhash_64: dhash,
        duplicate_of_hash: None,
        duplicate_decided_at: None,
    }
}
#[test] #[test]
fn cluster_perceptual_unites_close_hashes() { fn cluster_perceptual_unites_close_hashes() {
// Three rows: two near each other (phash differs by 1 bit), // Two rows near each other on both pHash and dHash; one far
// one far away. Threshold 4 should merge the close pair. // on pHash. Threshold 4 should merge the close pair.
let rows = vec![ let rows = vec![
row(1, "a.jpg", "h1", Some(0b0000)), row_with_dhash(
row(1, "b.jpg", "h2", Some(0b0001)), 1,
row(1, "c.jpg", "h3", Some(i64::MAX)), "a.jpg",
"h1",
Some(INFORMATIVE_BASE),
Some(INFORMATIVE_BASE),
),
row_with_dhash(
1,
"b.jpg",
"h2",
Some(INFORMATIVE_NEAR),
Some(INFORMATIVE_NEAR),
),
row_with_dhash(
1,
"c.jpg",
"h3",
Some(INFORMATIVE_FAR),
Some(INFORMATIVE_FAR),
),
]; ];
let groups = cluster_perceptual(rows, 4); let groups = cluster_perceptual(rows, 4);
assert_eq!(groups.len(), 1); assert_eq!(groups.len(), 1);
@@ -594,8 +692,20 @@ mod tests {
#[test] #[test]
fn cluster_perceptual_threshold_zero_drops_distinct() { fn cluster_perceptual_threshold_zero_drops_distinct() {
let rows = vec![ let rows = vec![
row(1, "a.jpg", "h1", Some(0b0000)), row_with_dhash(
row(1, "b.jpg", "h2", Some(0b0001)), 1,
"a.jpg",
"h1",
Some(INFORMATIVE_BASE),
Some(INFORMATIVE_BASE),
),
row_with_dhash(
1,
"b.jpg",
"h2",
Some(INFORMATIVE_NEAR),
Some(INFORMATIVE_NEAR),
),
]; ];
let groups = cluster_perceptual(rows, 0); let groups = cluster_perceptual(rows, 0);
assert!(groups.is_empty()); assert!(groups.is_empty());
@@ -603,17 +713,63 @@ mod tests {
#[test] #[test]
fn cluster_perceptual_skips_singletons() { fn cluster_perceptual_skips_singletons() {
let rows = vec![row(1, "alone.jpg", "h1", Some(0))]; let rows = vec![row(1, "alone.jpg", "h1", Some(INFORMATIVE_BASE))];
assert!(cluster_perceptual(rows, 8).is_empty()); assert!(cluster_perceptual(rows, 8).is_empty());
} }
#[test]
fn cluster_perceptual_filters_low_entropy_hashes() {
    // Popcount 0 (all-zero hash) and popcount 63 (i64::MAX) both sit
    // outside the informative 8..=56 band. Each pair below is at
    // Hamming distance zero from its twin, so without the entropy
    // filter they would cluster instantly — exactly the regression
    // that used to produce a giant first group of solid-colour images.
    let degenerate = vec![
        row(1, "blank-a.jpg", "h1", Some(0)),
        row(1, "blank-b.jpg", "h2", Some(0)),
        row(1, "white-a.jpg", "h3", Some(i64::MAX)),
        row(1, "white-b.jpg", "h4", Some(i64::MAX)),
    ];
    assert!(cluster_perceptual(degenerate, 8).is_empty());
}
#[test]
fn cluster_perceptual_requires_dhash_agreement() {
    // The BK-tree surfaces this pair as a candidate (pHashes one bit
    // apart), but their dHashes are 32 bits apart — the two-signal
    // validation must veto the edge. Without the dHash double-check
    // this would come back as a 2-member cluster.
    let close_phash_far_dhash = vec![
        row_with_dhash(
            1,
            "a.jpg",
            "h1",
            Some(INFORMATIVE_BASE),
            Some(INFORMATIVE_BASE),
        ),
        row_with_dhash(
            1,
            "b.jpg",
            "h2",
            Some(INFORMATIVE_NEAR),
            Some(INFORMATIVE_FAR),
        ),
    ];
    assert!(cluster_perceptual(close_phash_far_dhash, 4).is_empty());
}
/// Sanity-check the BK-tree's metric, which is what the duplicates /// Sanity-check the BK-tree's metric, which is what the duplicates
/// path actually clusters on. /// path actually clusters on.
#[test] #[test]
fn hamming_metric_is_symmetric() { fn hamming_metric_is_symmetric() {
let m = HammingMetric; let m = HammingMetric;
let a = HashKey { phash: 0b1010, idx: 0 }; let a = HashKey {
let b = HashKey { phash: 0b0101, idx: 1 }; phash: 0b1010,
idx: 0,
};
let b = HashKey {
phash: 0b0101,
idx: 1,
};
let d1 = m.distance(&a, &b); let d1 = m.distance(&a, &b);
let d2 = m.distance(&b, &a); let d2 = m.distance(&b, &a);
assert_eq!(d1, d2); assert_eq!(d1, d2);

View File

@@ -8,10 +8,9 @@ pub mod auth;
pub mod bin_progress; pub mod bin_progress;
pub mod cleanup; pub mod cleanup;
pub mod content_hash; pub mod content_hash;
pub mod perceptual_hash;
pub mod duplicates;
pub mod data; pub mod data;
pub mod database; pub mod database;
pub mod duplicates;
pub mod error; pub mod error;
pub mod exif; pub mod exif;
pub mod face_watch; pub mod face_watch;
@@ -25,6 +24,7 @@ pub mod library_maintenance;
pub mod memories; pub mod memories;
pub mod otel; pub mod otel;
pub mod parsers; pub mod parsers;
pub mod perceptual_hash;
pub mod service; pub mod service;
pub mod state; pub mod state;
pub mod tags; pub mod tags;

View File

@@ -62,10 +62,9 @@ use opentelemetry::{KeyValue, global};
mod ai; mod ai;
mod auth; mod auth;
mod content_hash; mod content_hash;
mod perceptual_hash;
mod duplicates;
mod data; mod data;
mod database; mod database;
mod duplicates;
mod error; mod error;
mod exif; mod exif;
mod face_watch; mod face_watch;
@@ -75,6 +74,7 @@ mod files;
mod geo; mod geo;
mod libraries; mod libraries;
mod library_maintenance; mod library_maintenance;
mod perceptual_hash;
mod state; mod state;
mod tags; mod tags;
mod utils; mod utils;
@@ -671,10 +671,9 @@ async fn upload_image(
.to_string(); .to_string();
{ {
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao"); let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
if let Ok(Some(existing)) = if let Ok(Some(existing)) = dao.find_by_content_hash(&span_context, &upload_hash)
dao.find_by_content_hash(&span_context, &upload_hash) && existing.duplicate_of_hash.is_none()
{ {
if existing.duplicate_of_hash.is_none() {
let library_name = libraries::load_all(&mut crate::database::connect()) let library_name = libraries::load_all(&mut crate::database::connect())
.into_iter() .into_iter()
.find(|l| l.id == existing.library_id) .find(|l| l.id == existing.library_id)
@@ -690,7 +689,6 @@ async fn upload_image(
})); }));
} }
} }
}
let context = let context =
opentelemetry::Context::new().with_remote_span_context(span.span_context().clone()); opentelemetry::Context::new().with_remote_span_context(span.span_context().clone());