duplicates: filter low-entropy hashes + dHash double-check, fix backfill loop
The perceptual cluster was producing one giant first group that contained hundreds of unrelated images. Two causes: - Solid-colour images (skies, black frames, monochrome scans) all hash to near-zero pHashes that Hamming-distance-zero to each other. - Single-link clustering on pHash alone is too permissive — a chain of weakly-similar images all collapses into one cluster. Fixed by skipping hashes outside the popcount [8, 56] band (uniform content) and requiring dHash agreement within threshold before unioning a candidate edge from the BK-tree. Two new tests pin both invariants. Separate backfill fix: decode-failed rows kept phash_64=NULL and got re-pulled by every batch, infinite-looping on a queue of unbreakable formats. Persist a 0/0 sentinel on decode failure so the row leaves the candidate set; the all-zero hash is excluded from clustering by the same entropy filter so it doesn't pollute results. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
36
src/main.rs
36
src/main.rs
@@ -62,10 +62,9 @@ use opentelemetry::{KeyValue, global};
|
||||
mod ai;
|
||||
mod auth;
|
||||
mod content_hash;
|
||||
mod perceptual_hash;
|
||||
mod duplicates;
|
||||
mod data;
|
||||
mod database;
|
||||
mod duplicates;
|
||||
mod error;
|
||||
mod exif;
|
||||
mod face_watch;
|
||||
@@ -75,6 +74,7 @@ mod files;
|
||||
mod geo;
|
||||
mod libraries;
|
||||
mod library_maintenance;
|
||||
mod perceptual_hash;
|
||||
mod state;
|
||||
mod tags;
|
||||
mod utils;
|
||||
@@ -671,24 +671,22 @@ async fn upload_image(
|
||||
.to_string();
|
||||
{
|
||||
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
|
||||
if let Ok(Some(existing)) =
|
||||
dao.find_by_content_hash(&span_context, &upload_hash)
|
||||
if let Ok(Some(existing)) = dao.find_by_content_hash(&span_context, &upload_hash)
|
||||
&& existing.duplicate_of_hash.is_none()
|
||||
{
|
||||
if existing.duplicate_of_hash.is_none() {
|
||||
let library_name = libraries::load_all(&mut crate::database::connect())
|
||||
.into_iter()
|
||||
.find(|l| l.id == existing.library_id)
|
||||
.map(|l| l.name);
|
||||
span.set_status(Status::Ok);
|
||||
return HttpResponse::Conflict().json(serde_json::json!({
|
||||
"duplicate_of": {
|
||||
"library_id": existing.library_id,
|
||||
"rel_path": existing.file_path,
|
||||
},
|
||||
"content_hash": upload_hash,
|
||||
"library_name": library_name,
|
||||
}));
|
||||
}
|
||||
let library_name = libraries::load_all(&mut crate::database::connect())
|
||||
.into_iter()
|
||||
.find(|l| l.id == existing.library_id)
|
||||
.map(|l| l.name);
|
||||
span.set_status(Status::Ok);
|
||||
return HttpResponse::Conflict().json(serde_json::json!({
|
||||
"duplicate_of": {
|
||||
"library_id": existing.library_id,
|
||||
"rel_path": existing.file_path,
|
||||
},
|
||||
"content_hash": upload_hash,
|
||||
"library_name": library_name,
|
||||
}));
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user