duplicates: perceptual hash + soft-mark resolution + upload 409

Adds pHash + dHash columns alongside the existing blake3 content_hash so
near-duplicates (re-encoded, resized, format-converted copies) become
queryable. /duplicates/{exact,perceptual} return groups; /duplicates/
{resolve,unresolve} flip a duplicate_of_hash soft-mark on losing rows
and union perceptual-only tag sets onto the survivor. The default
/photos listing filters duplicate_of_hash IS NULL so demoted siblings
stop cluttering the grid; include_duplicates=true opts back in for
Apollo's review modal. Upload now hashes bytes pre-write and returns
409 with the canonical sibling when a file's bytes already exist.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Cameron Cordes
2026-05-03 17:36:01 -04:00
parent 4340b164eb
commit 7584cd8792
14 changed files with 1852 additions and 1 deletion

View File

@@ -62,6 +62,8 @@ use opentelemetry::{KeyValue, global};
mod ai;
mod auth;
mod content_hash;
mod perceptual_hash;
mod duplicates;
mod data;
mod database;
mod error;
@@ -530,6 +532,11 @@ async fn set_image_gps(
.ok()
.map(|c| c.content_hash),
size_bytes: content_hash::compute(&full_path).ok().map(|c| c.size_bytes),
// GPS-update path doesn't touch perceptual hashes either; columns
// ignored by update_exif. Compute best-effort so a new file lands
// with a usable signal; failure just leaves prior values in place.
// NOTE(review): perceptual_hash::compute runs TWICE here — one full
// image decode per field. The upload path (let perceptual = …) and
// the indexer hoist it into a single binding; do the same here once
// the surrounding struct literal can be restructured. TODO.
// NOTE(review): "failure just leaves prior values in place" holds
// only if update_exif skips None columns — confirm against the DAO.
phash_64: perceptual_hash::compute(&full_path).map(|h| h.phash_64),
dhash_64: perceptual_hash::compute(&full_path).map(|h| h.dhash_64),
};
let updated = {
@@ -652,6 +659,39 @@ async fn upload_image(
&full_path.to_str().unwrap().to_string(),
true,
) {
// Pre-write content-hash check: if these exact bytes already exist
// anywhere in any library (and aren't themselves soft-marked as
// duplicates), don't write the file. Return 409 with the canonical
// sibling so the mobile app can show a friendly "already in your
// library" toast.
//
// blake3::hash is the one-shot form of Hasher::new().update().finalize().
let upload_hash = blake3::hash(&file_content).to_hex().to_string();
// Resolve the canonical sibling under the DAO lock, then drop the
// lock immediately: the library-name lookup below opens its own DB
// connection and scans all libraries, and holding the ExifDao mutex
// across that work would serialize unrelated upload/index requests.
let canonical = {
    let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
    match dao.find_by_content_hash(&span_context, &upload_hash) {
        // Only a non-demoted row counts as the canonical copy.
        Ok(Some(existing)) if existing.duplicate_of_hash.is_none() => Some(existing),
        // Lookup failure (or a hit that is itself a soft-marked
        // duplicate) means "no conflict": this check is best-effort
        // and must never block an upload.
        _ => None,
    }
};
if let Some(existing) = canonical {
    // Human-readable library name for the 409 payload; best-effort —
    // None when the library row can't be found.
    let library_name = libraries::load_all(&mut crate::database::connect())
        .into_iter()
        .find(|l| l.id == existing.library_id)
        .map(|l| l.name);
    span.set_status(Status::Ok);
    return HttpResponse::Conflict().json(serde_json::json!({
        "duplicate_of": {
            "library_id": existing.library_id,
            "rel_path": existing.file_path,
        },
        "content_hash": upload_hash,
        "library_name": library_name,
    }));
}
let context =
opentelemetry::Context::new().with_remote_span_context(span.span_context().clone());
tracer
@@ -710,6 +750,7 @@ async fn upload_image(
(None, None)
}
};
// Compute pHash + dHash once and reuse for both insert columns below
// (unlike the GPS-update path, which recomputes per field). Best-effort:
// presumably None for videos / decode failures — TODO confirm against
// perceptual_hash::compute; a None here is non-fatal for the upload.
let perceptual = perceptual_hash::compute(&uploaded_path);
let insert_exif = InsertImageExif {
library_id: target_library.id,
file_path: relative_path.clone(),
@@ -731,6 +772,8 @@ async fn upload_image(
last_modified: timestamp,
content_hash,
size_bytes,
phash_64: perceptual.map(|h| h.phash_64),
dhash_64: perceptual.map(|h| h.dhash_64),
};
if let Ok(mut dao) = exif_dao.lock() {
@@ -1661,6 +1704,7 @@ fn main() -> std::io::Result<()> {
.add_feature(add_tag_services::<_, SqliteTagDao>)
.add_feature(knowledge::add_knowledge_services::<_, SqliteKnowledgeDao>)
.add_feature(faces::add_face_services::<_, faces::SqliteFaceDao>)
.add_feature(duplicates::add_duplicate_services)
.app_data(app_data.clone())
.app_data::<Data<RealFileSystem>>(Data::new(RealFileSystem::new(
app_data.base_path.clone(),
@@ -2309,6 +2353,12 @@ fn process_new_files(
}
};
// Perceptual hashes (pHash + dHash). Best-effort — None for
// videos and decode failures. Drives near-duplicate detection
// in the Apollo duplicates surface; failure here is non-fatal
// and never blocks indexing.
let perceptual = perceptual_hash::compute(&file_path);
// EXIF is best-effort enrichment. When extraction fails (or the
// file type doesn't support EXIF) we still store a row with all
// EXIF fields NULL; the file remains visible to sort-by-date
@@ -2360,6 +2410,8 @@ fn process_new_files(
last_modified: timestamp,
content_hash,
size_bytes,
phash_64: perceptual.map(|h| h.phash_64),
dhash_64: perceptual.map(|h| h.dhash_64),
};
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");