Merge pull request 'feature/duplicate-detection' (#73) from feature/duplicate-detection into master

Reviewed-on: #73
This commit was merged in pull request #73.
This commit is contained in:
2026-05-03 22:34:49 +00:00
14 changed files with 2174 additions and 1 deletions

88
Cargo.lock generated
View File

@@ -600,6 +600,16 @@ version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6099cdc01846bc367c4e7dd630dc5966dccf36b652fae7a74e17b640411a91b2"
[[package]]
name = "bk-tree"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8283fb8e64b873918f8bc527efa6aff34956296e48ea750a9c909cd47c01546"
dependencies = [
"fnv",
"triple_accel",
]
[[package]]
name = "blake3"
version = "1.8.4"
@@ -1928,6 +1938,7 @@ dependencies = [
"async-trait",
"base64",
"bcrypt",
"bk-tree",
"blake3",
"bytes",
"chrono",
@@ -1939,6 +1950,7 @@ dependencies = [
"futures",
"ical",
"image",
"image_hasher",
"indicatif",
"infer",
"jsonwebtoken",
@@ -1978,6 +1990,19 @@ dependencies = [
"quick-error",
]
[[package]]
name = "image_hasher"
version = "3.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd266c66b0a0e2d4c6db8e710663fc163a2d33595ce997b6fbda407c8759d344"
dependencies = [
"base64",
"image",
"rustdct",
"serde",
"transpose",
]
[[package]]
name = "imgref"
version = "1.11.0"
@@ -2438,6 +2463,15 @@ dependencies = [
"num-traits",
]
[[package]]
name = "num-complex"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
dependencies = [
"num-traits",
]
[[package]]
name = "num-conv"
version = "0.1.0"
@@ -2907,6 +2941,15 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "primal-check"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc0d895b311e3af9902528fbb8f928688abbd95872819320517cc24ca6b2bd08"
dependencies = [
"num-integer",
]
[[package]]
name = "proc-macro2"
version = "1.0.101"
@@ -3286,6 +3329,29 @@ dependencies = [
"semver",
]
[[package]]
name = "rustdct"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b61555105d6a9bf98797c063c362a1d24ed8ab0431655e38f1cf51e52089551"
dependencies = [
"rustfft",
]
[[package]]
name = "rustfft"
version = "6.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21db5f9893e91f41798c88680037dba611ca6674703c1a18601b01a72c8adb89"
dependencies = [
"num-complex",
"num-integer",
"num-traits",
"primal-check",
"strength_reduce",
"transpose",
]
[[package]]
name = "rustix"
version = "1.0.8"
@@ -3624,6 +3690,12 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "strength_reduce"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82"
[[package]]
name = "strfmt"
version = "0.2.5"
@@ -4122,6 +4194,22 @@ dependencies = [
"once_cell",
]
[[package]]
name = "transpose"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ad61aed86bc3faea4300c7aee358b4c6d0c8d6ccc36524c96e4c92ccf26e77e"
dependencies = [
"num-integer",
"strength_reduce",
]
[[package]]
name = "triple_accel"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "622b09ce2fe2df4618636fb92176d205662f59803f39e70d1c333393082de96c"
[[package]]
name = "try-lock"
version = "0.2.5"

View File

@@ -59,5 +59,7 @@ ical = "0.11"
scraper = "0.20"
base64 = "0.22"
blake3 = "1.5"
image_hasher = "3.0"
bk-tree = "0.5"
async-trait = "0.1"
indicatif = "0.17"

View File

@@ -0,0 +1,8 @@
-- Down migration for duplicate detection: reverts the perceptual-hash
-- and soft-mark columns added by the paired up migration.
--
-- Indexes are dropped first — SQLite's ALTER TABLE DROP COLUMN refuses
-- to remove a column that is still referenced by an index.
DROP INDEX IF EXISTS idx_image_exif_duplicate_of_hash;
DROP INDEX IF EXISTS idx_image_exif_dhash;
DROP INDEX IF EXISTS idx_image_exif_phash;
-- Columns are removed in reverse order of creation. Any soft-mark
-- resolution state (duplicate_of_hash / duplicate_decided_at) is
-- irreversibly lost by this migration.
ALTER TABLE image_exif DROP COLUMN duplicate_decided_at;
ALTER TABLE image_exif DROP COLUMN duplicate_of_hash;
ALTER TABLE image_exif DROP COLUMN dhash_64;
ALTER TABLE image_exif DROP COLUMN phash_64;

View File

@@ -0,0 +1,41 @@
-- Adds perceptual-hash signals + soft-mark resolution state to image_exif so
-- the duplicates surface in Apollo can group near-duplicates (re-encoded,
-- resized, format-converted copies) and let the user demote losers without
-- touching the file on disk. Image-only for v1: phash_64/dhash_64 are NULL
-- on videos and on images that fail to decode. See Apollo CLAUDE.md →
-- Duplicate detection / Caching layer for the policy.
--
-- Soft-mark columns are media-type-agnostic — when video perceptual hashing
-- arrives, it lives in a separate hash-keyed companion table and reuses the
-- same duplicate_of_hash / duplicate_decided_at machinery.
-- pHash (DCT, 64-bit) packed as i64 for fast XOR + popcount Hamming.
ALTER TABLE image_exif ADD COLUMN phash_64 BIGINT;
-- dHash (gradient, 64-bit). Cheap, robust to compression/resize. Stored
-- alongside pHash so the query layer can fall back if either is null.
ALTER TABLE image_exif ADD COLUMN dhash_64 BIGINT;
-- When non-null, this row is a soft-marked duplicate of the row whose
-- content_hash matches. The duplicate file stays on disk; the default
-- /photos listing filters it out. /photos?include_duplicates=true opts
-- back in (the Apollo duplicates modal uses this).
ALTER TABLE image_exif ADD COLUMN duplicate_of_hash TEXT;
-- Unix seconds of the resolve. Distinguishes "never reviewed" from
-- "reviewed and resolved" for the Apollo include_resolved toggle.
ALTER TABLE image_exif ADD COLUMN duplicate_decided_at BIGINT;
-- Partial indexes — the columns are NULL for the vast majority of rows
-- during the transitional window and forever for videos / decode failures.
-- Each index stays proportional to the hashed subset rather than the whole
-- table. NOTE: SQLite only uses a partial index for queries whose WHERE
-- clause implies the index predicate, so reads must repeat IS NOT NULL.
CREATE INDEX idx_image_exif_phash
ON image_exif (phash_64)
WHERE phash_64 IS NOT NULL;
CREATE INDEX idx_image_exif_dhash
ON image_exif (dhash_64)
WHERE dhash_64 IS NOT NULL;
CREATE INDEX idx_image_exif_duplicate_of_hash
ON image_exif (duplicate_of_hash)
WHERE duplicate_of_hash IS NOT NULL;

View File

@@ -0,0 +1,243 @@
//! Backfill `image_exif.phash_64` + `dhash_64` for image rows that
//! were ingested before perceptual hashing was wired into the watcher.
//!
//! The watcher computes perceptual hashes for new images as they're
//! ingested, so this binary is a one-shot for the historical backlog.
//! Idempotent — only rows with a non-null content_hash and a null
//! phash are processed, so re-runs are safe and pick up where they
//! left off (e.g. after a crash or interrupt).
//!
//! Image-only by design: `get_rows_missing_perceptual_hash` filters by
//! file extension at the DB layer so videos and other non-decodable
//! media are skipped without round-tripping `image_hasher`. Files that
//! can't be opened (missing on disk, permission errors) are quietly
//! left as null and counted as "missing"; on next run, if the file is
//! restored, the row will surface again.
use std::path::Path;
use std::sync::{Arc, Mutex};
use std::time::Instant;
use clap::Parser;
use log::{error, warn};
use rayon::prelude::*;
use image_api::bin_progress;
use image_api::database::{ExifDao, SqliteExifDao, connect};
use image_api::libraries::{self, Library};
use image_api::perceptual_hash;
// CLI surface for the one-shot backfill binary. NOTE: clap derives the
// `--help` text from the `///` field comments, so they are left exactly
// as-is; commentary that must not leak into --help lives in `//` lines.
#[derive(Parser, Debug)]
#[command(name = "backfill_perceptual_hash")]
#[command(about = "Compute pHash + dHash for image_exif rows missing one")]
struct Args {
    /// Max rows to hash per batch. The process loops until no rows remain.
    #[arg(long, default_value_t = 256)]
    batch_size: i64,
    /// Rayon parallelism override. 0 uses the default thread pool size.
    #[arg(long, default_value_t = 0)]
    parallelism: usize,
    /// Dry-run: log what would be hashed without writing to the DB.
    #[arg(long)]
    dry_run: bool,
}
fn main() -> anyhow::Result<()> {
env_logger::init();
dotenv::dotenv().ok();
let args = Args::parse();
if args.parallelism > 0 {
rayon::ThreadPoolBuilder::new()
.num_threads(args.parallelism)
.build_global()
.expect("Unable to configure rayon thread pool");
}
let base_path = dotenv::var("BASE_PATH").ok();
let mut seed_conn = connect();
if let Some(base) = base_path.as_deref() {
libraries::seed_or_patch_from_env(&mut seed_conn, base);
}
let libs = libraries::load_all(&mut seed_conn);
drop(seed_conn);
if libs.is_empty() {
anyhow::bail!("No libraries configured; cannot backfill perceptual hashes");
}
let libs_by_id: std::collections::HashMap<i32, Library> =
libs.into_iter().map(|lib| (lib.id, lib)).collect();
println!(
"Configured libraries: {}",
libs_by_id
.values()
.map(|l| format!("{} -> {}", l.name, l.root_path))
.collect::<Vec<_>>()
.join(", ")
);
let dao: Arc<Mutex<Box<dyn ExifDao>>> = Arc::new(Mutex::new(Box::new(SqliteExifDao::new())));
let ctx = opentelemetry::Context::new();
let mut total_hashed = 0u64;
let mut total_missing = 0u64;
let mut total_decode_failures = 0u64;
let mut total_errors = 0u64;
let start = Instant::now();
let pb = bin_progress::spinner("perceptual-hashing");
loop {
let rows = {
let mut guard = dao.lock().expect("Unable to lock ExifDao");
guard
.get_rows_missing_perceptual_hash(&ctx, args.batch_size)
.map_err(|e| anyhow::anyhow!("DB error: {:?}", e))?
};
if rows.is_empty() {
break;
}
let batch_size = rows.len();
pb.set_message(format!(
"batch of {} (hashed={} decode_fail={} missing={} errors={})",
batch_size, total_hashed, total_decode_failures, total_missing, total_errors
));
// Compute perceptual hashes in parallel — CPU-bound, decoder
// releases the GIL-equivalent. rayon's default thread pool
// matches the host's logical-core count which is the right
// ceiling for image_hasher's DCT pass.
let results: Vec<(i32, String, FilePerceptualResult)> = rows
.into_par_iter()
.map(|(library_id, rel_path)| {
let abs = libs_by_id
.get(&library_id)
.map(|lib| Path::new(&lib.root_path).join(&rel_path));
match abs {
Some(abs_path) if abs_path.exists() => {
match perceptual_hash::compute(&abs_path) {
Some(id) => (library_id, rel_path, FilePerceptualResult::Ok(id)),
None => (library_id, rel_path, FilePerceptualResult::DecodeFailed),
}
}
Some(_) => (library_id, rel_path, FilePerceptualResult::MissingOnDisk),
None => {
warn!("Row refers to unknown library_id {}", library_id);
(library_id, rel_path, FilePerceptualResult::MissingOnDisk)
}
}
})
.collect();
// Persist sequentially — SQLite writes serialize anyway.
if !args.dry_run {
let mut guard = dao.lock().expect("Unable to lock ExifDao");
for (library_id, rel_path, result) in &results {
match result {
FilePerceptualResult::Ok(id) => {
match guard.backfill_perceptual_hash(
&ctx,
*library_id,
rel_path,
Some(id.phash_64),
Some(id.dhash_64),
) {
Ok(_) => {
total_hashed += 1;
pb.inc(1);
}
Err(e) => {
pb.println(format!("persist error for {}: {:?}", rel_path, e));
total_errors += 1;
}
}
}
FilePerceptualResult::DecodeFailed => {
// Persist phash_64=0/dhash_64=0 as a "tried,
// unhashable" sentinel so this row leaves the
// `phash_64 IS NULL` candidate set and the
// backfill doesn't infinite-loop on a queue of
// unbreakable formats (HEIC, RAW, CMYK JPEGs,
// truncated bytes). The all-zero hash is
// explicitly excluded from clustering by
// is_informative_hash in duplicates.rs, so it
// won't pollute group output — it just becomes
// invisible to the duplicate finder.
log::debug!(
"perceptual decode failed for {} (lib {}); marking unhashable",
rel_path,
library_id
);
match guard.backfill_perceptual_hash(
&ctx,
*library_id,
rel_path,
Some(0),
Some(0),
) {
Ok(_) => {
total_decode_failures += 1;
}
Err(e) => {
pb.println(format!(
"persist error (decode-fail sentinel) for {}: {:?}",
rel_path, e
));
total_errors += 1;
}
}
}
FilePerceptualResult::MissingOnDisk => {
total_missing += 1;
}
}
}
} else {
for (_, rel_path, result) in &results {
match result {
FilePerceptualResult::Ok(id) => {
pb.println(format!(
"[dry-run] {} -> phash={:016x} dhash={:016x}",
rel_path, id.phash_64, id.dhash_64
));
total_hashed += 1;
pb.inc(1);
}
FilePerceptualResult::DecodeFailed => {
total_decode_failures += 1;
}
FilePerceptualResult::MissingOnDisk => {
total_missing += 1;
}
}
}
pb.println(format!(
"[dry-run] processed one batch of {}. Stopping — a real run would continue \
until no NULL phash_64 image rows remain.",
results.len()
));
break;
}
}
pb.finish_and_clear();
println!(
"Done. hashed={}, decode_failed={}, skipped (missing on disk)={}, errors={}, elapsed={:.1}s",
total_hashed,
total_decode_failures,
total_missing,
total_errors,
start.elapsed().as_secs_f64()
);
if total_errors > 0 {
error!("Backfill completed with {} persist errors", total_errors);
}
Ok(())
}
/// Per-file outcome of one perceptual-hash attempt. Produced by the
/// parallel compute pass in `main` and consumed by the sequential
/// persist pass.
enum FilePerceptualResult {
    /// Hashing succeeded; carries both computed hashes.
    Ok(perceptual_hash::PerceptualIdentity),
    /// The file exists on disk but `perceptual_hash::compute` returned
    /// `None` (undecodable format or corrupt bytes).
    DecodeFailed,
    /// The resolved absolute path does not exist, or the row referenced
    /// a library_id with no configured library (both mapped here).
    MissingOnDisk,
}

View File

@@ -165,6 +165,15 @@ pub struct FilesRequest {
/// Optional library filter. Accepts a library id (e.g. "1") or name
/// (e.g. "main"). When omitted, results span all libraries.
pub library: Option<String>,
/// When true, include rows soft-marked as duplicates of another file
/// (i.e. `image_exif.duplicate_of_hash IS NOT NULL`). Default false —
/// the standard /photos listing hides demoted siblings so the grid
/// silently shrinks after a resolve. The Apollo duplicates modal
/// passes `true` so it can show both survivors and demoted members
/// inside a group.
#[serde(default)]
pub include_duplicates: Option<bool>,
}
#[derive(Copy, Clone, Deserialize, PartialEq, Debug)]

View File

@@ -9,6 +9,25 @@ use crate::database::models::{
};
use crate::otel::trace_db_call;
/// Wire shape for a single member of a duplicate group, returned by
/// `list_duplicates_*` and `lookup_duplicate_row`. Carries everything
/// the Apollo modal needs to render a member tile and its meta line —
/// thumbnails are derived from `(library_id, rel_path)` upstream.
/// Wire shape for a single member of a duplicate group, returned by
/// `list_duplicates_*` and `lookup_duplicate_row`. Carries everything
/// the Apollo modal needs to render a member tile and its meta line —
/// thumbnails are derived from `(library_id, rel_path)` upstream.
#[derive(Debug, Clone, serde::Serialize)]
pub struct DuplicateRow {
    /// Library this file belongs to.
    pub library_id: i32,
    /// Path relative to the library root.
    pub rel_path: String,
    /// Content hash; members of an exact-duplicate group share this value.
    pub content_hash: String,
    /// File size in bytes, when recorded.
    pub size_bytes: Option<i64>,
    /// Capture timestamp when available (assumed Unix seconds, matching
    /// duplicate_decided_at — TODO confirm against the ingest path).
    pub date_taken: Option<i64>,
    /// Pixel dimensions, when known.
    pub width: Option<i32>,
    pub height: Option<i32>,
    /// 64-bit pHash packed as i64; NULL when the row was never hashed.
    pub phash_64: Option<i64>,
    /// 64-bit dHash packed as i64; NULL when the row was never hashed.
    pub dhash_64: Option<i64>,
    /// When non-null, this row is soft-marked as a duplicate of the row
    /// whose `content_hash` equals this value.
    pub duplicate_of_hash: Option<String>,
    /// Unix seconds at which the soft-mark was written.
    pub duplicate_decided_at: Option<i64>,
}
pub mod calendar_dao;
pub mod daily_summary_dao;
pub mod insights_dao;
@@ -377,6 +396,104 @@ pub trait ExifDao: Sync + Send {
size_bytes: i64,
) -> Result<(), DbError>;
/// Return image rows that have a `content_hash` but no `phash_64`,
/// oldest first. Used by the `backfill_perceptual_hash` binary.
/// Filters by image extension at the DB layer to avoid ever asking
/// `image_hasher` to decode a video. Returns `(library_id, rel_path)`.
fn get_rows_missing_perceptual_hash(
&mut self,
context: &opentelemetry::Context,
limit: i64,
) -> Result<Vec<(i32, String)>, DbError>;
/// Persist computed perceptual hashes (pHash + dHash) for an
/// existing image_exif row. Either column may be left NULL by
/// passing `None`, but in practice the binary computes both or
/// neither — `image_hasher` either decodes the image and produces
/// both signals, or fails entirely.
fn backfill_perceptual_hash(
&mut self,
context: &opentelemetry::Context,
library_id: i32,
rel_path: &str,
phash_64: Option<i64>,
dhash_64: Option<i64>,
) -> Result<(), DbError>;
/// Group exact-hash duplicates: rows whose `content_hash` appears
/// more than once across the (optionally library-scoped) corpus.
/// Returns one [`DuplicateRow`] per member; callers group by
/// `content_hash`. When `include_resolved=false`, rows already
/// soft-marked (`duplicate_of_hash IS NOT NULL`) are excluded so
/// the modal doesn't re-surface decisions the user already made.
fn list_duplicates_exact(
&mut self,
context: &opentelemetry::Context,
library_id: Option<i32>,
include_resolved: bool,
) -> Result<Vec<DuplicateRow>, DbError>;
/// Return all rows with a non-null `phash_64` (optionally library-
/// scoped), used by the perceptual-cluster routine in
/// [`crate::main`] to single-link cluster via Hamming distance.
/// Each returned row is a *distinct content_hash* — exact duplicates
/// are collapsed at the DB layer so the in-memory clusterer doesn't
/// rediscover them.
fn list_perceptual_candidates(
&mut self,
context: &opentelemetry::Context,
library_id: Option<i32>,
include_resolved: bool,
) -> Result<Vec<DuplicateRow>, DbError>;
/// Look up a single row's metadata by `(library_id, rel_path)`. Used
/// by the resolve endpoint to map the request payload to the
/// underlying `content_hash` before writing the soft-mark. Returns
/// `Ok(None)` if the file doesn't exist in `image_exif`.
fn lookup_duplicate_row(
&mut self,
context: &opentelemetry::Context,
library_id: i32,
rel_path: &str,
) -> Result<Option<DuplicateRow>, DbError>;
/// Soft-mark a file as a duplicate of `survivor_hash`. Sets
/// `duplicate_of_hash` and `duplicate_decided_at` on the row(s)
/// matching `(library_id, rel_path)`. The file stays on disk; the
/// default `/photos` listing hides it because of the
/// `duplicate_of_hash IS NULL` filter.
fn set_duplicate_of(
&mut self,
context: &opentelemetry::Context,
library_id: i32,
rel_path: &str,
survivor_hash: &str,
decided_at: i64,
) -> Result<(), DbError>;
/// Reverse a soft-mark: clears `duplicate_of_hash` and
/// `duplicate_decided_at`. Used by the modal's UNRESOLVE chip.
fn clear_duplicate_of(
&mut self,
context: &opentelemetry::Context,
library_id: i32,
rel_path: &str,
) -> Result<(), DbError>;
/// Union the tags from `demoted_hash` onto `survivor_hash`. Used at
/// resolve time for *perceptual* duplicates (different content_hashes,
/// independent tag sets) so the user doesn't lose their tagging work
/// when promoting a survivor. Idempotent: a tag already on the survivor
/// is left alone. Exact duplicates (same content_hash) don't need this
/// because their tag rows are already shared.
fn union_perceptual_tags(
&mut self,
context: &opentelemetry::Context,
survivor_hash: &str,
demoted_hash: &str,
survivor_rel_path: &str,
) -> Result<(), DbError>;
/// Return the first EXIF row with the given content hash (any library).
/// Used by thumbnail/HLS generation to detect pre-existing derivatives
/// from another library before regenerating.
@@ -440,11 +557,17 @@ pub trait ExifDao: Sync + Send {
/// `library_ids` is empty, rows from every library are returned. Used by
/// `/photos` recursive listing to skip the filesystem walk — the watcher
/// keeps image_exif in parity with disk via the reconciliation pass.
///
/// `include_duplicates=false` filters out rows soft-marked with
/// `duplicate_of_hash IS NOT NULL` so the default photo listing hides
/// demoted siblings; the Apollo duplicates modal passes `true` to
/// see both survivors and demoted members inside a group.
fn list_rel_paths_for_libraries(
&mut self,
context: &opentelemetry::Context,
library_ids: &[i32],
path_prefix: Option<&str>,
include_duplicates: bool,
) -> Result<Vec<(i32, String)>, DbError>;
/// Delete a single image_exif row scoped to `(library_id, rel_path)`.
@@ -1077,6 +1200,7 @@ impl ExifDao for SqliteExifDao {
context: &opentelemetry::Context,
library_ids: &[i32],
path_prefix: Option<&str>,
include_duplicates: bool,
) -> Result<Vec<(i32, String)>, DbError> {
trace_db_call(context, "query", "list_rel_paths_for_libraries", |_span| {
use schema::image_exif::dsl::*;
@@ -1097,6 +1221,41 @@ impl ExifDao for SqliteExifDao {
query = query.filter(rel_path.like(pattern).escape('\\'));
}
if !include_duplicates {
if library_ids.is_empty() {
// Unscoped (all-libraries) view — every survivor is
// reachable somewhere, so a soft-marked row is
// genuinely a duplicate from the user's perspective.
// Hide it.
query = query.filter(duplicate_of_hash.is_null());
} else {
// Scoped to specific libraries: only hide a
// soft-marked row when the survivor is reachable
// *in this view*. If the survivor lives in a
// library the user can't see right now, the
// demoted file is the only copy of those bytes
// they have access to — keep it visible.
//
// Implemented as a correlated NOT EXISTS subquery
// over an aliased image_exif. Library ids are i32
// so format!-inlining the integer list is safe.
use diesel::sql_types::Bool;
let lib_list = library_ids
.iter()
.map(i32::to_string)
.collect::<Vec<_>>()
.join(",");
let raw = format!(
"(image_exif.duplicate_of_hash IS NULL OR NOT EXISTS \
(SELECT 1 FROM image_exif AS survivor \
WHERE survivor.content_hash = image_exif.duplicate_of_hash \
AND survivor.library_id IN ({})))",
lib_list
);
query = query.filter(diesel::dsl::sql::<Bool>(&raw));
}
}
query
.load::<(i32, String)>(connection.deref_mut())
.map_err(|_| anyhow::anyhow!("Query error"))
@@ -1168,6 +1327,421 @@ impl ExifDao for SqliteExifDao {
)
.map_err(|_| DbError::new(DbErrorKind::QueryError))
}
    /// Fetch up to `limit` image rows that have a content_hash but no
    /// phash_64, as `(library_id, rel_path)` pairs. Drives the backfill
    /// binary's batch loop.
    fn get_rows_missing_perceptual_hash(
        &mut self,
        context: &opentelemetry::Context,
        limit: i64,
    ) -> Result<Vec<(i32, String)>, DbError> {
        trace_db_call(
            context,
            "query",
            "get_rows_missing_perceptual_hash",
            |_span| {
                use schema::image_exif::dsl::*;
                let mut connection = self.connection.lock().expect("Unable to get ExifDao");
                // Image-only filter via extension. Videos and decode-failures
                // would always come back NULL otherwise and the binary would
                // grind through them on every run. The list mirrors the file
                // formats `image` 0.25 / `image_hasher` 3.x can decode.
                //
                // NOTE(review): SQLite's LIKE is case-insensitive for ASCII
                // by default, which makes the uppercase variants redundant —
                // they are harmless belt-and-braces and keep the list correct
                // if PRAGMA case_sensitive_like is ever enabled. Mixed-case
                // extensions (".Jpg") rely on the default-insensitive
                // behavior only — confirm the pragma is not set.
                image_exif
                    .filter(content_hash.is_not_null())
                    .filter(phash_64.is_null())
                    .filter(
                        rel_path
                            .like("%.jpg")
                            .or(rel_path.like("%.jpeg"))
                            .or(rel_path.like("%.JPG"))
                            .or(rel_path.like("%.JPEG"))
                            .or(rel_path.like("%.png"))
                            .or(rel_path.like("%.PNG"))
                            .or(rel_path.like("%.webp"))
                            .or(rel_path.like("%.WEBP"))
                            .or(rel_path.like("%.tif"))
                            .or(rel_path.like("%.tiff"))
                            .or(rel_path.like("%.TIF"))
                            .or(rel_path.like("%.TIFF"))
                            .or(rel_path.like("%.avif"))
                            .or(rel_path.like("%.AVIF")),
                    )
                    .select((library_id, rel_path))
                    // Stable oldest-first order makes batches deterministic
                    // and lets an interrupted run resume where it stopped.
                    .order(id.asc())
                    .limit(limit)
                    .load::<(i32, String)>(connection.deref_mut())
                    .map_err(|_| anyhow::anyhow!("Query error"))
            },
        )
        .map_err(|_| DbError::new(DbErrorKind::QueryError))
    }
fn backfill_perceptual_hash(
    &mut self,
    context: &opentelemetry::Context,
    library_id_val: i32,
    rel_path_val: &str,
    phash_val: Option<i64>,
    dhash_val: Option<i64>,
) -> Result<(), DbError> {
    trace_db_call(context, "update", "backfill_perceptual_hash", |_span| {
        use schema::image_exif::dsl::*;
        // Write both hash columns on the (library, path)-scoped row in a
        // single UPDATE; passing None for either argument stores NULL.
        let mut conn = self.connection.lock().expect("Unable to get ExifDao");
        let target_row = image_exif
            .filter(library_id.eq(library_id_val))
            .filter(rel_path.eq(rel_path_val));
        let assignments = (phash_64.eq(phash_val), dhash_64.eq(dhash_val));
        diesel::update(target_row)
            .set(assignments)
            .execute(conn.deref_mut())
            .map(|_| ())
            .map_err(|_| anyhow::anyhow!("Update error"))
    })
    .map_err(|_| DbError::new(DbErrorKind::UpdateError))
}
fn list_duplicates_exact(
    &mut self,
    context: &opentelemetry::Context,
    library_id_filter: Option<i32>,
    include_resolved: bool,
) -> Result<Vec<DuplicateRow>, DbError> {
    trace_db_call(context, "query", "list_duplicates_exact", |_span| {
        use schema::image_exif::dsl::*;
        // Flat row tuple in select-list order; converted to the wire
        // struct below so callers never see the raw shape.
        type Raw = (
            i32,
            String,
            String,
            Option<i64>,
            Option<i64>,
            Option<i32>,
            Option<i32>,
            Option<i64>,
            Option<i64>,
            Option<String>,
            Option<i64>,
        );
        fn as_member(r: Raw) -> DuplicateRow {
            DuplicateRow {
                library_id: r.0,
                rel_path: r.1,
                content_hash: r.2,
                size_bytes: r.3,
                date_taken: r.4,
                width: r.5,
                height: r.6,
                phash_64: r.7,
                dhash_64: r.8,
                duplicate_of_hash: r.9,
                duplicate_decided_at: r.10,
            }
        }
        let mut conn = self.connection.lock().expect("Unable to get ExifDao");
        // Phase 1: content_hashes appearing on more than one row,
        // optionally scoped to a single library.
        let mut hash_query = image_exif
            .filter(content_hash.is_not_null())
            .group_by(content_hash)
            .select(content_hash.assume_not_null())
            .having(diesel::dsl::count_star().gt(1))
            .into_boxed();
        if let Some(lib) = library_id_filter {
            hash_query = hash_query.filter(library_id.eq(lib));
        }
        let dup_hashes: Vec<String> = hash_query
            .load::<String>(conn.deref_mut())
            .map_err(|_| anyhow::anyhow!("Query error"))?;
        if dup_hashes.is_empty() {
            return Ok(Vec::new());
        }
        // Phase 2: load every member row for those hashes, ordered by
        // hash + library + path so the caller can stream-group without
        // buffering the whole dataset.
        let mut member_query = image_exif
            .filter(content_hash.eq_any(&dup_hashes))
            .select((
                library_id,
                rel_path,
                content_hash.assume_not_null(),
                size_bytes,
                date_taken,
                width,
                height,
                phash_64,
                dhash_64,
                duplicate_of_hash,
                duplicate_decided_at,
            ))
            .order((content_hash.asc(), library_id.asc(), rel_path.asc()))
            .into_boxed();
        if let Some(lib) = library_id_filter {
            member_query = member_query.filter(library_id.eq(lib));
        }
        if !include_resolved {
            // Already soft-marked rows are excluded so the modal doesn't
            // re-surface decisions the user has already made.
            member_query = member_query.filter(duplicate_of_hash.is_null());
        }
        let raw_rows: Vec<Raw> = member_query
            .load(conn.deref_mut())
            .map_err(|_| anyhow::anyhow!("Query error"))?;
        Ok(raw_rows.into_iter().map(as_member).collect())
    })
    .map_err(|_| DbError::new(DbErrorKind::QueryError))
}
fn list_perceptual_candidates(
    &mut self,
    context: &opentelemetry::Context,
    library_id_filter: Option<i32>,
    include_resolved: bool,
) -> Result<Vec<DuplicateRow>, DbError> {
    trace_db_call(context, "query", "list_perceptual_candidates", |_span| {
        use schema::image_exif::dsl::*;
        // Flat SQL tuple in select-list order.
        type Raw = (
            i32,
            String,
            String,
            Option<i64>,
            Option<i64>,
            Option<i32>,
            Option<i32>,
            Option<i64>,
            Option<i64>,
            Option<String>,
            Option<i64>,
        );
        fn as_candidate(r: Raw) -> DuplicateRow {
            DuplicateRow {
                library_id: r.0,
                rel_path: r.1,
                content_hash: r.2,
                size_bytes: r.3,
                date_taken: r.4,
                width: r.5,
                height: r.6,
                phash_64: r.7,
                dhash_64: r.8,
                duplicate_of_hash: r.9,
                duplicate_decided_at: r.10,
            }
        }
        let mut conn = self.connection.lock().expect("Unable to get ExifDao");
        // One canonical row per distinct content_hash: exact duplicates
        // are clustered by the exact-dup query and would only add
        // zero-distance edges to the perceptual graph. Diesel has no
        // clean DISTINCT ON, so every hashed row is loaded and deduped
        // client-side; the set is small (phash-bearing rows only) and
        // the cost is negligible next to the BK-tree clustering after.
        let mut candidates = image_exif
            .filter(content_hash.is_not_null())
            .filter(phash_64.is_not_null())
            .select((
                library_id,
                rel_path,
                content_hash.assume_not_null(),
                size_bytes,
                date_taken,
                width,
                height,
                phash_64,
                dhash_64,
                duplicate_of_hash,
                duplicate_decided_at,
            ))
            .order((content_hash.asc(), library_id.asc(), rel_path.asc()))
            .into_boxed();
        if let Some(lib) = library_id_filter {
            candidates = candidates.filter(library_id.eq(lib));
        }
        if !include_resolved {
            candidates = candidates.filter(duplicate_of_hash.is_null());
        }
        let raw_rows: Vec<Raw> = candidates
            .load(conn.deref_mut())
            .map_err(|_| anyhow::anyhow!("Query error"))?;
        // First occurrence per hash wins — deterministic thanks to the
        // SQL ORDER BY (lowest library_id, then smallest rel_path).
        // `HashSet::insert` returns false on repeats, dropping them.
        let mut seen_hashes = std::collections::HashSet::new();
        Ok(raw_rows
            .into_iter()
            .filter(|r| seen_hashes.insert(r.2.clone()))
            .map(as_candidate)
            .collect())
    })
    .map_err(|_| DbError::new(DbErrorKind::QueryError))
}
fn lookup_duplicate_row(
    &mut self,
    context: &opentelemetry::Context,
    library_id_val: i32,
    rel_path_val: &str,
) -> Result<Option<DuplicateRow>, DbError> {
    trace_db_call(context, "query", "lookup_duplicate_row", |_span| {
        use schema::image_exif::dsl::*;
        // Flat SQL tuple in select-list order.
        type Raw = (
            i32,
            String,
            String,
            Option<i64>,
            Option<i64>,
            Option<i32>,
            Option<i32>,
            Option<i64>,
            Option<i64>,
            Option<String>,
            Option<i64>,
        );
        fn as_duplicate_row(r: Raw) -> DuplicateRow {
            DuplicateRow {
                library_id: r.0,
                rel_path: r.1,
                content_hash: r.2,
                size_bytes: r.3,
                date_taken: r.4,
                width: r.5,
                height: r.6,
                phash_64: r.7,
                dhash_64: r.8,
                duplicate_of_hash: r.9,
                duplicate_decided_at: r.10,
            }
        }
        let mut conn = self.connection.lock().expect("Unable to get ExifDao");
        // Rows without a content_hash cannot participate in the resolve
        // flow, so they are reported as absent (`Ok(None)`).
        image_exif
            .filter(library_id.eq(library_id_val))
            .filter(rel_path.eq(rel_path_val))
            .filter(content_hash.is_not_null())
            .select((
                library_id,
                rel_path,
                content_hash.assume_not_null(),
                size_bytes,
                date_taken,
                width,
                height,
                phash_64,
                dhash_64,
                duplicate_of_hash,
                duplicate_decided_at,
            ))
            .first::<Raw>(conn.deref_mut())
            .optional()
            .map(|hit| hit.map(as_duplicate_row))
            .map_err(|_| anyhow::anyhow!("Query error"))
    })
    .map_err(|_| DbError::new(DbErrorKind::QueryError))
}
fn set_duplicate_of(
    &mut self,
    context: &opentelemetry::Context,
    library_id_val: i32,
    rel_path_val: &str,
    survivor_hash: &str,
    decided_at: i64,
) -> Result<(), DbError> {
    trace_db_call(context, "update", "set_duplicate_of", |_span| {
        use schema::image_exif::dsl::*;
        // Write the soft-mark pair (survivor pointer + resolve timestamp)
        // on the (library, path)-scoped row in one UPDATE statement.
        let mut conn = self.connection.lock().expect("Unable to get ExifDao");
        let target_row = image_exif
            .filter(library_id.eq(library_id_val))
            .filter(rel_path.eq(rel_path_val));
        let soft_mark = (
            duplicate_of_hash.eq(survivor_hash),
            duplicate_decided_at.eq(decided_at),
        );
        diesel::update(target_row)
            .set(soft_mark)
            .execute(conn.deref_mut())
            .map(|_| ())
            .map_err(|_| anyhow::anyhow!("Update error"))
    })
    .map_err(|_| DbError::new(DbErrorKind::UpdateError))
}
fn clear_duplicate_of(
    &mut self,
    context: &opentelemetry::Context,
    library_id_val: i32,
    rel_path_val: &str,
) -> Result<(), DbError> {
    trace_db_call(context, "update", "clear_duplicate_of", |_span| {
        use schema::image_exif::dsl::*;
        // Null out both soft-mark columns together so the row returns to
        // the "never reviewed" state.
        let mut conn = self.connection.lock().expect("Unable to get ExifDao");
        let target_row = image_exif
            .filter(library_id.eq(library_id_val))
            .filter(rel_path.eq(rel_path_val));
        let cleared = (
            duplicate_of_hash.eq::<Option<String>>(None),
            duplicate_decided_at.eq::<Option<i64>>(None),
        );
        diesel::update(target_row)
            .set(cleared)
            .execute(conn.deref_mut())
            .map(|_| ())
            .map_err(|_| anyhow::anyhow!("Update error"))
    })
    .map_err(|_| DbError::new(DbErrorKind::UpdateError))
}
fn union_perceptual_tags(
    &mut self,
    context: &opentelemetry::Context,
    survivor_hash: &str,
    demoted_hash: &str,
    survivor_rel_path: &str,
) -> Result<(), DbError> {
    trace_db_call(context, "update", "union_perceptual_tags", |_span| {
        // INSERT OR IGNORE handles two relevant uniqueness paths:
        // - tagged_photo (rel_path, tag_id) is the historical key,
        //   so existing tag rows under the survivor's path collide
        //   and stay put.
        // - The (rel_path, tag_id) collision is the one that
        //   matters for idempotence; (content_hash, tag_id) at the
        //   bytes level isn't enforced by SQLite but the read path
        //   dedups on it, so an extra row would be cosmetic.
        // Tags whose rel_path differs are inserted, picking up the
        // survivor's content_hash so they live under the right bytes.
        let mut connection = self.connection.lock().expect("Unable to get ExifDao");
        diesel::sql_query(
            "INSERT OR IGNORE INTO tagged_photo (rel_path, tag_id, created_time, content_hash) \
             SELECT ?, tag_id, strftime('%s','now'), ? \
             FROM tagged_photo \
             WHERE content_hash = ? \
             AND tag_id NOT IN ( \
                SELECT tag_id FROM tagged_photo WHERE content_hash = ? \
             )",
        )
        // Bind order matches the four '?' placeholders in sequence:
        //   1. survivor_rel_path — rel_path written on the new tag rows
        //   2. survivor_hash     — content_hash written on the new rows
        //   3. demoted_hash      — source rows whose tags are copied
        //   4. survivor_hash     — tags the survivor already carries
        .bind::<diesel::sql_types::Text, _>(survivor_rel_path)
        .bind::<diesel::sql_types::Text, _>(survivor_hash)
        .bind::<diesel::sql_types::Text, _>(demoted_hash)
        .bind::<diesel::sql_types::Text, _>(survivor_hash)
        .execute(connection.deref_mut())
        .map(|_| ())
        .map_err(|_| anyhow::anyhow!("Tag union error"))
    })
    .map_err(|_| DbError::new(DbErrorKind::UpdateError))
}
}
#[cfg(test)]
@@ -1204,6 +1778,8 @@ mod exif_dao_tests {
last_modified: 0,
content_hash: None,
size_bytes: None,
phash_64: None,
dhash_64: None,
},
)
.expect("insert exif row");

View File

@@ -59,6 +59,10 @@ pub struct InsertImageExif {
pub last_modified: i64,
pub content_hash: Option<String>,
pub size_bytes: Option<i64>,
/// 64-bit pHash (DCT) packed as i64. NULL for videos and decode failures.
pub phash_64: Option<i64>,
/// 64-bit dHash (gradient). NULL for videos and decode failures.
pub dhash_64: Option<i64>,
}
// Field order matches the post-migration column order in `image_exif`.
@@ -86,6 +90,14 @@ pub struct ImageExif {
pub last_modified: i64,
pub content_hash: Option<String>,
pub size_bytes: Option<i64>,
pub phash_64: Option<i64>,
pub dhash_64: Option<i64>,
/// When non-null, this row is a soft-marked duplicate of the file
/// whose `content_hash` matches this value. The default `/photos`
/// listing filters such rows out.
pub duplicate_of_hash: Option<String>,
/// Unix seconds at which the resolve was committed.
pub duplicate_decided_at: Option<i64>,
}
#[derive(Insertable)]

View File

@@ -121,6 +121,10 @@ diesel::table! {
last_modified -> BigInt,
content_hash -> Nullable<Text>,
size_bytes -> Nullable<BigInt>,
phash_64 -> Nullable<BigInt>,
dhash_64 -> Nullable<BigInt>,
duplicate_of_hash -> Nullable<Text>,
duplicate_decided_at -> Nullable<BigInt>,
}
}

893
src/duplicates.rs Normal file
View File

@@ -0,0 +1,893 @@
//! Duplicate detection surface — exact (blake3) and perceptual
//! (pHash + Hamming) groups, plus the soft-mark resolve flow that
//! Apollo's DUPLICATES modal drives.
//!
//! All routes require auth (Claims). Endpoints:
//!
//! - `GET /duplicates/exact?library=&include_resolved=` — count>1 byte-identical groups.
//! - `GET /duplicates/perceptual?library=&threshold=&include_resolved=` — Hamming-clustered groups.
//! - `POST /duplicates/resolve` — soft-mark demoted siblings.
//! - `POST /duplicates/unresolve` — clear a prior soft-mark.
//!
//! Perceptual clustering caches the BK-tree result for 5 minutes so
//! repeated opens of the modal don't re-cluster the whole library.
//! Cache invalidation is best-effort: resolve/unresolve clear the
//! cache, but new files arriving via the watcher don't (the next
//! 5-minute window picks them up). For a single-user personal tool
//! that's the right trade-off.
use std::collections::HashMap;
use std::sync::Mutex;
use std::time::{Duration, Instant};
use actix_web::{App, HttpRequest, HttpResponse, Responder, dev::ServiceFactory, web};
use bk_tree::{BKTree, Metric};
use lazy_static::lazy_static;
use opentelemetry::trace::{TraceContextExt, Tracer};
use serde::{Deserialize, Serialize};
use crate::data::Claims;
use crate::database::{DuplicateRow, ExifDao};
use crate::libraries;
use crate::otel::{extract_context_from_request, global_tracer};
use crate::state::AppState;
// ── Cache ────────────────────────────────────────────────────────────────
/// How long a cached perceptual clustering result is served before the
/// next request recomputes it (see the module docs on invalidation).
const PERCEPTUAL_CACHE_TTL: Duration = Duration::from_secs(300);
#[derive(Clone)]
struct PerceptualCacheEntry {
    /// Cache key: (library_id, threshold, include_resolved). `library_id`
    /// is `None` for "all libraries". Cluster output is the same shape we
    /// return on the wire so we can serve cached requests with zero work.
    library_id: Option<i32>,
    threshold: u32,
    include_resolved: bool,
    /// Insertion time; entries older than `PERCEPTUAL_CACHE_TTL` count
    /// as misses and are recomputed.
    computed_at: Instant,
    /// Fully-clustered response payload, cloned straight onto the wire.
    groups: Vec<DuplicateGroup>,
}
lazy_static! {
static ref PERCEPTUAL_CACHE: Mutex<Option<PerceptualCacheEntry>> = Mutex::new(None);
}
/// Drop the perceptual-cluster cache so the next modal open reflects a
/// soft-mark change immediately; called from `resolve`/`unresolve`.
fn invalidate_perceptual_cache() {
    match PERCEPTUAL_CACHE.lock() {
        Ok(mut cached) => *cached = None,
        // A poisoned lock means a writer panicked mid-update; the stale
        // entry will age out via the TTL anyway, so ignore the error.
        Err(_) => {}
    }
}
// ── Wire shapes ──────────────────────────────────────────────────────────
/// One file inside a duplicate group, as serialized to the client.
/// A field-for-field projection of the database's `DuplicateRow`.
#[derive(Serialize, Debug, Clone)]
pub struct DuplicateMember {
    pub library_id: i32,
    pub rel_path: String,
    /// Exact-bytes hash of the file (blake3 — see the upload path).
    pub content_hash: String,
    pub size_bytes: Option<i64>,
    pub date_taken: Option<i64>,
    pub width: Option<i32>,
    pub height: Option<i32>,
    /// Set when this member is soft-marked as a duplicate of the file
    /// whose `content_hash` equals this value.
    pub duplicate_of_hash: Option<String>,
    /// Unix seconds at which the soft-mark was committed.
    pub duplicate_decided_at: Option<i64>,
}
impl From<DuplicateRow> for DuplicateMember {
fn from(r: DuplicateRow) -> Self {
Self {
library_id: r.library_id,
rel_path: r.rel_path,
content_hash: r.content_hash,
size_bytes: r.size_bytes,
date_taken: r.date_taken,
width: r.width,
height: r.height,
duplicate_of_hash: r.duplicate_of_hash,
duplicate_decided_at: r.duplicate_decided_at,
}
}
}
/// How a group was formed; serialized lowercase ("exact"/"perceptual").
#[derive(Serialize, Debug, Clone)]
#[serde(rename_all = "lowercase")]
pub enum DuplicateKind {
    /// Byte-identical members (same `content_hash`).
    Exact,
    /// Hamming-close perceptual hashes (see `cluster_perceptual`).
    Perceptual,
}
/// One cluster of duplicates returned by either listing endpoint.
#[derive(Serialize, Debug, Clone)]
pub struct DuplicateGroup {
    /// How this group was formed (exact vs perceptual).
    pub kind: DuplicateKind,
    /// Representative content_hash. For exact groups, the shared hash
    /// (every member has the same one). For perceptual groups, an
    /// arbitrary cluster member's hash, used only as a stable id for
    /// the UI to key off.
    pub representative_hash: String,
    /// Members of the group; always at least two (smaller groups are
    /// filtered out by the grouping/refinement passes).
    pub members: Vec<DuplicateMember>,
}
/// Query string shared by both GET listing endpoints.
#[derive(Deserialize, Debug)]
pub struct ListDuplicatesQuery {
    /// Optional library selector; absent means all libraries.
    pub library: Option<String>,
    /// Include rows already soft-marked as duplicates (default false).
    #[serde(default)]
    pub include_resolved: Option<bool>,
    /// Perceptual only — Hamming-distance threshold. Ignored on the
    /// exact endpoint. Defaults to 8 (~12% similarity tolerance, the
    /// sweet spot for resized/recompressed copies).
    #[serde(default)]
    pub threshold: Option<u32>,
}
/// (library_id, rel_path) reference to a single file in a group.
#[derive(Deserialize, Debug)]
pub struct DuplicateMemberRef {
    pub library_id: i32,
    pub rel_path: String,
}
/// Body of `POST /duplicates/resolve`: keep `survivor`, soft-mark
/// every entry in `demoted`.
#[derive(Deserialize, Debug)]
pub struct ResolveDuplicatesReq {
    pub survivor: DuplicateMemberRef,
    pub demoted: Vec<DuplicateMemberRef>,
}
/// Response of `POST /duplicates/resolve`.
#[derive(Serialize, Debug)]
pub struct ResolveResponse {
    /// Rows actually soft-marked; unknown or self-referencing demoted
    /// entries are skipped and not counted.
    pub resolved_count: usize,
}
/// Body of `POST /duplicates/unresolve`: the row whose soft-mark
/// should be cleared.
#[derive(Deserialize, Debug)]
pub struct UnresolveDuplicateReq {
    pub library_id: i32,
    pub rel_path: String,
}
// ── Handlers ─────────────────────────────────────────────────────────────
/// `GET /duplicates/exact` — groups of byte-identical files (shared
/// `content_hash`) with more than one member. `threshold` is ignored
/// here. Responds 500 on DAO failure, otherwise 200 with
/// `{ "groups": [...] }`.
async fn list_exact_handler(
    _: Claims,
    request: HttpRequest,
    app_state: web::Data<AppState>,
    query: web::Query<ListDuplicatesQuery>,
    exif_dao: web::Data<Mutex<Box<dyn ExifDao>>>,
) -> impl Responder {
    let context = extract_context_from_request(&request);
    let span = global_tracer().start_with_context("duplicates.list_exact", &context);
    let span_context = opentelemetry::Context::current_with_span(span);
    // NOTE(review): `.ok().flatten()` treats a failed library lookup
    // the same as "no library filter", so an invalid `library` param
    // silently widens the scan to all libraries — confirm intended.
    let library_id = libraries::resolve_library_param(&app_state, query.library.as_deref())
        .ok()
        .flatten()
        .map(|l| l.id);
    let include_resolved = query.include_resolved.unwrap_or(false);
    // Hold the DAO lock only for the query, not for grouping.
    let rows = {
        let mut dao = exif_dao.lock().expect("exif dao lock");
        match dao.list_duplicates_exact(&span_context, library_id, include_resolved) {
            Ok(rows) => rows,
            Err(e) => {
                return HttpResponse::InternalServerError().body(format!("{:?}", e));
            }
        }
    };
    let groups = group_exact(rows);
    HttpResponse::Ok().json(GroupsResponse { groups })
}
/// `GET /duplicates/perceptual` — Hamming-clustered near-duplicate
/// groups. `threshold` defaults to 8 and is clamped to 0..=32; results
/// are served from a single-entry, 5-minute cache keyed on
/// (library, threshold, include_resolved).
async fn list_perceptual_handler(
    _: Claims,
    request: HttpRequest,
    app_state: web::Data<AppState>,
    query: web::Query<ListDuplicatesQuery>,
    exif_dao: web::Data<Mutex<Box<dyn ExifDao>>>,
) -> impl Responder {
    let context = extract_context_from_request(&request);
    let span = global_tracer().start_with_context("duplicates.list_perceptual", &context);
    let span_context = opentelemetry::Context::current_with_span(span);
    // NOTE(review): as on the exact endpoint, a failed library lookup
    // silently widens the scan to all libraries.
    let library_id = libraries::resolve_library_param(&app_state, query.library.as_deref())
        .ok()
        .flatten()
        .map(|l| l.id);
    let threshold = query.threshold.unwrap_or(8).clamp(0, 32);
    let include_resolved = query.include_resolved.unwrap_or(false);
    // Cache hit?
    if let Ok(guard) = PERCEPTUAL_CACHE.lock()
        && let Some(entry) = guard.as_ref()
        && entry.library_id == library_id
        && entry.threshold == threshold
        && entry.include_resolved == include_resolved
        && entry.computed_at.elapsed() < PERCEPTUAL_CACHE_TTL
    {
        return HttpResponse::Ok().json(GroupsResponse {
            groups: entry.groups.clone(),
        });
    }
    let rows = {
        let mut dao = exif_dao.lock().expect("exif dao lock");
        match dao.list_perceptual_candidates(&span_context, library_id, include_resolved) {
            Ok(rows) => rows,
            Err(e) => {
                return HttpResponse::InternalServerError().body(format!("{:?}", e));
            }
        }
    };
    let groups = cluster_perceptual(rows, threshold);
    // Two concurrent misses can both cluster and race this write;
    // last writer wins, which is harmless for a single-entry cache.
    if let Ok(mut guard) = PERCEPTUAL_CACHE.lock() {
        *guard = Some(PerceptualCacheEntry {
            library_id,
            threshold,
            include_resolved,
            computed_at: Instant::now(),
            groups: groups.clone(),
        });
    }
    HttpResponse::Ok().json(GroupsResponse { groups })
}
/// `POST /duplicates/resolve` — soft-mark each `demoted` row as a
/// duplicate of the `survivor`'s content hash.
///
/// Not transactional: demoted rows are processed independently, so a
/// mid-loop DAO failure returns 500 with earlier rows already marked.
/// Unknown demoted refs and self-references are skipped and excluded
/// from `resolved_count`.
async fn resolve_handler(
    _: Claims,
    request: HttpRequest,
    body: web::Json<ResolveDuplicatesReq>,
    exif_dao: web::Data<Mutex<Box<dyn ExifDao>>>,
) -> impl Responder {
    let context = extract_context_from_request(&request);
    let span = global_tracer().start_with_context("duplicates.resolve", &context);
    let span_context = opentelemetry::Context::current_with_span(span);
    if body.demoted.is_empty() {
        return HttpResponse::BadRequest().body("demoted list is empty");
    }
    // The DAO stays locked for the whole loop; every read and write
    // below goes through this handle.
    let mut dao = exif_dao.lock().expect("exif dao lock");
    // Resolve survivor → its content_hash, plus the canonical rel_path
    // we'll use as the destination for any tag-union INSERTs.
    let survivor = match dao.lookup_duplicate_row(
        &span_context,
        body.survivor.library_id,
        &body.survivor.rel_path,
    ) {
        Ok(Some(row)) => row,
        Ok(None) => return HttpResponse::NotFound().body("survivor not found"),
        Err(e) => return HttpResponse::InternalServerError().body(format!("{:?}", e)),
    };
    // Survivor must not itself be soft-marked — otherwise the modal is
    // pointing at a row we've already demoted, which would create a chain.
    if survivor.duplicate_of_hash.is_some() {
        return HttpResponse::Conflict().body("survivor is itself soft-marked as a duplicate");
    }
    let now = chrono::Utc::now().timestamp();
    let mut resolved_count = 0usize;
    for member_ref in &body.demoted {
        let demoted = match dao.lookup_duplicate_row(
            &span_context,
            member_ref.library_id,
            &member_ref.rel_path,
        ) {
            Ok(Some(row)) => row,
            Ok(None) => {
                log::warn!(
                    "duplicates.resolve: skipping unknown demoted ({}, {})",
                    member_ref.library_id,
                    member_ref.rel_path
                );
                continue;
            }
            Err(e) => {
                return HttpResponse::InternalServerError().body(format!("{:?}", e));
            }
        };
        // Survivor and demoted must not be the same row (would set
        // duplicate_of_hash to its own hash — recursive nonsense).
        if demoted.library_id == survivor.library_id && demoted.rel_path == survivor.rel_path {
            continue;
        }
        // For perceptual dups (different content_hash), union the
        // demoted's tag set onto the survivor before flipping the
        // soft-mark. For exact dups (same content_hash), tags are
        // already shared at the bytes layer — the union is a no-op.
        if demoted.content_hash != survivor.content_hash
            && let Err(e) = dao.union_perceptual_tags(
                &span_context,
                &survivor.content_hash,
                &demoted.content_hash,
                &survivor.rel_path,
            )
        {
            log::warn!(
                "duplicates.resolve: tag union failed for {}: {:?}",
                demoted.rel_path,
                e
            );
            // Continue with the soft-mark anyway — losing tag
            // continuity is recoverable (unresolve restores the
            // demoted row's grid presence, and the original tags
            // never moved off the demoted hash).
        }
        if let Err(e) = dao.set_duplicate_of(
            &span_context,
            demoted.library_id,
            &demoted.rel_path,
            &survivor.content_hash,
            now,
        ) {
            return HttpResponse::InternalServerError().body(format!("{:?}", e));
        }
        resolved_count += 1;
    }
    // Release the DAO before taking the cache lock.
    drop(dao);
    invalidate_perceptual_cache();
    HttpResponse::Ok().json(ResolveResponse { resolved_count })
}
/// `POST /duplicates/unresolve` — clear a prior soft-mark so the row
/// reappears in the grid and in duplicate groups.
async fn unresolve_handler(
    _: Claims,
    request: HttpRequest,
    body: web::Json<UnresolveDuplicateReq>,
    exif_dao: web::Data<Mutex<Box<dyn ExifDao>>>,
) -> impl Responder {
    let context = extract_context_from_request(&request);
    let span = global_tracer().start_with_context("duplicates.unresolve", &context);
    let span_context = opentelemetry::Context::current_with_span(span);
    // Scope the DAO lock to the single write.
    let cleared = {
        let mut dao = exif_dao.lock().expect("exif dao lock");
        dao.clear_duplicate_of(&span_context, body.library_id, &body.rel_path)
    };
    match cleared {
        Err(e) => HttpResponse::InternalServerError().body(format!("{:?}", e)),
        Ok(_) => {
            // The cached clustering may still hide the restored row.
            invalidate_perceptual_cache();
            HttpResponse::Ok().finish()
        }
    }
}
// ── Grouping / clustering ────────────────────────────────────────────────
/// Wire envelope for both listing endpoints: `{ "groups": [...] }`.
#[derive(Serialize, Debug)]
struct GroupsResponse {
    groups: Vec<DuplicateGroup>,
}
/// Collapse rows into exact-duplicate groups: one group per
/// `content_hash` shared by two or more rows, ordered by descending
/// member count and then by representative hash for determinism.
fn group_exact(rows: Vec<DuplicateRow>) -> Vec<DuplicateGroup> {
    // Bucket every row under its content hash.
    let mut buckets: HashMap<String, Vec<DuplicateRow>> = HashMap::new();
    for row in rows {
        let key = row.content_hash.clone();
        buckets.entry(key).or_default().push(row);
    }
    // Only hashes held by at least two rows form a group.
    let mut groups = Vec::new();
    for (hash, members) in buckets {
        if members.len() < 2 {
            continue;
        }
        groups.push(DuplicateGroup {
            kind: DuplicateKind::Exact,
            representative_hash: hash,
            members: members.into_iter().map(DuplicateMember::from).collect(),
        });
    }
    // Largest groups first (most reward per click), then deterministic.
    groups.sort_by(|a, b| {
        b.members
            .len()
            .cmp(&a.members.len())
            .then_with(|| a.representative_hash.cmp(&b.representative_hash))
    });
    groups
}
/// Popcount band for a "useful" perceptual hash. Real photographic
/// content yields roughly balanced bit distributions; hashes outside
/// the [16, 48] band come from low-entropy structure (uniform skies,
/// black frames, monochrome scans, faded film) where pHash collapses
/// to near-uniform values that land Hamming-close to hundreds of
/// unrelated images. The 8/56 band that shipped first was too
/// permissive — even at threshold=4 the false-positive cluster
/// persisted.
const MIN_INFORMATIVE_POPCOUNT: u32 = 16;
const MAX_INFORMATIVE_POPCOUNT: u32 = 64 - MIN_INFORMATIVE_POPCOUNT;
/// True when `h`'s set-bit count sits inside the informative band.
#[inline]
fn is_informative_hash(h: i64) -> bool {
    let ones = (h as u64).count_ones();
    MIN_INFORMATIVE_POPCOUNT <= ones && ones <= MAX_INFORMATIVE_POPCOUNT
}
/// dHash validation budget derived from the pHash threshold. pHash is
/// the candidate-discovery signal (BK-tree neighbourhood lookup);
/// dHash is the validation signal that must independently agree
/// before a pair is unioned. Giving dHash roughly half the pHash
/// budget lets real near-dups (close on both signals) survive while
/// incidental pHash collisions on uniform content get vetoed.
///
/// Floored at 2 so threshold=4 still tolerates a 1-bit dHash jitter —
/// genuine resampling can flip a low-frequency gradient bit even when
/// the visual content is identical.
#[inline]
fn dhash_threshold(phash_threshold: u32) -> u32 {
    let half = phash_threshold / 2;
    if half < 2 { 2 } else { half }
}
/// Single-link cluster the input rows by Hamming distance over their
/// pHash, with `threshold` as the maximum distance for an edge. Rows
/// without a pHash, or with a degenerate (low-entropy) pHash, are
/// excluded — they'd chain together unrelated images.
///
/// Two-signal validation: the BK-tree gives candidate pairs cheaply,
/// then we additionally require dHash agreement before unioning. pHash
/// alone is too permissive; pairing it with dHash collapses the false-
/// positive cluster significantly (different DCT vs gradient
/// signatures on real near-dups still both stay close, but spurious
/// pHash collisions on uniform images don't survive the dHash check).
///
/// Implementation: BK-tree neighbourhood lookup per row, union-find
/// over the validated edges. O(N log N) instead of the O(N²) naive
/// pairwise scan; on a 1.26M-row library that's the difference between
/// "responds in 1.5 s" and "responds in 25 minutes".
fn cluster_perceptual(rows: Vec<DuplicateRow>, threshold: u32) -> Vec<DuplicateGroup> {
    let candidates: Vec<DuplicateRow> = rows
        .into_iter()
        .filter(|r| r.phash_64.is_some_and(is_informative_hash))
        .collect();
    if candidates.len() < 2 {
        return Vec::new();
    }
    // Build BK-tree keyed on (phash_u64, index-in-candidates).
    let mut tree: BKTree<HashKey, HammingMetric> = BKTree::new(HammingMetric);
    for (idx, row) in candidates.iter().enumerate() {
        if let Some(p) = row.phash_64 {
            tree.add(HashKey {
                phash: p as u64,
                idx,
            });
        }
    }
    // Union-find over edges within `threshold`. For a candidate pair
    // surfaced by the pHash BK-tree, require dHash within a *stricter*
    // threshold (`dhash_threshold(threshold)`) before unioning. pHash
    // agreement on low-entropy structure can be incidental; pHash
    // agreement AND dHash within roughly half that distance is a
    // strong near-dup signal. dHash on either side missing → reject
    // (was: trust pHash alone). Missing dHash means we can't validate
    // the candidate, and the false-positive cost outweighs the rare
    // case of a partial backfill.
    let dhash_max = dhash_threshold(threshold);
    let mut uf = UnionFind::new(candidates.len());
    for (idx, row) in candidates.iter().enumerate() {
        let Some(p) = row.phash_64 else { continue };
        let key = HashKey {
            phash: p as u64,
            idx,
        };
        for (_, neighbour) in tree.find(&key, threshold) {
            if neighbour.idx == idx {
                continue;
            }
            let other = &candidates[neighbour.idx];
            let dhash_ok = match (row.dhash_64, other.dhash_64) {
                (Some(a), Some(b)) => {
                    (a as u64 ^ b as u64).count_ones() <= dhash_max
                        && is_informative_hash(a)
                        && is_informative_hash(b)
                }
                _ => false,
            };
            if dhash_ok {
                uf.union(idx, neighbour.idx);
            }
        }
    }
    // Bucket by root.
    let mut by_root: HashMap<usize, Vec<DuplicateRow>> = HashMap::new();
    for (idx, row) in candidates.into_iter().enumerate() {
        let root = uf.find(idx);
        by_root.entry(root).or_default().push(row);
    }
    // Medoid-validate each cluster to break single-link chains.
    // Single-link unions any pair within threshold; that means a chain
    // A↔B↔C can collapse into one cluster even when A and C aren't
    // similar. The medoid pass picks the cluster's most-central member
    // and drops any other whose distance to it exceeds threshold —
    // chains lose their tail, dense real-near-dup clusters keep all
    // members. Discard clusters that drop below 2 after refinement.
    // (Bound mutably at the collect — the previous immutable binding
    // followed by `let mut groups = groups;` was a redundant rebind.)
    let mut groups: Vec<DuplicateGroup> = by_root
        .into_values()
        .filter_map(|cluster| refine_cluster(cluster, threshold, dhash_max))
        .map(|cluster| {
            let representative_hash = cluster[0].content_hash.clone();
            DuplicateGroup {
                kind: DuplicateKind::Perceptual,
                representative_hash,
                members: cluster.into_iter().map(DuplicateMember::from).collect(),
            }
        })
        .collect();
    groups.sort_by(|a, b| {
        b.members
            .len()
            .cmp(&a.members.len())
            .then_with(|| a.representative_hash.cmp(&b.representative_hash))
    });
    groups
}
/// Tighten a single-link cluster to its medoid neighbourhood. Returns
/// `None` when fewer than 2 members survive — caller drops the cluster.
///
/// `phash_max`/`dhash_max` are the same budgets used for edge
/// validation upstream; here they bound each member's distance to the
/// medoid instead of to an arbitrary neighbour.
fn refine_cluster(
    cluster: Vec<DuplicateRow>,
    phash_max: u32,
    dhash_max: u32,
) -> Option<Vec<DuplicateRow>> {
    if cluster.len() < 2 {
        return None;
    }
    if cluster.len() == 2 {
        // No chain can exist with only two members; the union-find
        // already guaranteed both signals validated when joining.
        return Some(cluster);
    }
    // Hashes as u64 for XOR/popcount. `unwrap_or(0)` on pHash is a
    // formality (members passed the informative-pHash filter), but a
    // genuinely missing dHash scores as 0 here.
    let phashes: Vec<u64> = cluster
        .iter()
        .map(|r| r.phash_64.unwrap_or(0) as u64)
        .collect();
    let dhashes: Vec<u64> = cluster
        .iter()
        .map(|r| r.dhash_64.unwrap_or(0) as u64)
        .collect();
    // Pick the medoid: the member whose summed pHash+dHash distance to
    // the rest of the cluster is smallest. Ties break first-best-wins,
    // i.e. the earliest member in the cluster's iteration order.
    // NOTE(review): that order derives from the caller's candidate
    // ordering, not from content_hash — confirm whether stronger
    // cross-run determinism is needed here.
    let mut best_idx = 0usize;
    let mut best_score = u32::MAX;
    for i in 0..cluster.len() {
        let mut score: u32 = 0;
        for j in 0..cluster.len() {
            if i == j {
                continue;
            }
            score = score.saturating_add((phashes[i] ^ phashes[j]).count_ones());
            score = score.saturating_add((dhashes[i] ^ dhashes[j]).count_ones());
        }
        if score < best_score {
            best_score = score;
            best_idx = i;
        }
    }
    let medoid_phash = phashes[best_idx];
    let medoid_dhash = dhashes[best_idx];
    // Keep the medoid plus every member within both budgets of it.
    let kept: Vec<DuplicateRow> = cluster
        .into_iter()
        .enumerate()
        .filter(|(i, _)| {
            *i == best_idx
                || ((phashes[*i] ^ medoid_phash).count_ones() <= phash_max
                    && (dhashes[*i] ^ medoid_dhash).count_ones() <= dhash_max)
        })
        .map(|(_, r)| r)
        .collect();
    if kept.len() < 2 { None } else { Some(kept) }
}
/// BK-tree key: the packed pHash plus the row's index into the
/// `candidates` vec, so a tree hit maps back to its row and a lookup
/// can skip the query row itself.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
struct HashKey {
    phash: u64,
    idx: usize,
}
/// XOR-popcount Hamming distance over the packed 64-bit pHash.
struct HammingMetric;
impl Metric<HashKey> for HammingMetric {
    /// Number of differing bits between the two hashes (0..=64).
    fn distance(&self, a: &HashKey, b: &HashKey) -> u32 {
        (a.phash ^ b.phash).count_ones()
    }
    /// Honour the trait contract: return `None` when the distance
    /// exceeds `threshold`, letting the BK-tree prune. The previous
    /// impl returned `Some(distance)` unconditionally, ignoring the
    /// threshold parameter and forfeiting that optimisation hook.
    fn threshold_distance(&self, a: &HashKey, b: &HashKey, threshold: u32) -> Option<u32> {
        let d = self.distance(a, b);
        if d <= threshold { Some(d) } else { None }
    }
}
/// Disjoint-set forest (union-by-rank + path compression) used to
/// merge validated near-duplicate pairs into clusters.
struct UnionFind {
    parent: Vec<usize>,
    rank: Vec<u8>,
}
impl UnionFind {
    /// `n` singleton sets, each element its own parent.
    fn new(n: usize) -> Self {
        Self {
            parent: (0..n).collect(),
            rank: vec![0; n],
        }
    }
    /// Root of `x`'s set. Iterative two-pass compression: locate the
    /// root, then repoint every node on the walk directly at it —
    /// same effect as the recursive form, without the call stack.
    fn find(&mut self, x: usize) -> usize {
        let mut root = x;
        while self.parent[root] != root {
            root = self.parent[root];
        }
        let mut node = x;
        while self.parent[node] != root {
            let next = self.parent[node];
            self.parent[node] = root;
            node = next;
        }
        root
    }
    /// Merge the sets containing `a` and `b`, attaching the shallower
    /// tree under the deeper one so find paths stay short.
    fn union(&mut self, a: usize, b: usize) {
        let root_a = self.find(a);
        let root_b = self.find(b);
        if root_a == root_b {
            return;
        }
        if self.rank[root_a] < self.rank[root_b] {
            self.parent[root_a] = root_b;
        } else {
            if self.rank[root_a] == self.rank[root_b] {
                self.rank[root_a] += 1;
            }
            self.parent[root_b] = root_a;
        }
    }
}
// ── Routing ──────────────────────────────────────────────────────────────
/// Register the four duplicate-detection routes on `app`. The generic
/// bound mirrors the other `add_*_services` feature hooks so this can
/// be chained from `main`'s app builder.
pub fn add_duplicate_services<T>(app: App<T>) -> App<T>
where
    T: ServiceFactory<
        actix_web::dev::ServiceRequest,
        Config = (),
        Error = actix_web::Error,
        InitError = (),
    >,
{
    app.service(web::resource("/duplicates/exact").route(web::get().to(list_exact_handler)))
        .service(
            web::resource("/duplicates/perceptual").route(web::get().to(list_perceptual_handler)),
        )
        .service(web::resource("/duplicates/resolve").route(web::post().to(resolve_handler)))
        .service(web::resource("/duplicates/unresolve").route(web::post().to(unresolve_handler)))
}
// ── Tests ────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use super::*;
    /// Minimal row with no dHash — sufficient for the exact-grouping
    /// and entropy-filter tests.
    fn row(library_id: i32, rel: &str, hash: &str, phash: Option<i64>) -> DuplicateRow {
        DuplicateRow {
            library_id,
            rel_path: rel.into(),
            content_hash: hash.into(),
            size_bytes: Some(1000),
            date_taken: None,
            width: None,
            height: None,
            phash_64: phash,
            dhash_64: None,
            duplicate_of_hash: None,
            duplicate_decided_at: None,
        }
    }
    #[test]
    fn group_exact_collapses_by_hash() {
        let rows = vec![
            row(1, "a.jpg", "h1", None),
            row(1, "b.jpg", "h1", None),
            row(2, "c.jpg", "h1", None),
            row(1, "lonely.jpg", "h2", None),
        ];
        let groups = group_exact(rows);
        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].representative_hash, "h1");
        assert_eq!(groups[0].members.len(), 3);
    }
    /// All hashes used below have popcount in the "informative"
    /// 16..=48 band (`MIN_INFORMATIVE_POPCOUNT`..=`MAX_INFORMATIVE_POPCOUNT`)
    /// so they survive the entropy filter that keeps solid-colour
    /// images out of the cluster graph. (A previous comment said
    /// 8..=56 — that was the first-shipped band, since tightened.)
    const INFORMATIVE_BASE: i64 = 0x55AA_55AA_55AA_55AA; // popcount = 32
    const INFORMATIVE_NEAR: i64 = 0x55AA_55AA_55AA_55AB; // 1-bit away from BASE
    const INFORMATIVE_FAR: i64 = 0x6996_6996_6996_6996; // 32-bits away from BASE
    /// Row builder with an explicit dHash, for the two-signal tests.
    fn row_with_dhash(
        library_id: i32,
        rel: &str,
        hash: &str,
        phash: Option<i64>,
        dhash: Option<i64>,
    ) -> DuplicateRow {
        DuplicateRow {
            library_id,
            rel_path: rel.into(),
            content_hash: hash.into(),
            size_bytes: Some(1000),
            date_taken: None,
            width: None,
            height: None,
            phash_64: phash,
            dhash_64: dhash,
            duplicate_of_hash: None,
            duplicate_decided_at: None,
        }
    }
    #[test]
    fn cluster_perceptual_unites_close_hashes() {
        // Two rows near each other on both pHash and dHash; one far
        // on pHash. Threshold 4 should merge the close pair.
        let rows = vec![
            row_with_dhash(
                1,
                "a.jpg",
                "h1",
                Some(INFORMATIVE_BASE),
                Some(INFORMATIVE_BASE),
            ),
            row_with_dhash(
                1,
                "b.jpg",
                "h2",
                Some(INFORMATIVE_NEAR),
                Some(INFORMATIVE_NEAR),
            ),
            row_with_dhash(
                1,
                "c.jpg",
                "h3",
                Some(INFORMATIVE_FAR),
                Some(INFORMATIVE_FAR),
            ),
        ];
        let groups = cluster_perceptual(rows, 4);
        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].members.len(), 2);
        let paths: Vec<&str> = groups[0]
            .members
            .iter()
            .map(|m| m.rel_path.as_str())
            .collect();
        assert!(paths.contains(&"a.jpg"));
        assert!(paths.contains(&"b.jpg"));
    }
    #[test]
    fn cluster_perceptual_threshold_zero_drops_distinct() {
        // BASE and NEAR differ by 1 bit, so threshold 0 forms no edge.
        let rows = vec![
            row_with_dhash(
                1,
                "a.jpg",
                "h1",
                Some(INFORMATIVE_BASE),
                Some(INFORMATIVE_BASE),
            ),
            row_with_dhash(
                1,
                "b.jpg",
                "h2",
                Some(INFORMATIVE_NEAR),
                Some(INFORMATIVE_NEAR),
            ),
        ];
        let groups = cluster_perceptual(rows, 0);
        assert!(groups.is_empty());
    }
    #[test]
    fn cluster_perceptual_skips_singletons() {
        let rows = vec![row(1, "alone.jpg", "h1", Some(INFORMATIVE_BASE))];
        assert!(cluster_perceptual(rows, 8).is_empty());
    }
    #[test]
    fn cluster_perceptual_filters_low_entropy_hashes() {
        // Both 0 (popcount 0) and i64::MAX (popcount 63) fall outside
        // the informative band. A pair of these would trivially match
        // (Hamming distance to each other small or zero) without the
        // entropy filter — that's exactly the regression that was
        // producing a giant first cluster of solid-colour images.
        let rows = vec![
            row(1, "blank-a.jpg", "h1", Some(0)),
            row(1, "blank-b.jpg", "h2", Some(0)),
            row(1, "white-a.jpg", "h3", Some(i64::MAX)),
            row(1, "white-b.jpg", "h4", Some(i64::MAX)),
        ];
        assert!(cluster_perceptual(rows, 8).is_empty());
    }
    #[test]
    fn cluster_perceptual_requires_dhash_agreement() {
        // pHash within threshold but dHash far apart — the candidate
        // edge from the BK-tree must be rejected. Without the dHash
        // double-check this would form a 2-member cluster.
        let rows = vec![
            row_with_dhash(
                1,
                "a.jpg",
                "h1",
                Some(INFORMATIVE_BASE),
                Some(INFORMATIVE_BASE),
            ),
            row_with_dhash(
                1,
                "b.jpg",
                "h2",
                Some(INFORMATIVE_NEAR),
                Some(INFORMATIVE_FAR),
            ),
        ];
        assert!(cluster_perceptual(rows, 4).is_empty());
    }
    #[test]
    fn cluster_perceptual_breaks_long_chain_at_medoid() {
        // 4-link chain at threshold=2 with pairwise distances chosen
        // so single-link unions all four but the endpoints sit past
        // the medoid's neighbourhood. Bit positions hop by exactly 2
        // bits per step, in non-overlapping nibbles, so consecutive
        // hops compose into wider distant-pair distances:
        //   A↔B = 2, B↔C = 2, C↔D = 2,
        //   A↔C = 4, B↔D = 4, A↔D = 6.
        // Medoid (B or C) keeps Δ ≤ 2 of itself; the far endpoint
        // gets chopped, leaving exactly 3 members.
        const A: i64 = 0x55AA_55AA_55AA_55AA;
        const B: i64 = 0x55AA_55AA_55AA_55A9; // ^0x03 last byte
        const C: i64 = 0x55AA_55AA_55AA_55A5; // ^0x0C from B
        const D: i64 = 0x55AA_55AA_55AA_5595; // ^0x30 from C
        let rows = vec![
            row_with_dhash(1, "a.jpg", "h1", Some(A), Some(A)),
            row_with_dhash(1, "b.jpg", "h2", Some(B), Some(B)),
            row_with_dhash(1, "c.jpg", "h3", Some(C), Some(C)),
            row_with_dhash(1, "d.jpg", "h4", Some(D), Some(D)),
        ];
        let groups = cluster_perceptual(rows, 2);
        assert_eq!(groups.len(), 1);
        assert_eq!(
            groups[0].members.len(),
            3,
            "medoid pass should chop one chain endpoint past Δ=2"
        );
    }
    /// Sanity-check the BK-tree's metric, which is what the duplicates
    /// path actually clusters on.
    #[test]
    fn hamming_metric_is_symmetric() {
        let m = HammingMetric;
        let a = HashKey {
            phash: 0b1010,
            idx: 0,
        };
        let b = HashKey {
            phash: 0b0101,
            idx: 1,
        };
        let d1 = m.distance(&a, &b);
        let d2 = m.distance(&b, &a);
        assert_eq!(d1, d2);
        assert_eq!(d1, 4);
    }
}

View File

@@ -583,9 +583,10 @@ pub async fn list_photos<TagD: TagDao, FS: FileSystemAccess>(
} else {
Some(trimmed)
};
let include_duplicates = req.include_duplicates.unwrap_or(false);
let rows = {
let mut dao = exif_dao.lock().expect("Unable to get ExifDao");
dao.list_rel_paths_for_libraries(&span_context, &lib_ids, prefix)
dao.list_rel_paths_for_libraries(&span_context, &lib_ids, prefix, include_duplicates)
.unwrap_or_else(|e| {
warn!("list_rel_paths_for_libraries failed: {:?}", e);
Vec::new()
@@ -1503,6 +1504,10 @@ mod tests {
last_modified: data.last_modified,
content_hash: data.content_hash.clone(),
size_bytes: data.size_bytes,
phash_64: data.phash_64,
dhash_64: data.dhash_64,
duplicate_of_hash: None,
duplicate_decided_at: None,
})
}
@@ -1542,6 +1547,10 @@ mod tests {
last_modified: data.last_modified,
content_hash: data.content_hash.clone(),
size_bytes: data.size_bytes,
phash_64: data.phash_64,
dhash_64: data.dhash_64,
duplicate_of_hash: None,
duplicate_decided_at: None,
})
}
@@ -1689,6 +1698,7 @@ mod tests {
_context: &opentelemetry::Context,
_library_ids: &[i32],
_path_prefix: Option<&str>,
_include_duplicates: bool,
) -> Result<Vec<(i32, String)>, DbError> {
Ok(vec![])
}
@@ -1719,6 +1729,82 @@ mod tests {
) -> Result<Vec<(i32, String)>, DbError> {
Ok(Vec::new())
}
fn get_rows_missing_perceptual_hash(
&mut self,
_context: &opentelemetry::Context,
_limit: i64,
) -> Result<Vec<(i32, String)>, DbError> {
Ok(Vec::new())
}
fn backfill_perceptual_hash(
&mut self,
_context: &opentelemetry::Context,
_library_id: i32,
_rel_path: &str,
_phash_64: Option<i64>,
_dhash_64: Option<i64>,
) -> Result<(), DbError> {
Ok(())
}
fn list_duplicates_exact(
&mut self,
_context: &opentelemetry::Context,
_library_id: Option<i32>,
_include_resolved: bool,
) -> Result<Vec<crate::database::DuplicateRow>, DbError> {
Ok(Vec::new())
}
fn list_perceptual_candidates(
&mut self,
_context: &opentelemetry::Context,
_library_id: Option<i32>,
_include_resolved: bool,
) -> Result<Vec<crate::database::DuplicateRow>, DbError> {
Ok(Vec::new())
}
fn lookup_duplicate_row(
&mut self,
_context: &opentelemetry::Context,
_library_id: i32,
_rel_path: &str,
) -> Result<Option<crate::database::DuplicateRow>, DbError> {
Ok(None)
}
fn set_duplicate_of(
&mut self,
_context: &opentelemetry::Context,
_library_id: i32,
_rel_path: &str,
_survivor_hash: &str,
_decided_at: i64,
) -> Result<(), DbError> {
Ok(())
}
fn clear_duplicate_of(
&mut self,
_context: &opentelemetry::Context,
_library_id: i32,
_rel_path: &str,
) -> Result<(), DbError> {
Ok(())
}
fn union_perceptual_tags(
&mut self,
_context: &opentelemetry::Context,
_survivor_hash: &str,
_demoted_hash: &str,
_survivor_rel_path: &str,
) -> Result<(), DbError> {
Ok(())
}
}
mod api {

View File

@@ -10,6 +10,7 @@ pub mod cleanup;
pub mod content_hash;
pub mod data;
pub mod database;
pub mod duplicates;
pub mod error;
pub mod exif;
pub mod face_watch;
@@ -23,6 +24,7 @@ pub mod library_maintenance;
pub mod memories;
pub mod otel;
pub mod parsers;
pub mod perceptual_hash;
pub mod service;
pub mod state;
pub mod tags;

View File

@@ -64,6 +64,7 @@ mod auth;
mod content_hash;
mod data;
mod database;
mod duplicates;
mod error;
mod exif;
mod face_watch;
@@ -73,6 +74,7 @@ mod files;
mod geo;
mod libraries;
mod library_maintenance;
mod perceptual_hash;
mod state;
mod tags;
mod utils;
@@ -530,6 +532,11 @@ async fn set_image_gps(
.ok()
.map(|c| c.content_hash),
size_bytes: content_hash::compute(&full_path).ok().map(|c| c.size_bytes),
// GPS-update path doesn't touch perceptual hashes either; columns
// ignored by update_exif. Compute best-effort so a new file lands
// with a usable signal; failure just leaves prior values in place.
phash_64: perceptual_hash::compute(&full_path).map(|h| h.phash_64),
dhash_64: perceptual_hash::compute(&full_path).map(|h| h.dhash_64),
};
let updated = {
@@ -652,6 +659,37 @@ async fn upload_image(
&full_path.to_str().unwrap().to_string(),
true,
) {
// Pre-write content-hash check: if these exact bytes already
// exist anywhere in any library (and aren't themselves
// soft-marked as duplicates), don't write the file. Return
// 409 with the canonical sibling so the mobile app can show
// a friendly "already in your library" toast.
let upload_hash = blake3::Hasher::new()
.update(&file_content)
.finalize()
.to_hex()
.to_string();
{
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
if let Ok(Some(existing)) = dao.find_by_content_hash(&span_context, &upload_hash)
&& existing.duplicate_of_hash.is_none()
{
let library_name = libraries::load_all(&mut crate::database::connect())
.into_iter()
.find(|l| l.id == existing.library_id)
.map(|l| l.name);
span.set_status(Status::Ok);
return HttpResponse::Conflict().json(serde_json::json!({
"duplicate_of": {
"library_id": existing.library_id,
"rel_path": existing.file_path,
},
"content_hash": upload_hash,
"library_name": library_name,
}));
}
}
let context =
opentelemetry::Context::new().with_remote_span_context(span.span_context().clone());
tracer
@@ -710,6 +748,7 @@ async fn upload_image(
(None, None)
}
};
let perceptual = perceptual_hash::compute(&uploaded_path);
let insert_exif = InsertImageExif {
library_id: target_library.id,
file_path: relative_path.clone(),
@@ -731,6 +770,8 @@ async fn upload_image(
last_modified: timestamp,
content_hash,
size_bytes,
phash_64: perceptual.map(|h| h.phash_64),
dhash_64: perceptual.map(|h| h.dhash_64),
};
if let Ok(mut dao) = exif_dao.lock() {
@@ -1661,6 +1702,7 @@ fn main() -> std::io::Result<()> {
.add_feature(add_tag_services::<_, SqliteTagDao>)
.add_feature(knowledge::add_knowledge_services::<_, SqliteKnowledgeDao>)
.add_feature(faces::add_face_services::<_, faces::SqliteFaceDao>)
.add_feature(duplicates::add_duplicate_services)
.app_data(app_data.clone())
.app_data::<Data<RealFileSystem>>(Data::new(RealFileSystem::new(
app_data.base_path.clone(),
@@ -2309,6 +2351,12 @@ fn process_new_files(
}
};
// Perceptual hashes (pHash + dHash). Best-effort — None for
// videos and decode failures. Drives near-duplicate detection
// in the Apollo duplicates surface; failure here is non-fatal
// and never blocks indexing.
let perceptual = perceptual_hash::compute(&file_path);
// EXIF is best-effort enrichment. When extraction fails (or the
// file type doesn't support EXIF) we still store a row with all
// EXIF fields NULL; the file remains visible to sort-by-date
@@ -2360,6 +2408,8 @@ fn process_new_files(
last_modified: timestamp,
content_hash,
size_bytes,
phash_64: perceptual.map(|h| h.phash_64),
dhash_64: perceptual.map(|h| h.dhash_64),
};
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");

159
src/perceptual_hash.rs Normal file
View File

@@ -0,0 +1,159 @@
//! Perceptual image hashing for near-duplicate detection.
//!
//! Two 64-bit signals per image, packed into i64 for storage and fast
//! Hamming distance via XOR + popcount:
//!
//! - **pHash (DCT)** — robust to lossy recompression, format conversion,
//! moderate brightness/contrast shifts. The primary signal.
//! - **dHash (gradient)** — much cheaper to compute, robust to scaling
//! and small crops. Acts as a fallback / corroboration when pHash is
//! ambiguous (very flat images can collide).
//!
//! Image-only by design. Videos, decode failures, and any image we
//! can't open all return `None` — perceptual hash failure is non-fatal
//! and must not block the indexer; the file is still hashed by blake3
//! and exact-match dedup keeps working.
use std::path::Path;
use image_hasher::{HashAlg, HasherConfig};
/// 64-bit perceptual fingerprint pair.
///
/// Both hashes are packed into `i64` (see `bytes_to_i64`) so storage is a
/// plain integer column and similarity is a single XOR + popcount via
/// [`hamming_distance`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct PerceptualIdentity {
    // DCT-based pHash — the primary near-duplicate signal (see module docs).
    pub phash_64: i64,
    // Gradient-based dHash — cheaper fallback / corroboration signal.
    pub dhash_64: i64,
}
/// Compute pHash + dHash for an image at `path`. Returns `None` on
/// decode failure (unsupported format, corrupt bytes, video, etc.) —
/// callers should treat that as "no perceptual signal available" and
/// proceed with exact-match dedup only.
pub fn compute(path: &Path) -> Option<PerceptualIdentity> {
    let decoded = image::open(path).ok()?;

    // 8x8 = 64 bits, the standard size for pHash/dHash. Larger sizes
    // give more discriminative power but no longer fit in i64 and the
    // marginal robustness isn't worth the storage / index cost for a
    // personal-scale library.
    //
    // Mean hash + DCT preprocessing is how image_hasher spells the
    // classic DCT pHash.
    let dct_hasher = HasherConfig::new()
        .hash_alg(HashAlg::Mean)
        .hash_size(8, 8)
        .preproc_dct()
        .to_hasher();
    let gradient_hasher = HasherConfig::new()
        .hash_alg(HashAlg::Gradient)
        .hash_size(8, 8)
        .to_hasher();

    let phash_64 = bytes_to_i64(dct_hasher.hash_image(&decoded).as_bytes())?;
    let dhash_64 = bytes_to_i64(gradient_hasher.hash_image(&decoded).as_bytes())?;
    Some(PerceptualIdentity { phash_64, dhash_64 })
}
/// Hamming distance between two 64-bit perceptual hashes. The primary
/// query primitive: two images are "near-duplicates" when this is below
/// a threshold (default 8 for pHash, ~12% similarity tolerance). The
/// duplicates module clusters via a BK-tree which uses its own copy of
/// this calculation; this helper is kept for ad-hoc tools and tests.
#[allow(dead_code)]
#[inline]
pub fn hamming_distance(a: i64, b: i64) -> u32 {
    // XOR leaves a 1 bit exactly where the hashes disagree; popcount
    // then counts the disagreements.
    let differing_bits = a ^ b;
    differing_bits.count_ones()
}
/// Pack the first 8 bytes of a hash into a big-endian `i64`.
///
/// Returns `None` when fewer than 8 bytes are available — not expected
/// for the 8x8 hash size used by `compute`, but kept defensive so a
/// config change can't panic here. Extra bytes beyond the first 8 are
/// ignored.
fn bytes_to_i64(bytes: &[u8]) -> Option<i64> {
    // `get(..8)?` does the length check; the slice-to-array `try_into`
    // is then infallible, so this is the idiomatic panic-free form of
    // the manual buffer + copy_from_slice dance.
    let head: [u8; 8] = bytes.get(..8)?.try_into().ok()?;
    Some(i64::from_be_bytes(head))
}
#[cfg(test)]
mod tests {
    use super::*;
    use image::{ImageBuffer, Rgb};

    /// Write a 64x64 PNG whose pixels follow a simple gradient offset by
    /// `seed`: deterministic per seed, distinct across seeds. A gradient
    /// (rather than a flat fill) gives pHash/dHash real structure to
    /// latch onto — a uniform image collapses to an all-zero hash.
    fn write_test_image(path: &Path, seed: u32) {
        let pixels: ImageBuffer<Rgb<u8>, Vec<u8>> = ImageBuffer::from_fn(64, 64, |x, y| {
            Rgb([
                ((x + seed) & 0xFF) as u8,
                ((y + seed * 2) & 0xFF) as u8,
                ((x ^ y ^ seed) & 0xFF) as u8,
            ])
        });
        pixels.save(path).unwrap();
    }

    #[test]
    fn identical_bytes_yield_identical_hashes() {
        let dir = tempfile::tempdir().unwrap();
        let first = dir.path().join("a.png");
        let second = dir.path().join("b.png");
        write_test_image(&first, 42);
        write_test_image(&second, 42);

        let hash_a = compute(&first).expect("hash a");
        let hash_b = compute(&second).expect("hash b");
        assert_eq!(hash_a, hash_b);
        assert_eq!(hamming_distance(hash_a.phash_64, hash_b.phash_64), 0);
    }

    #[test]
    fn distinct_images_have_distinct_hashes() {
        let dir = tempfile::tempdir().unwrap();
        let first = dir.path().join("a.png");
        let second = dir.path().join("b.png");
        write_test_image(&first, 42);
        write_test_image(&second, 123);

        let hash_a = compute(&first).expect("hash a");
        let hash_b = compute(&second).expect("hash b");
        assert_ne!(hash_a.phash_64, hash_b.phash_64);
    }

    #[test]
    fn resized_copy_is_near_duplicate_under_threshold() {
        // The whole point of perceptual hashing: a resized copy of the
        // same source image should land within a small Hamming distance
        // of the original. dHash is checked specifically because it's
        // the more resize-robust of the two signals; pHash is also
        // tight, but gradient-based dHash is the most reliable here.
        let dir = tempfile::tempdir().unwrap();
        let original = dir.path().join("a.png");
        write_test_image(&original, 7);

        let full = image::open(&original).unwrap();
        let downscaled = full.resize_exact(32, 32, image::imageops::FilterType::Lanczos3);
        let resized = dir.path().join("b.png");
        downscaled.save(&resized).unwrap();

        let hash_a = compute(&original).expect("hash a");
        let hash_b = compute(&resized).expect("hash b");
        let d_dhash = hamming_distance(hash_a.dhash_64, hash_b.dhash_64);
        assert!(
            d_dhash <= 8,
            "expected dhash Hamming distance <= 8 for resized copy, got {}",
            d_dhash
        );
    }

    #[test]
    fn unsupported_path_returns_none() {
        // Non-image bytes must fail soft: None, never a panic.
        let dir = tempfile::tempdir().unwrap();
        let not_image = dir.path().join("notanimage.txt");
        std::fs::write(&not_image, b"hello").unwrap();
        assert!(compute(&not_image).is_none());
    }

    #[test]
    fn missing_file_returns_none() {
        let absent = Path::new("/nonexistent/path/that/does/not/exist.png");
        assert!(compute(absent).is_none());
    }
}