Merge pull request 'feature/duplicate-detection' (#73) from feature/duplicate-detection into master

Reviewed-on: #73
This commit was merged in pull request #73.
This commit is contained in:
2026-05-03 22:34:49 +00:00
14 changed files with 2174 additions and 1 deletions

88
Cargo.lock generated
View File

@@ -600,6 +600,16 @@ version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6099cdc01846bc367c4e7dd630dc5966dccf36b652fae7a74e17b640411a91b2"
[[package]]
name = "bk-tree"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8283fb8e64b873918f8bc527efa6aff34956296e48ea750a9c909cd47c01546"
dependencies = [
"fnv",
"triple_accel",
]
[[package]]
name = "blake3"
version = "1.8.4"
@@ -1928,6 +1938,7 @@ dependencies = [
"async-trait",
"base64",
"bcrypt",
"bk-tree",
"blake3",
"bytes",
"chrono",
@@ -1939,6 +1950,7 @@ dependencies = [
"futures",
"ical",
"image",
"image_hasher",
"indicatif",
"infer",
"jsonwebtoken",
@@ -1978,6 +1990,19 @@ dependencies = [
"quick-error",
]
[[package]]
name = "image_hasher"
version = "3.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dd266c66b0a0e2d4c6db8e710663fc163a2d33595ce997b6fbda407c8759d344"
dependencies = [
"base64",
"image",
"rustdct",
"serde",
"transpose",
]
[[package]]
name = "imgref"
version = "1.11.0"
@@ -2438,6 +2463,15 @@ dependencies = [
"num-traits",
]
[[package]]
name = "num-complex"
version = "0.4.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "73f88a1307638156682bada9d7604135552957b7818057dcef22705b4d509495"
dependencies = [
"num-traits",
]
[[package]]
name = "num-conv"
version = "0.1.0"
@@ -2907,6 +2941,15 @@ version = "0.1.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c"
[[package]]
name = "primal-check"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "dc0d895b311e3af9902528fbb8f928688abbd95872819320517cc24ca6b2bd08"
dependencies = [
"num-integer",
]
[[package]]
name = "proc-macro2"
version = "1.0.101"
@@ -3286,6 +3329,29 @@ dependencies = [
"semver",
]
[[package]]
name = "rustdct"
version = "0.7.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b61555105d6a9bf98797c063c362a1d24ed8ab0431655e38f1cf51e52089551"
dependencies = [
"rustfft",
]
[[package]]
name = "rustfft"
version = "6.4.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21db5f9893e91f41798c88680037dba611ca6674703c1a18601b01a72c8adb89"
dependencies = [
"num-complex",
"num-integer",
"num-traits",
"primal-check",
"strength_reduce",
"transpose",
]
[[package]]
name = "rustix"
version = "1.0.8"
@@ -3624,6 +3690,12 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "strength_reduce"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fe895eb47f22e2ddd4dabc02bce419d2e643c8e3b585c78158b349195bc24d82"
[[package]]
name = "strfmt"
version = "0.2.5"
@@ -4122,6 +4194,22 @@ dependencies = [
"once_cell",
]
[[package]]
name = "transpose"
version = "0.2.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1ad61aed86bc3faea4300c7aee358b4c6d0c8d6ccc36524c96e4c92ccf26e77e"
dependencies = [
"num-integer",
"strength_reduce",
]
[[package]]
name = "triple_accel"
version = "0.3.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "622b09ce2fe2df4618636fb92176d205662f59803f39e70d1c333393082de96c"
[[package]]
name = "try-lock"
version = "0.2.5"

View File

@@ -59,5 +59,7 @@ ical = "0.11"
scraper = "0.20"
base64 = "0.22"
blake3 = "1.5"
image_hasher = "3.0"
bk-tree = "0.5"
async-trait = "0.1"
indicatif = "0.17"

View File

@@ -0,0 +1,8 @@
-- Down migration for duplicate detection: reverts the perceptual-hash
-- and soft-mark columns added by the paired up migration.
--
-- Indexes are dropped first — SQLite's ALTER TABLE DROP COLUMN refuses
-- to remove a column that is still referenced by an index.
DROP INDEX IF EXISTS idx_image_exif_duplicate_of_hash;
DROP INDEX IF EXISTS idx_image_exif_dhash;
DROP INDEX IF EXISTS idx_image_exif_phash;
-- Columns are removed in reverse order of creation. Any soft-mark
-- resolution state (duplicate_of_hash / duplicate_decided_at) is
-- irreversibly lost by this migration.
ALTER TABLE image_exif DROP COLUMN duplicate_decided_at;
ALTER TABLE image_exif DROP COLUMN duplicate_of_hash;
ALTER TABLE image_exif DROP COLUMN dhash_64;
ALTER TABLE image_exif DROP COLUMN phash_64;

View File

@@ -0,0 +1,41 @@
-- Adds perceptual-hash signals + soft-mark resolution state to image_exif so
-- the duplicates surface in Apollo can group near-duplicates (re-encoded,
-- resized, format-converted copies) and let the user demote losers without
-- touching the file on disk. Image-only for v1: phash_64/dhash_64 are NULL
-- on videos and on images that fail to decode. See Apollo CLAUDE.md →
-- Duplicate detection / Caching layer for the policy.
--
-- Soft-mark columns are media-type-agnostic — when video perceptual hashing
-- arrives, it lives in a separate hash-keyed companion table and reuses the
-- same duplicate_of_hash / duplicate_decided_at machinery.
-- pHash (DCT, 64-bit) packed as i64 for fast XOR + popcount Hamming.
ALTER TABLE image_exif ADD COLUMN phash_64 BIGINT;
-- dHash (gradient, 64-bit). Cheap, robust to compression/resize. Stored
-- alongside pHash so the query layer can fall back if either is null.
ALTER TABLE image_exif ADD COLUMN dhash_64 BIGINT;
-- When non-null, this row is a soft-marked duplicate of the row whose
-- content_hash matches. The duplicate file stays on disk; the default
-- /photos listing filters it out. /photos?include_duplicates=true opts
-- back in (the Apollo duplicates modal uses this).
ALTER TABLE image_exif ADD COLUMN duplicate_of_hash TEXT;
-- Unix seconds of the resolve. Distinguishes "never reviewed" from
-- "reviewed and resolved" for the Apollo include_resolved toggle.
ALTER TABLE image_exif ADD COLUMN duplicate_decided_at BIGINT;
-- Partial indexes — the columns are NULL for the vast majority of rows
-- during the transitional window and forever for videos / decode failures.
-- Each index stays proportional to the hashed subset rather than the whole
-- table. NOTE: SQLite only uses a partial index for queries whose WHERE
-- clause implies the index predicate, so reads must repeat IS NOT NULL.
CREATE INDEX idx_image_exif_phash
ON image_exif (phash_64)
WHERE phash_64 IS NOT NULL;
CREATE INDEX idx_image_exif_dhash
ON image_exif (dhash_64)
WHERE dhash_64 IS NOT NULL;
CREATE INDEX idx_image_exif_duplicate_of_hash
ON image_exif (duplicate_of_hash)
WHERE duplicate_of_hash IS NOT NULL;

View File

@@ -0,0 +1,243 @@
//! Backfill `image_exif.phash_64` + `dhash_64` for image rows that
//! were ingested before perceptual hashing was wired into the watcher.
//!
//! The watcher computes perceptual hashes for new images as they're
//! ingested, so this binary is a one-shot for the historical backlog.
//! Idempotent — only rows with a non-null content_hash and a null
//! phash are processed, so re-runs are safe and pick up where they
//! left off (e.g. after a crash or interrupt).
//!
//! Image-only by design: `get_rows_missing_perceptual_hash` filters by
//! file extension at the DB layer so videos and other non-decodable
//! media are skipped without round-tripping `image_hasher`. Files that
//! can't be opened (missing on disk, permission errors) are quietly
//! left as null and counted as "missing"; on next run, if the file is
//! restored, the row will surface again.
use std::path::Path;
use std::sync::{Arc, Mutex};
use std::time::Instant;
use clap::Parser;
use log::{error, warn};
use rayon::prelude::*;
use image_api::bin_progress;
use image_api::database::{ExifDao, SqliteExifDao, connect};
use image_api::libraries::{self, Library};
use image_api::perceptual_hash;
// CLI surface for the one-shot backfill binary. NOTE: clap derives the
// `--help` text from the `///` field comments, so they are left exactly
// as-is; commentary that must not leak into --help lives in `//` lines.
#[derive(Parser, Debug)]
#[command(name = "backfill_perceptual_hash")]
#[command(about = "Compute pHash + dHash for image_exif rows missing one")]
struct Args {
    /// Max rows to hash per batch. The process loops until no rows remain.
    #[arg(long, default_value_t = 256)]
    batch_size: i64,
    /// Rayon parallelism override. 0 uses the default thread pool size.
    #[arg(long, default_value_t = 0)]
    parallelism: usize,
    /// Dry-run: log what would be hashed without writing to the DB.
    #[arg(long)]
    dry_run: bool,
}
fn main() -> anyhow::Result<()> {
env_logger::init();
dotenv::dotenv().ok();
let args = Args::parse();
if args.parallelism > 0 {
rayon::ThreadPoolBuilder::new()
.num_threads(args.parallelism)
.build_global()
.expect("Unable to configure rayon thread pool");
}
let base_path = dotenv::var("BASE_PATH").ok();
let mut seed_conn = connect();
if let Some(base) = base_path.as_deref() {
libraries::seed_or_patch_from_env(&mut seed_conn, base);
}
let libs = libraries::load_all(&mut seed_conn);
drop(seed_conn);
if libs.is_empty() {
anyhow::bail!("No libraries configured; cannot backfill perceptual hashes");
}
let libs_by_id: std::collections::HashMap<i32, Library> =
libs.into_iter().map(|lib| (lib.id, lib)).collect();
println!(
"Configured libraries: {}",
libs_by_id
.values()
.map(|l| format!("{} -> {}", l.name, l.root_path))
.collect::<Vec<_>>()
.join(", ")
);
let dao: Arc<Mutex<Box<dyn ExifDao>>> = Arc::new(Mutex::new(Box::new(SqliteExifDao::new())));
let ctx = opentelemetry::Context::new();
let mut total_hashed = 0u64;
let mut total_missing = 0u64;
let mut total_decode_failures = 0u64;
let mut total_errors = 0u64;
let start = Instant::now();
let pb = bin_progress::spinner("perceptual-hashing");
loop {
let rows = {
let mut guard = dao.lock().expect("Unable to lock ExifDao");
guard
.get_rows_missing_perceptual_hash(&ctx, args.batch_size)
.map_err(|e| anyhow::anyhow!("DB error: {:?}", e))?
};
if rows.is_empty() {
break;
}
let batch_size = rows.len();
pb.set_message(format!(
"batch of {} (hashed={} decode_fail={} missing={} errors={})",
batch_size, total_hashed, total_decode_failures, total_missing, total_errors
));
// Compute perceptual hashes in parallel — CPU-bound, decoder
// releases the GIL-equivalent. rayon's default thread pool
// matches the host's logical-core count which is the right
// ceiling for image_hasher's DCT pass.
let results: Vec<(i32, String, FilePerceptualResult)> = rows
.into_par_iter()
.map(|(library_id, rel_path)| {
let abs = libs_by_id
.get(&library_id)
.map(|lib| Path::new(&lib.root_path).join(&rel_path));
match abs {
Some(abs_path) if abs_path.exists() => {
match perceptual_hash::compute(&abs_path) {
Some(id) => (library_id, rel_path, FilePerceptualResult::Ok(id)),
None => (library_id, rel_path, FilePerceptualResult::DecodeFailed),
}
}
Some(_) => (library_id, rel_path, FilePerceptualResult::MissingOnDisk),
None => {
warn!("Row refers to unknown library_id {}", library_id);
(library_id, rel_path, FilePerceptualResult::MissingOnDisk)
}
}
})
.collect();
// Persist sequentially — SQLite writes serialize anyway.
if !args.dry_run {
let mut guard = dao.lock().expect("Unable to lock ExifDao");
for (library_id, rel_path, result) in &results {
match result {
FilePerceptualResult::Ok(id) => {
match guard.backfill_perceptual_hash(
&ctx,
*library_id,
rel_path,
Some(id.phash_64),
Some(id.dhash_64),
) {
Ok(_) => {
total_hashed += 1;
pb.inc(1);
}
Err(e) => {
pb.println(format!("persist error for {}: {:?}", rel_path, e));
total_errors += 1;
}
}
}
FilePerceptualResult::DecodeFailed => {
// Persist phash_64=0/dhash_64=0 as a "tried,
// unhashable" sentinel so this row leaves the
// `phash_64 IS NULL` candidate set and the
// backfill doesn't infinite-loop on a queue of
// unbreakable formats (HEIC, RAW, CMYK JPEGs,
// truncated bytes). The all-zero hash is
// explicitly excluded from clustering by
// is_informative_hash in duplicates.rs, so it
// won't pollute group output — it just becomes
// invisible to the duplicate finder.
log::debug!(
"perceptual decode failed for {} (lib {}); marking unhashable",
rel_path,
library_id
);
match guard.backfill_perceptual_hash(
&ctx,
*library_id,
rel_path,
Some(0),
Some(0),
) {
Ok(_) => {
total_decode_failures += 1;
}
Err(e) => {
pb.println(format!(
"persist error (decode-fail sentinel) for {}: {:?}",
rel_path, e
));
total_errors += 1;
}
}
}
FilePerceptualResult::MissingOnDisk => {
total_missing += 1;
}
}
}
} else {
for (_, rel_path, result) in &results {
match result {
FilePerceptualResult::Ok(id) => {
pb.println(format!(
"[dry-run] {} -> phash={:016x} dhash={:016x}",
rel_path, id.phash_64, id.dhash_64
));
total_hashed += 1;
pb.inc(1);
}
FilePerceptualResult::DecodeFailed => {
total_decode_failures += 1;
}
FilePerceptualResult::MissingOnDisk => {
total_missing += 1;
}
}
}
pb.println(format!(
"[dry-run] processed one batch of {}. Stopping — a real run would continue \
until no NULL phash_64 image rows remain.",
results.len()
));
break;
}
}
pb.finish_and_clear();
println!(
"Done. hashed={}, decode_failed={}, skipped (missing on disk)={}, errors={}, elapsed={:.1}s",
total_hashed,
total_decode_failures,
total_missing,
total_errors,
start.elapsed().as_secs_f64()
);
if total_errors > 0 {
error!("Backfill completed with {} persist errors", total_errors);
}
Ok(())
}
/// Per-file outcome of one perceptual-hash attempt. Produced by the
/// parallel compute pass in `main` and consumed by the sequential
/// persist pass.
enum FilePerceptualResult {
    /// Hashing succeeded; carries both computed hashes.
    Ok(perceptual_hash::PerceptualIdentity),
    /// The file exists on disk but `perceptual_hash::compute` returned
    /// `None` (undecodable format or corrupt bytes).
    DecodeFailed,
    /// The resolved absolute path does not exist, or the row referenced
    /// a library_id with no configured library (both mapped here).
    MissingOnDisk,
}

View File

@@ -165,6 +165,15 @@ pub struct FilesRequest {
/// Optional library filter. Accepts a library id (e.g. "1") or name
/// (e.g. "main"). When omitted, results span all libraries.
pub library: Option<String>,
/// When true, include rows soft-marked as duplicates of another file
/// (i.e. `image_exif.duplicate_of_hash IS NOT NULL`). Default false —
/// the standard /photos listing hides demoted siblings so the grid
/// silently shrinks after a resolve. The Apollo duplicates modal
/// passes `true` so it can show both survivors and demoted members
/// inside a group.
#[serde(default)]
pub include_duplicates: Option<bool>,
}
#[derive(Copy, Clone, Deserialize, PartialEq, Debug)]

View File

@@ -9,6 +9,25 @@ use crate::database::models::{
};
use crate::otel::trace_db_call;
/// Wire shape for a single member of a duplicate group, returned by
/// `list_duplicates_*` and `lookup_duplicate_row`. Carries everything
/// the Apollo modal needs to render a member tile and its meta line —
/// thumbnails are derived from `(library_id, rel_path)` upstream.
/// Wire shape for a single member of a duplicate group, returned by
/// `list_duplicates_*` and `lookup_duplicate_row`. Carries everything
/// the Apollo modal needs to render a member tile and its meta line —
/// thumbnails are derived from `(library_id, rel_path)` upstream.
#[derive(Debug, Clone, serde::Serialize)]
pub struct DuplicateRow {
    /// Library this file belongs to.
    pub library_id: i32,
    /// Path relative to the library root.
    pub rel_path: String,
    /// Content hash; members of an exact-duplicate group share this value.
    pub content_hash: String,
    /// File size in bytes, when recorded.
    pub size_bytes: Option<i64>,
    /// Capture timestamp when available (assumed Unix seconds, matching
    /// duplicate_decided_at — TODO confirm against the ingest path).
    pub date_taken: Option<i64>,
    /// Pixel dimensions, when known.
    pub width: Option<i32>,
    pub height: Option<i32>,
    /// 64-bit pHash packed as i64; NULL when the row was never hashed.
    pub phash_64: Option<i64>,
    /// 64-bit dHash packed as i64; NULL when the row was never hashed.
    pub dhash_64: Option<i64>,
    /// When non-null, this row is soft-marked as a duplicate of the row
    /// whose `content_hash` equals this value.
    pub duplicate_of_hash: Option<String>,
    /// Unix seconds at which the soft-mark was written.
    pub duplicate_decided_at: Option<i64>,
}
pub mod calendar_dao;
pub mod daily_summary_dao;
pub mod insights_dao;
@@ -377,6 +396,104 @@ pub trait ExifDao: Sync + Send {
size_bytes: i64,
) -> Result<(), DbError>;
/// Return image rows that have a `content_hash` but no `phash_64`,
/// oldest first. Used by the `backfill_perceptual_hash` binary.
/// Filters by image extension at the DB layer to avoid ever asking
/// `image_hasher` to decode a video. Returns `(library_id, rel_path)`.
fn get_rows_missing_perceptual_hash(
&mut self,
context: &opentelemetry::Context,
limit: i64,
) -> Result<Vec<(i32, String)>, DbError>;
/// Persist computed perceptual hashes (pHash + dHash) for an
/// existing image_exif row. Either column may be left NULL by
/// passing `None`, but in practice the binary computes both or
/// neither — `image_hasher` either decodes the image and produces
/// both signals, or fails entirely.
fn backfill_perceptual_hash(
&mut self,
context: &opentelemetry::Context,
library_id: i32,
rel_path: &str,
phash_64: Option<i64>,
dhash_64: Option<i64>,
) -> Result<(), DbError>;
/// Group exact-hash duplicates: rows whose `content_hash` appears
/// more than once across the (optionally library-scoped) corpus.
/// Returns one [`DuplicateRow`] per member; callers group by
/// `content_hash`. When `include_resolved=false`, rows already
/// soft-marked (`duplicate_of_hash IS NOT NULL`) are excluded so
/// the modal doesn't re-surface decisions the user already made.
fn list_duplicates_exact(
&mut self,
context: &opentelemetry::Context,
library_id: Option<i32>,
include_resolved: bool,
) -> Result<Vec<DuplicateRow>, DbError>;
/// Return all rows with a non-null `phash_64` (optionally library-
/// scoped), used by the perceptual-cluster routine in
/// [`crate::main`] to single-link cluster via Hamming distance.
/// Each returned row is a *distinct content_hash* — exact duplicates
/// are collapsed at the DB layer so the in-memory clusterer doesn't
/// rediscover them.
fn list_perceptual_candidates(
&mut self,
context: &opentelemetry::Context,
library_id: Option<i32>,
include_resolved: bool,
) -> Result<Vec<DuplicateRow>, DbError>;
/// Look up a single row's metadata by `(library_id, rel_path)`. Used
/// by the resolve endpoint to map the request payload to the
/// underlying `content_hash` before writing the soft-mark. Returns
/// `Ok(None)` if the file doesn't exist in `image_exif`.
fn lookup_duplicate_row(
&mut self,
context: &opentelemetry::Context,
library_id: i32,
rel_path: &str,
) -> Result<Option<DuplicateRow>, DbError>;
/// Soft-mark a file as a duplicate of `survivor_hash`. Sets
/// `duplicate_of_hash` and `duplicate_decided_at` on the row(s)
/// matching `(library_id, rel_path)`. The file stays on disk; the
/// default `/photos` listing hides it because of the
/// `duplicate_of_hash IS NULL` filter.
fn set_duplicate_of(
&mut self,
context: &opentelemetry::Context,
library_id: i32,
rel_path: &str,
survivor_hash: &str,
decided_at: i64,
) -> Result<(), DbError>;
/// Reverse a soft-mark: clears `duplicate_of_hash` and
/// `duplicate_decided_at`. Used by the modal's UNRESOLVE chip.
fn clear_duplicate_of(
&mut self,
context: &opentelemetry::Context,
library_id: i32,
rel_path: &str,
) -> Result<(), DbError>;
/// Union the tags from `demoted_hash` onto `survivor_hash`. Used at
/// resolve time for *perceptual* duplicates (different content_hashes,
/// independent tag sets) so the user doesn't lose their tagging work
/// when promoting a survivor. Idempotent: a tag already on the survivor
/// is left alone. Exact duplicates (same content_hash) don't need this
/// because their tag rows are already shared.
fn union_perceptual_tags(
&mut self,
context: &opentelemetry::Context,
survivor_hash: &str,
demoted_hash: &str,
survivor_rel_path: &str,
) -> Result<(), DbError>;
/// Return the first EXIF row with the given content hash (any library).
/// Used by thumbnail/HLS generation to detect pre-existing derivatives
/// from another library before regenerating.
@@ -440,11 +557,17 @@ pub trait ExifDao: Sync + Send {
/// `library_ids` is empty, rows from every library are returned. Used by
/// `/photos` recursive listing to skip the filesystem walk — the watcher
/// keeps image_exif in parity with disk via the reconciliation pass.
///
/// `include_duplicates=false` filters out rows soft-marked with
/// `duplicate_of_hash IS NOT NULL` so the default photo listing hides
/// demoted siblings; the Apollo duplicates modal passes `true` to
/// see both survivors and demoted members inside a group.
fn list_rel_paths_for_libraries(
&mut self,
context: &opentelemetry::Context,
library_ids: &[i32],
path_prefix: Option<&str>,
include_duplicates: bool,
) -> Result<Vec<(i32, String)>, DbError>;
/// Delete a single image_exif row scoped to `(library_id, rel_path)`.
@@ -1077,6 +1200,7 @@ impl ExifDao for SqliteExifDao {
context: &opentelemetry::Context,
library_ids: &[i32],
path_prefix: Option<&str>,
include_duplicates: bool,
) -> Result<Vec<(i32, String)>, DbError> {
trace_db_call(context, "query", "list_rel_paths_for_libraries", |_span| {
use schema::image_exif::dsl::*;
@@ -1097,6 +1221,41 @@ impl ExifDao for SqliteExifDao {
query = query.filter(rel_path.like(pattern).escape('\\'));
}
if !include_duplicates {
if library_ids.is_empty() {
// Unscoped (all-libraries) view — every survivor is
// reachable somewhere, so a soft-marked row is
// genuinely a duplicate from the user's perspective.
// Hide it.
query = query.filter(duplicate_of_hash.is_null());
} else {
// Scoped to specific libraries: only hide a
// soft-marked row when the survivor is reachable
// *in this view*. If the survivor lives in a
// library the user can't see right now, the
// demoted file is the only copy of those bytes
// they have access to — keep it visible.
//
// Implemented as a correlated NOT EXISTS subquery
// over an aliased image_exif. Library ids are i32
// so format!-inlining the integer list is safe.
use diesel::sql_types::Bool;
let lib_list = library_ids
.iter()
.map(i32::to_string)
.collect::<Vec<_>>()
.join(",");
let raw = format!(
"(image_exif.duplicate_of_hash IS NULL OR NOT EXISTS \
(SELECT 1 FROM image_exif AS survivor \
WHERE survivor.content_hash = image_exif.duplicate_of_hash \
AND survivor.library_id IN ({})))",
lib_list
);
query = query.filter(diesel::dsl::sql::<Bool>(&raw));
}
}
query
.load::<(i32, String)>(connection.deref_mut())
.map_err(|_| anyhow::anyhow!("Query error"))
@@ -1168,6 +1327,421 @@ impl ExifDao for SqliteExifDao {
)
.map_err(|_| DbError::new(DbErrorKind::QueryError))
}
    /// Fetch up to `limit` image rows that have a content_hash but no
    /// phash_64, as `(library_id, rel_path)` pairs. Drives the backfill
    /// binary's batch loop.
    fn get_rows_missing_perceptual_hash(
        &mut self,
        context: &opentelemetry::Context,
        limit: i64,
    ) -> Result<Vec<(i32, String)>, DbError> {
        trace_db_call(
            context,
            "query",
            "get_rows_missing_perceptual_hash",
            |_span| {
                use schema::image_exif::dsl::*;
                let mut connection = self.connection.lock().expect("Unable to get ExifDao");
                // Image-only filter via extension. Videos and decode-failures
                // would always come back NULL otherwise and the binary would
                // grind through them on every run. The list mirrors the file
                // formats `image` 0.25 / `image_hasher` 3.x can decode.
                //
                // NOTE(review): SQLite's LIKE is case-insensitive for ASCII
                // by default, which makes the uppercase variants redundant —
                // they are harmless belt-and-braces and keep the list correct
                // if PRAGMA case_sensitive_like is ever enabled. Mixed-case
                // extensions (".Jpg") rely on the default-insensitive
                // behavior only — confirm the pragma is not set.
                image_exif
                    .filter(content_hash.is_not_null())
                    .filter(phash_64.is_null())
                    .filter(
                        rel_path
                            .like("%.jpg")
                            .or(rel_path.like("%.jpeg"))
                            .or(rel_path.like("%.JPG"))
                            .or(rel_path.like("%.JPEG"))
                            .or(rel_path.like("%.png"))
                            .or(rel_path.like("%.PNG"))
                            .or(rel_path.like("%.webp"))
                            .or(rel_path.like("%.WEBP"))
                            .or(rel_path.like("%.tif"))
                            .or(rel_path.like("%.tiff"))
                            .or(rel_path.like("%.TIF"))
                            .or(rel_path.like("%.TIFF"))
                            .or(rel_path.like("%.avif"))
                            .or(rel_path.like("%.AVIF")),
                    )
                    .select((library_id, rel_path))
                    // Stable oldest-first order makes batches deterministic
                    // and lets an interrupted run resume where it stopped.
                    .order(id.asc())
                    .limit(limit)
                    .load::<(i32, String)>(connection.deref_mut())
                    .map_err(|_| anyhow::anyhow!("Query error"))
            },
        )
        .map_err(|_| DbError::new(DbErrorKind::QueryError))
    }
fn backfill_perceptual_hash(
    &mut self,
    context: &opentelemetry::Context,
    library_id_val: i32,
    rel_path_val: &str,
    phash_val: Option<i64>,
    dhash_val: Option<i64>,
) -> Result<(), DbError> {
    trace_db_call(context, "update", "backfill_perceptual_hash", |_span| {
        use schema::image_exif::dsl::*;
        // Write both hash columns on the (library, path)-scoped row in a
        // single UPDATE; passing None for either argument stores NULL.
        let mut conn = self.connection.lock().expect("Unable to get ExifDao");
        let target_row = image_exif
            .filter(library_id.eq(library_id_val))
            .filter(rel_path.eq(rel_path_val));
        let assignments = (phash_64.eq(phash_val), dhash_64.eq(dhash_val));
        diesel::update(target_row)
            .set(assignments)
            .execute(conn.deref_mut())
            .map(|_| ())
            .map_err(|_| anyhow::anyhow!("Update error"))
    })
    .map_err(|_| DbError::new(DbErrorKind::UpdateError))
}
fn list_duplicates_exact(
    &mut self,
    context: &opentelemetry::Context,
    library_id_filter: Option<i32>,
    include_resolved: bool,
) -> Result<Vec<DuplicateRow>, DbError> {
    trace_db_call(context, "query", "list_duplicates_exact", |_span| {
        use schema::image_exif::dsl::*;
        // Flat row tuple in select-list order; converted to the wire
        // struct below so callers never see the raw shape.
        type Raw = (
            i32,
            String,
            String,
            Option<i64>,
            Option<i64>,
            Option<i32>,
            Option<i32>,
            Option<i64>,
            Option<i64>,
            Option<String>,
            Option<i64>,
        );
        fn as_member(r: Raw) -> DuplicateRow {
            DuplicateRow {
                library_id: r.0,
                rel_path: r.1,
                content_hash: r.2,
                size_bytes: r.3,
                date_taken: r.4,
                width: r.5,
                height: r.6,
                phash_64: r.7,
                dhash_64: r.8,
                duplicate_of_hash: r.9,
                duplicate_decided_at: r.10,
            }
        }
        let mut conn = self.connection.lock().expect("Unable to get ExifDao");
        // Phase 1: content_hashes appearing on more than one row,
        // optionally scoped to a single library.
        let mut hash_query = image_exif
            .filter(content_hash.is_not_null())
            .group_by(content_hash)
            .select(content_hash.assume_not_null())
            .having(diesel::dsl::count_star().gt(1))
            .into_boxed();
        if let Some(lib) = library_id_filter {
            hash_query = hash_query.filter(library_id.eq(lib));
        }
        let dup_hashes: Vec<String> = hash_query
            .load::<String>(conn.deref_mut())
            .map_err(|_| anyhow::anyhow!("Query error"))?;
        if dup_hashes.is_empty() {
            return Ok(Vec::new());
        }
        // Phase 2: load every member row for those hashes, ordered by
        // hash + library + path so the caller can stream-group without
        // buffering the whole dataset.
        let mut member_query = image_exif
            .filter(content_hash.eq_any(&dup_hashes))
            .select((
                library_id,
                rel_path,
                content_hash.assume_not_null(),
                size_bytes,
                date_taken,
                width,
                height,
                phash_64,
                dhash_64,
                duplicate_of_hash,
                duplicate_decided_at,
            ))
            .order((content_hash.asc(), library_id.asc(), rel_path.asc()))
            .into_boxed();
        if let Some(lib) = library_id_filter {
            member_query = member_query.filter(library_id.eq(lib));
        }
        if !include_resolved {
            // Already soft-marked rows are excluded so the modal doesn't
            // re-surface decisions the user has already made.
            member_query = member_query.filter(duplicate_of_hash.is_null());
        }
        let raw_rows: Vec<Raw> = member_query
            .load(conn.deref_mut())
            .map_err(|_| anyhow::anyhow!("Query error"))?;
        Ok(raw_rows.into_iter().map(as_member).collect())
    })
    .map_err(|_| DbError::new(DbErrorKind::QueryError))
}
fn list_perceptual_candidates(
    &mut self,
    context: &opentelemetry::Context,
    library_id_filter: Option<i32>,
    include_resolved: bool,
) -> Result<Vec<DuplicateRow>, DbError> {
    trace_db_call(context, "query", "list_perceptual_candidates", |_span| {
        use schema::image_exif::dsl::*;
        // Flat SQL tuple in select-list order.
        type Raw = (
            i32,
            String,
            String,
            Option<i64>,
            Option<i64>,
            Option<i32>,
            Option<i32>,
            Option<i64>,
            Option<i64>,
            Option<String>,
            Option<i64>,
        );
        fn as_candidate(r: Raw) -> DuplicateRow {
            DuplicateRow {
                library_id: r.0,
                rel_path: r.1,
                content_hash: r.2,
                size_bytes: r.3,
                date_taken: r.4,
                width: r.5,
                height: r.6,
                phash_64: r.7,
                dhash_64: r.8,
                duplicate_of_hash: r.9,
                duplicate_decided_at: r.10,
            }
        }
        let mut conn = self.connection.lock().expect("Unable to get ExifDao");
        // One canonical row per distinct content_hash: exact duplicates
        // are clustered by the exact-dup query and would only add
        // zero-distance edges to the perceptual graph. Diesel has no
        // clean DISTINCT ON, so every hashed row is loaded and deduped
        // client-side; the set is small (phash-bearing rows only) and
        // the cost is negligible next to the BK-tree clustering after.
        let mut candidates = image_exif
            .filter(content_hash.is_not_null())
            .filter(phash_64.is_not_null())
            .select((
                library_id,
                rel_path,
                content_hash.assume_not_null(),
                size_bytes,
                date_taken,
                width,
                height,
                phash_64,
                dhash_64,
                duplicate_of_hash,
                duplicate_decided_at,
            ))
            .order((content_hash.asc(), library_id.asc(), rel_path.asc()))
            .into_boxed();
        if let Some(lib) = library_id_filter {
            candidates = candidates.filter(library_id.eq(lib));
        }
        if !include_resolved {
            candidates = candidates.filter(duplicate_of_hash.is_null());
        }
        let raw_rows: Vec<Raw> = candidates
            .load(conn.deref_mut())
            .map_err(|_| anyhow::anyhow!("Query error"))?;
        // First occurrence per hash wins — deterministic thanks to the
        // SQL ORDER BY (lowest library_id, then smallest rel_path).
        // `HashSet::insert` returns false on repeats, dropping them.
        let mut seen_hashes = std::collections::HashSet::new();
        Ok(raw_rows
            .into_iter()
            .filter(|r| seen_hashes.insert(r.2.clone()))
            .map(as_candidate)
            .collect())
    })
    .map_err(|_| DbError::new(DbErrorKind::QueryError))
}
fn lookup_duplicate_row(
    &mut self,
    context: &opentelemetry::Context,
    library_id_val: i32,
    rel_path_val: &str,
) -> Result<Option<DuplicateRow>, DbError> {
    trace_db_call(context, "query", "lookup_duplicate_row", |_span| {
        use schema::image_exif::dsl::*;
        // Flat SQL tuple in select-list order.
        type Raw = (
            i32,
            String,
            String,
            Option<i64>,
            Option<i64>,
            Option<i32>,
            Option<i32>,
            Option<i64>,
            Option<i64>,
            Option<String>,
            Option<i64>,
        );
        fn as_duplicate_row(r: Raw) -> DuplicateRow {
            DuplicateRow {
                library_id: r.0,
                rel_path: r.1,
                content_hash: r.2,
                size_bytes: r.3,
                date_taken: r.4,
                width: r.5,
                height: r.6,
                phash_64: r.7,
                dhash_64: r.8,
                duplicate_of_hash: r.9,
                duplicate_decided_at: r.10,
            }
        }
        let mut conn = self.connection.lock().expect("Unable to get ExifDao");
        // Rows without a content_hash cannot participate in the resolve
        // flow, so they are reported as absent (`Ok(None)`).
        image_exif
            .filter(library_id.eq(library_id_val))
            .filter(rel_path.eq(rel_path_val))
            .filter(content_hash.is_not_null())
            .select((
                library_id,
                rel_path,
                content_hash.assume_not_null(),
                size_bytes,
                date_taken,
                width,
                height,
                phash_64,
                dhash_64,
                duplicate_of_hash,
                duplicate_decided_at,
            ))
            .first::<Raw>(conn.deref_mut())
            .optional()
            .map(|hit| hit.map(as_duplicate_row))
            .map_err(|_| anyhow::anyhow!("Query error"))
    })
    .map_err(|_| DbError::new(DbErrorKind::QueryError))
}
fn set_duplicate_of(
    &mut self,
    context: &opentelemetry::Context,
    library_id_val: i32,
    rel_path_val: &str,
    survivor_hash: &str,
    decided_at: i64,
) -> Result<(), DbError> {
    trace_db_call(context, "update", "set_duplicate_of", |_span| {
        use schema::image_exif::dsl::*;
        // Write the soft-mark pair (survivor pointer + resolve timestamp)
        // on the (library, path)-scoped row in one UPDATE statement.
        let mut conn = self.connection.lock().expect("Unable to get ExifDao");
        let target_row = image_exif
            .filter(library_id.eq(library_id_val))
            .filter(rel_path.eq(rel_path_val));
        let soft_mark = (
            duplicate_of_hash.eq(survivor_hash),
            duplicate_decided_at.eq(decided_at),
        );
        diesel::update(target_row)
            .set(soft_mark)
            .execute(conn.deref_mut())
            .map(|_| ())
            .map_err(|_| anyhow::anyhow!("Update error"))
    })
    .map_err(|_| DbError::new(DbErrorKind::UpdateError))
}
fn clear_duplicate_of(
    &mut self,
    context: &opentelemetry::Context,
    library_id_val: i32,
    rel_path_val: &str,
) -> Result<(), DbError> {
    trace_db_call(context, "update", "clear_duplicate_of", |_span| {
        use schema::image_exif::dsl::*;
        // Null out both soft-mark columns together so the row returns to
        // the "never reviewed" state.
        let mut conn = self.connection.lock().expect("Unable to get ExifDao");
        let target_row = image_exif
            .filter(library_id.eq(library_id_val))
            .filter(rel_path.eq(rel_path_val));
        let cleared = (
            duplicate_of_hash.eq::<Option<String>>(None),
            duplicate_decided_at.eq::<Option<i64>>(None),
        );
        diesel::update(target_row)
            .set(cleared)
            .execute(conn.deref_mut())
            .map(|_| ())
            .map_err(|_| anyhow::anyhow!("Update error"))
    })
    .map_err(|_| DbError::new(DbErrorKind::UpdateError))
}
fn union_perceptual_tags(
    &mut self,
    context: &opentelemetry::Context,
    survivor_hash: &str,
    demoted_hash: &str,
    survivor_rel_path: &str,
) -> Result<(), DbError> {
    trace_db_call(context, "update", "union_perceptual_tags", |_span| {
        // INSERT OR IGNORE handles two relevant uniqueness paths:
        // - tagged_photo (rel_path, tag_id) is the historical key,
        //   so existing tag rows under the survivor's path collide
        //   and stay put.
        // - The (rel_path, tag_id) collision is the one that
        //   matters for idempotence; (content_hash, tag_id) at the
        //   bytes level isn't enforced by SQLite but the read path
        //   dedups on it, so an extra row would be cosmetic.
        // Tags whose rel_path differs are inserted, picking up the
        // survivor's content_hash so they live under the right bytes.
        let mut connection = self.connection.lock().expect("Unable to get ExifDao");
        diesel::sql_query(
            "INSERT OR IGNORE INTO tagged_photo (rel_path, tag_id, created_time, content_hash) \
             SELECT ?, tag_id, strftime('%s','now'), ? \
             FROM tagged_photo \
             WHERE content_hash = ? \
             AND tag_id NOT IN ( \
                SELECT tag_id FROM tagged_photo WHERE content_hash = ? \
             )",
        )
        // Bind order matches the four '?' placeholders in sequence:
        //   1. survivor_rel_path — rel_path written on the new tag rows
        //   2. survivor_hash     — content_hash written on the new rows
        //   3. demoted_hash      — source rows whose tags are copied
        //   4. survivor_hash     — tags the survivor already carries
        .bind::<diesel::sql_types::Text, _>(survivor_rel_path)
        .bind::<diesel::sql_types::Text, _>(survivor_hash)
        .bind::<diesel::sql_types::Text, _>(demoted_hash)
        .bind::<diesel::sql_types::Text, _>(survivor_hash)
        .execute(connection.deref_mut())
        .map(|_| ())
        .map_err(|_| anyhow::anyhow!("Tag union error"))
    })
    .map_err(|_| DbError::new(DbErrorKind::UpdateError))
}
}
#[cfg(test)]
@@ -1204,6 +1778,8 @@ mod exif_dao_tests {
last_modified: 0,
content_hash: None,
size_bytes: None,
phash_64: None,
dhash_64: None,
},
)
.expect("insert exif row");

View File

@@ -59,6 +59,10 @@ pub struct InsertImageExif {
pub last_modified: i64,
pub content_hash: Option<String>,
pub size_bytes: Option<i64>,
/// 64-bit pHash (DCT) packed as i64. NULL for videos and decode failures.
pub phash_64: Option<i64>,
/// 64-bit dHash (gradient). NULL for videos and decode failures.
pub dhash_64: Option<i64>,
}
// Field order matches the post-migration column order in `image_exif`.
@@ -86,6 +90,14 @@ pub struct ImageExif {
pub last_modified: i64,
pub content_hash: Option<String>,
pub size_bytes: Option<i64>,
pub phash_64: Option<i64>,
pub dhash_64: Option<i64>,
/// When non-null, this row is a soft-marked duplicate of the file
/// whose `content_hash` matches this value. The default `/photos`
/// listing filters such rows out.
pub duplicate_of_hash: Option<String>,
/// Unix seconds at which the resolve was committed.
pub duplicate_decided_at: Option<i64>,
}
#[derive(Insertable)]

View File

@@ -121,6 +121,10 @@ diesel::table! {
last_modified -> BigInt,
content_hash -> Nullable<Text>,
size_bytes -> Nullable<BigInt>,
phash_64 -> Nullable<BigInt>,
dhash_64 -> Nullable<BigInt>,
duplicate_of_hash -> Nullable<Text>,
duplicate_decided_at -> Nullable<BigInt>,
}
}

893
src/duplicates.rs Normal file
View File

@@ -0,0 +1,893 @@
//! Duplicate detection surface — exact (blake3) and perceptual
//! (pHash + Hamming) groups, plus the soft-mark resolve flow that
//! Apollo's DUPLICATES modal drives.
//!
//! All routes require auth (Claims). Endpoints:
//!
//! - `GET /duplicates/exact?library=&include_resolved=` — count>1 byte-identical groups.
//! - `GET /duplicates/perceptual?library=&threshold=&include_resolved=` — Hamming-clustered groups.
//! - `POST /duplicates/resolve` — soft-mark demoted siblings.
//! - `POST /duplicates/unresolve` — clear a prior soft-mark.
//!
//! Perceptual clustering caches the BK-tree result for 5 minutes so
//! repeated opens of the modal don't re-cluster the whole library.
//! Cache invalidation is best-effort: resolve/unresolve clear the
//! cache, but new files arriving via the watcher don't (the next
//! 5-minute window picks them up). For a single-user personal tool
//! that's the right trade-off.
use std::collections::HashMap;
use std::sync::Mutex;
use std::time::{Duration, Instant};
use actix_web::{App, HttpRequest, HttpResponse, Responder, dev::ServiceFactory, web};
use bk_tree::{BKTree, Metric};
use lazy_static::lazy_static;
use opentelemetry::trace::{TraceContextExt, Tracer};
use serde::{Deserialize, Serialize};
use crate::data::Claims;
use crate::database::{DuplicateRow, ExifDao};
use crate::libraries;
use crate::otel::{extract_context_from_request, global_tracer};
use crate::state::AppState;
// ── Cache ────────────────────────────────────────────────────────────────
/// How long a cached perceptual clustering result is served before the
/// next request recomputes it (see the module docs on invalidation).
const PERCEPTUAL_CACHE_TTL: Duration = Duration::from_secs(300);
#[derive(Clone)]
struct PerceptualCacheEntry {
    /// Cache key: (library_id, threshold, include_resolved). `library_id`
    /// is `None` for "all libraries". Cluster output is the same shape we
    /// return on the wire so we can serve cached requests with zero work.
    library_id: Option<i32>,
    threshold: u32,
    include_resolved: bool,
    /// Insertion time; entries older than `PERCEPTUAL_CACHE_TTL` count
    /// as misses and are recomputed.
    computed_at: Instant,
    /// Fully-clustered response payload, cloned straight onto the wire.
    groups: Vec<DuplicateGroup>,
}
lazy_static! {
static ref PERCEPTUAL_CACHE: Mutex<Option<PerceptualCacheEntry>> = Mutex::new(None);
}
/// Drop the perceptual-cluster cache so the next modal open reflects a
/// soft-mark change immediately; called from `resolve`/`unresolve`.
fn invalidate_perceptual_cache() {
    match PERCEPTUAL_CACHE.lock() {
        Ok(mut cached) => *cached = None,
        // A poisoned lock means a writer panicked mid-update; the stale
        // entry will age out via the TTL anyway, so ignore the error.
        Err(_) => {}
    }
}
// ── Wire shapes ──────────────────────────────────────────────────────────
/// One file inside a duplicate group, as serialized to the client.
/// A field-for-field projection of the database's `DuplicateRow`.
#[derive(Serialize, Debug, Clone)]
pub struct DuplicateMember {
    pub library_id: i32,
    pub rel_path: String,
    /// Exact-bytes hash of the file (blake3 — see the upload path).
    pub content_hash: String,
    pub size_bytes: Option<i64>,
    pub date_taken: Option<i64>,
    pub width: Option<i32>,
    pub height: Option<i32>,
    /// Set when this member is soft-marked as a duplicate of the file
    /// whose `content_hash` equals this value.
    pub duplicate_of_hash: Option<String>,
    /// Unix seconds at which the soft-mark was committed.
    pub duplicate_decided_at: Option<i64>,
}
impl From<DuplicateRow> for DuplicateMember {
fn from(r: DuplicateRow) -> Self {
Self {
library_id: r.library_id,
rel_path: r.rel_path,
content_hash: r.content_hash,
size_bytes: r.size_bytes,
date_taken: r.date_taken,
width: r.width,
height: r.height,
duplicate_of_hash: r.duplicate_of_hash,
duplicate_decided_at: r.duplicate_decided_at,
}
}
}
/// How a group was formed; serialized lowercase ("exact"/"perceptual").
#[derive(Serialize, Debug, Clone)]
#[serde(rename_all = "lowercase")]
pub enum DuplicateKind {
    /// Byte-identical members (same `content_hash`).
    Exact,
    /// Hamming-close perceptual hashes (see `cluster_perceptual`).
    Perceptual,
}
/// One cluster of duplicates returned by either listing endpoint.
#[derive(Serialize, Debug, Clone)]
pub struct DuplicateGroup {
    /// How this group was formed (exact vs perceptual).
    pub kind: DuplicateKind,
    /// Representative content_hash. For exact groups, the shared hash
    /// (every member has the same one). For perceptual groups, an
    /// arbitrary cluster member's hash, used only as a stable id for
    /// the UI to key off.
    pub representative_hash: String,
    /// Members of the group; always at least two (smaller groups are
    /// filtered out by the grouping/refinement passes).
    pub members: Vec<DuplicateMember>,
}
/// Query string shared by both GET listing endpoints.
#[derive(Deserialize, Debug)]
pub struct ListDuplicatesQuery {
    /// Optional library selector; absent means all libraries.
    pub library: Option<String>,
    /// Include rows already soft-marked as duplicates (default false).
    #[serde(default)]
    pub include_resolved: Option<bool>,
    /// Perceptual only — Hamming-distance threshold. Ignored on the
    /// exact endpoint. Defaults to 8 (~12% similarity tolerance, the
    /// sweet spot for resized/recompressed copies).
    #[serde(default)]
    pub threshold: Option<u32>,
}
/// (library_id, rel_path) reference to a single file in a group.
#[derive(Deserialize, Debug)]
pub struct DuplicateMemberRef {
    pub library_id: i32,
    pub rel_path: String,
}
/// Body of `POST /duplicates/resolve`: keep `survivor`, soft-mark
/// every entry in `demoted`.
#[derive(Deserialize, Debug)]
pub struct ResolveDuplicatesReq {
    pub survivor: DuplicateMemberRef,
    pub demoted: Vec<DuplicateMemberRef>,
}
/// Response of `POST /duplicates/resolve`.
#[derive(Serialize, Debug)]
pub struct ResolveResponse {
    /// Rows actually soft-marked; unknown or self-referencing demoted
    /// entries are skipped and not counted.
    pub resolved_count: usize,
}
/// Body of `POST /duplicates/unresolve`: the row whose soft-mark
/// should be cleared.
#[derive(Deserialize, Debug)]
pub struct UnresolveDuplicateReq {
    pub library_id: i32,
    pub rel_path: String,
}
// ── Handlers ─────────────────────────────────────────────────────────────
/// `GET /duplicates/exact` — groups of byte-identical files (shared
/// `content_hash`) with more than one member. `threshold` is ignored
/// here. Responds 500 on DAO failure, otherwise 200 with
/// `{ "groups": [...] }`.
async fn list_exact_handler(
    _: Claims,
    request: HttpRequest,
    app_state: web::Data<AppState>,
    query: web::Query<ListDuplicatesQuery>,
    exif_dao: web::Data<Mutex<Box<dyn ExifDao>>>,
) -> impl Responder {
    let context = extract_context_from_request(&request);
    let span = global_tracer().start_with_context("duplicates.list_exact", &context);
    let span_context = opentelemetry::Context::current_with_span(span);
    // NOTE(review): `.ok().flatten()` treats a failed library lookup
    // the same as "no library filter", so an invalid `library` param
    // silently widens the scan to all libraries — confirm intended.
    let library_id = libraries::resolve_library_param(&app_state, query.library.as_deref())
        .ok()
        .flatten()
        .map(|l| l.id);
    let include_resolved = query.include_resolved.unwrap_or(false);
    // Hold the DAO lock only for the query, not for grouping.
    let rows = {
        let mut dao = exif_dao.lock().expect("exif dao lock");
        match dao.list_duplicates_exact(&span_context, library_id, include_resolved) {
            Ok(rows) => rows,
            Err(e) => {
                return HttpResponse::InternalServerError().body(format!("{:?}", e));
            }
        }
    };
    let groups = group_exact(rows);
    HttpResponse::Ok().json(GroupsResponse { groups })
}
/// `GET /duplicates/perceptual` — Hamming-clustered near-duplicate
/// groups. `threshold` defaults to 8 and is clamped to 0..=32; results
/// are served from a single-entry, 5-minute cache keyed on
/// (library, threshold, include_resolved).
async fn list_perceptual_handler(
    _: Claims,
    request: HttpRequest,
    app_state: web::Data<AppState>,
    query: web::Query<ListDuplicatesQuery>,
    exif_dao: web::Data<Mutex<Box<dyn ExifDao>>>,
) -> impl Responder {
    let context = extract_context_from_request(&request);
    let span = global_tracer().start_with_context("duplicates.list_perceptual", &context);
    let span_context = opentelemetry::Context::current_with_span(span);
    // NOTE(review): as on the exact endpoint, a failed library lookup
    // silently widens the scan to all libraries.
    let library_id = libraries::resolve_library_param(&app_state, query.library.as_deref())
        .ok()
        .flatten()
        .map(|l| l.id);
    let threshold = query.threshold.unwrap_or(8).clamp(0, 32);
    let include_resolved = query.include_resolved.unwrap_or(false);
    // Cache hit?
    if let Ok(guard) = PERCEPTUAL_CACHE.lock()
        && let Some(entry) = guard.as_ref()
        && entry.library_id == library_id
        && entry.threshold == threshold
        && entry.include_resolved == include_resolved
        && entry.computed_at.elapsed() < PERCEPTUAL_CACHE_TTL
    {
        return HttpResponse::Ok().json(GroupsResponse {
            groups: entry.groups.clone(),
        });
    }
    let rows = {
        let mut dao = exif_dao.lock().expect("exif dao lock");
        match dao.list_perceptual_candidates(&span_context, library_id, include_resolved) {
            Ok(rows) => rows,
            Err(e) => {
                return HttpResponse::InternalServerError().body(format!("{:?}", e));
            }
        }
    };
    let groups = cluster_perceptual(rows, threshold);
    // Two concurrent misses can both cluster and race this write;
    // last writer wins, which is harmless for a single-entry cache.
    if let Ok(mut guard) = PERCEPTUAL_CACHE.lock() {
        *guard = Some(PerceptualCacheEntry {
            library_id,
            threshold,
            include_resolved,
            computed_at: Instant::now(),
            groups: groups.clone(),
        });
    }
    HttpResponse::Ok().json(GroupsResponse { groups })
}
/// `POST /duplicates/resolve` — soft-mark each `demoted` row as a
/// duplicate of the `survivor`'s content hash.
///
/// Not transactional: demoted rows are processed independently, so a
/// mid-loop DAO failure returns 500 with earlier rows already marked.
/// Unknown demoted refs and self-references are skipped and excluded
/// from `resolved_count`.
async fn resolve_handler(
    _: Claims,
    request: HttpRequest,
    body: web::Json<ResolveDuplicatesReq>,
    exif_dao: web::Data<Mutex<Box<dyn ExifDao>>>,
) -> impl Responder {
    let context = extract_context_from_request(&request);
    let span = global_tracer().start_with_context("duplicates.resolve", &context);
    let span_context = opentelemetry::Context::current_with_span(span);
    if body.demoted.is_empty() {
        return HttpResponse::BadRequest().body("demoted list is empty");
    }
    // The DAO stays locked for the whole loop; every read and write
    // below goes through this handle.
    let mut dao = exif_dao.lock().expect("exif dao lock");
    // Resolve survivor → its content_hash, plus the canonical rel_path
    // we'll use as the destination for any tag-union INSERTs.
    let survivor = match dao.lookup_duplicate_row(
        &span_context,
        body.survivor.library_id,
        &body.survivor.rel_path,
    ) {
        Ok(Some(row)) => row,
        Ok(None) => return HttpResponse::NotFound().body("survivor not found"),
        Err(e) => return HttpResponse::InternalServerError().body(format!("{:?}", e)),
    };
    // Survivor must not itself be soft-marked — otherwise the modal is
    // pointing at a row we've already demoted, which would create a chain.
    if survivor.duplicate_of_hash.is_some() {
        return HttpResponse::Conflict().body("survivor is itself soft-marked as a duplicate");
    }
    let now = chrono::Utc::now().timestamp();
    let mut resolved_count = 0usize;
    for member_ref in &body.demoted {
        let demoted = match dao.lookup_duplicate_row(
            &span_context,
            member_ref.library_id,
            &member_ref.rel_path,
        ) {
            Ok(Some(row)) => row,
            Ok(None) => {
                log::warn!(
                    "duplicates.resolve: skipping unknown demoted ({}, {})",
                    member_ref.library_id,
                    member_ref.rel_path
                );
                continue;
            }
            Err(e) => {
                return HttpResponse::InternalServerError().body(format!("{:?}", e));
            }
        };
        // Survivor and demoted must not be the same row (would set
        // duplicate_of_hash to its own hash — recursive nonsense).
        if demoted.library_id == survivor.library_id && demoted.rel_path == survivor.rel_path {
            continue;
        }
        // For perceptual dups (different content_hash), union the
        // demoted's tag set onto the survivor before flipping the
        // soft-mark. For exact dups (same content_hash), tags are
        // already shared at the bytes layer — the union is a no-op.
        if demoted.content_hash != survivor.content_hash
            && let Err(e) = dao.union_perceptual_tags(
                &span_context,
                &survivor.content_hash,
                &demoted.content_hash,
                &survivor.rel_path,
            )
        {
            log::warn!(
                "duplicates.resolve: tag union failed for {}: {:?}",
                demoted.rel_path,
                e
            );
            // Continue with the soft-mark anyway — losing tag
            // continuity is recoverable (unresolve restores the
            // demoted row's grid presence, and the original tags
            // never moved off the demoted hash).
        }
        if let Err(e) = dao.set_duplicate_of(
            &span_context,
            demoted.library_id,
            &demoted.rel_path,
            &survivor.content_hash,
            now,
        ) {
            return HttpResponse::InternalServerError().body(format!("{:?}", e));
        }
        resolved_count += 1;
    }
    // Release the DAO before taking the cache lock.
    drop(dao);
    invalidate_perceptual_cache();
    HttpResponse::Ok().json(ResolveResponse { resolved_count })
}
/// `POST /duplicates/unresolve` — clear a prior soft-mark so the row
/// reappears in the grid and in duplicate groups.
async fn unresolve_handler(
    _: Claims,
    request: HttpRequest,
    body: web::Json<UnresolveDuplicateReq>,
    exif_dao: web::Data<Mutex<Box<dyn ExifDao>>>,
) -> impl Responder {
    let context = extract_context_from_request(&request);
    let span = global_tracer().start_with_context("duplicates.unresolve", &context);
    let span_context = opentelemetry::Context::current_with_span(span);
    // Scope the DAO lock to the single write.
    let cleared = {
        let mut dao = exif_dao.lock().expect("exif dao lock");
        dao.clear_duplicate_of(&span_context, body.library_id, &body.rel_path)
    };
    match cleared {
        Err(e) => HttpResponse::InternalServerError().body(format!("{:?}", e)),
        Ok(_) => {
            // The cached clustering may still hide the restored row.
            invalidate_perceptual_cache();
            HttpResponse::Ok().finish()
        }
    }
}
// ── Grouping / clustering ────────────────────────────────────────────────
/// Wire envelope for both listing endpoints: `{ "groups": [...] }`.
#[derive(Serialize, Debug)]
struct GroupsResponse {
    groups: Vec<DuplicateGroup>,
}
/// Collapse rows into exact-duplicate groups: one group per
/// `content_hash` shared by two or more rows, ordered by descending
/// member count and then by representative hash for determinism.
fn group_exact(rows: Vec<DuplicateRow>) -> Vec<DuplicateGroup> {
    // Bucket every row under its content hash.
    let mut buckets: HashMap<String, Vec<DuplicateRow>> = HashMap::new();
    for row in rows {
        let key = row.content_hash.clone();
        buckets.entry(key).or_default().push(row);
    }
    // Only hashes held by at least two rows form a group.
    let mut groups = Vec::new();
    for (hash, members) in buckets {
        if members.len() < 2 {
            continue;
        }
        groups.push(DuplicateGroup {
            kind: DuplicateKind::Exact,
            representative_hash: hash,
            members: members.into_iter().map(DuplicateMember::from).collect(),
        });
    }
    // Largest groups first (most reward per click), then deterministic.
    groups.sort_by(|a, b| {
        b.members
            .len()
            .cmp(&a.members.len())
            .then_with(|| a.representative_hash.cmp(&b.representative_hash))
    });
    groups
}
/// Popcount band for a "useful" perceptual hash. Real photographic
/// content yields roughly balanced bit distributions; hashes outside
/// the [16, 48] band come from low-entropy structure (uniform skies,
/// black frames, monochrome scans, faded film) where pHash collapses
/// to near-uniform values that land Hamming-close to hundreds of
/// unrelated images. The 8/56 band that shipped first was too
/// permissive — even at threshold=4 the false-positive cluster
/// persisted.
const MIN_INFORMATIVE_POPCOUNT: u32 = 16;
const MAX_INFORMATIVE_POPCOUNT: u32 = 64 - MIN_INFORMATIVE_POPCOUNT;
/// True when `h`'s set-bit count sits inside the informative band.
#[inline]
fn is_informative_hash(h: i64) -> bool {
    let ones = (h as u64).count_ones();
    MIN_INFORMATIVE_POPCOUNT <= ones && ones <= MAX_INFORMATIVE_POPCOUNT
}
/// dHash validation budget derived from the pHash threshold. pHash is
/// the candidate-discovery signal (BK-tree neighbourhood lookup);
/// dHash is the validation signal that must independently agree
/// before a pair is unioned. Giving dHash roughly half the pHash
/// budget lets real near-dups (close on both signals) survive while
/// incidental pHash collisions on uniform content get vetoed.
///
/// Floored at 2 so threshold=4 still tolerates a 1-bit dHash jitter —
/// genuine resampling can flip a low-frequency gradient bit even when
/// the visual content is identical.
#[inline]
fn dhash_threshold(phash_threshold: u32) -> u32 {
    let half = phash_threshold / 2;
    if half < 2 { 2 } else { half }
}
/// Single-link cluster the input rows by Hamming distance over their
/// pHash, with `threshold` as the maximum distance for an edge. Rows
/// without a pHash, or with a degenerate (low-entropy) pHash, are
/// excluded — they'd chain together unrelated images.
///
/// Two-signal validation: the BK-tree gives candidate pairs cheaply,
/// then we additionally require dHash agreement before unioning. pHash
/// alone is too permissive; pairing it with dHash collapses the false-
/// positive cluster significantly (different DCT vs gradient
/// signatures on real near-dups still both stay close, but spurious
/// pHash collisions on uniform images don't survive the dHash check).
///
/// Implementation: BK-tree neighbourhood lookup per row, union-find
/// over the validated edges. O(N log N) instead of the O(N²) naive
/// pairwise scan; on a 1.26M-row library that's the difference between
/// "responds in 1.5 s" and "responds in 25 minutes".
fn cluster_perceptual(rows: Vec<DuplicateRow>, threshold: u32) -> Vec<DuplicateGroup> {
    let candidates: Vec<DuplicateRow> = rows
        .into_iter()
        .filter(|r| r.phash_64.is_some_and(is_informative_hash))
        .collect();
    if candidates.len() < 2 {
        return Vec::new();
    }
    // Build BK-tree keyed on (phash_u64, index-in-candidates).
    let mut tree: BKTree<HashKey, HammingMetric> = BKTree::new(HammingMetric);
    for (idx, row) in candidates.iter().enumerate() {
        if let Some(p) = row.phash_64 {
            tree.add(HashKey {
                phash: p as u64,
                idx,
            });
        }
    }
    // Union-find over edges within `threshold`. For a candidate pair
    // surfaced by the pHash BK-tree, require dHash within a *stricter*
    // threshold (`dhash_threshold(threshold)`) before unioning. pHash
    // agreement on low-entropy structure can be incidental; pHash
    // agreement AND dHash within roughly half that distance is a
    // strong near-dup signal. dHash on either side missing → reject
    // (was: trust pHash alone). Missing dHash means we can't validate
    // the candidate, and the false-positive cost outweighs the rare
    // case of a partial backfill.
    let dhash_max = dhash_threshold(threshold);
    let mut uf = UnionFind::new(candidates.len());
    for (idx, row) in candidates.iter().enumerate() {
        let Some(p) = row.phash_64 else { continue };
        let key = HashKey {
            phash: p as u64,
            idx,
        };
        for (_, neighbour) in tree.find(&key, threshold) {
            if neighbour.idx == idx {
                continue;
            }
            let other = &candidates[neighbour.idx];
            let dhash_ok = match (row.dhash_64, other.dhash_64) {
                (Some(a), Some(b)) => {
                    (a as u64 ^ b as u64).count_ones() <= dhash_max
                        && is_informative_hash(a)
                        && is_informative_hash(b)
                }
                _ => false,
            };
            if dhash_ok {
                uf.union(idx, neighbour.idx);
            }
        }
    }
    // Bucket by root.
    let mut by_root: HashMap<usize, Vec<DuplicateRow>> = HashMap::new();
    for (idx, row) in candidates.into_iter().enumerate() {
        let root = uf.find(idx);
        by_root.entry(root).or_default().push(row);
    }
    // Medoid-validate each cluster to break single-link chains.
    // Single-link unions any pair within threshold; that means a chain
    // A↔B↔C can collapse into one cluster even when A and C aren't
    // similar. The medoid pass picks the cluster's most-central member
    // and drops any other whose distance to it exceeds threshold —
    // chains lose their tail, dense real-near-dup clusters keep all
    // members. Discard clusters that drop below 2 after refinement.
    // (Bound mutably at the collect — the previous immutable binding
    // followed by `let mut groups = groups;` was a redundant rebind.)
    let mut groups: Vec<DuplicateGroup> = by_root
        .into_values()
        .filter_map(|cluster| refine_cluster(cluster, threshold, dhash_max))
        .map(|cluster| {
            let representative_hash = cluster[0].content_hash.clone();
            DuplicateGroup {
                kind: DuplicateKind::Perceptual,
                representative_hash,
                members: cluster.into_iter().map(DuplicateMember::from).collect(),
            }
        })
        .collect();
    groups.sort_by(|a, b| {
        b.members
            .len()
            .cmp(&a.members.len())
            .then_with(|| a.representative_hash.cmp(&b.representative_hash))
    });
    groups
}
/// Tighten a single-link cluster to its medoid neighbourhood. Returns
/// `None` when fewer than 2 members survive — caller drops the cluster.
///
/// `phash_max`/`dhash_max` are the same budgets used for edge
/// validation upstream; here they bound each member's distance to the
/// medoid instead of to an arbitrary neighbour.
fn refine_cluster(
    cluster: Vec<DuplicateRow>,
    phash_max: u32,
    dhash_max: u32,
) -> Option<Vec<DuplicateRow>> {
    if cluster.len() < 2 {
        return None;
    }
    if cluster.len() == 2 {
        // No chain can exist with only two members; the union-find
        // already guaranteed both signals validated when joining.
        return Some(cluster);
    }
    // Hashes as u64 for XOR/popcount. `unwrap_or(0)` on pHash is a
    // formality (members passed the informative-pHash filter), but a
    // genuinely missing dHash scores as 0 here.
    let phashes: Vec<u64> = cluster
        .iter()
        .map(|r| r.phash_64.unwrap_or(0) as u64)
        .collect();
    let dhashes: Vec<u64> = cluster
        .iter()
        .map(|r| r.dhash_64.unwrap_or(0) as u64)
        .collect();
    // Pick the medoid: the member whose summed pHash+dHash distance to
    // the rest of the cluster is smallest. Ties break first-best-wins,
    // i.e. the earliest member in the cluster's iteration order.
    // NOTE(review): that order derives from the caller's candidate
    // ordering, not from content_hash — confirm whether stronger
    // cross-run determinism is needed here.
    let mut best_idx = 0usize;
    let mut best_score = u32::MAX;
    for i in 0..cluster.len() {
        let mut score: u32 = 0;
        for j in 0..cluster.len() {
            if i == j {
                continue;
            }
            score = score.saturating_add((phashes[i] ^ phashes[j]).count_ones());
            score = score.saturating_add((dhashes[i] ^ dhashes[j]).count_ones());
        }
        if score < best_score {
            best_score = score;
            best_idx = i;
        }
    }
    let medoid_phash = phashes[best_idx];
    let medoid_dhash = dhashes[best_idx];
    // Keep the medoid plus every member within both budgets of it.
    let kept: Vec<DuplicateRow> = cluster
        .into_iter()
        .enumerate()
        .filter(|(i, _)| {
            *i == best_idx
                || ((phashes[*i] ^ medoid_phash).count_ones() <= phash_max
                    && (dhashes[*i] ^ medoid_dhash).count_ones() <= dhash_max)
        })
        .map(|(_, r)| r)
        .collect();
    if kept.len() < 2 { None } else { Some(kept) }
}
/// BK-tree key: the packed pHash plus the row's index into the
/// `candidates` vec, so a tree hit maps back to its row and a lookup
/// can skip the query row itself.
#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)]
struct HashKey {
    phash: u64,
    idx: usize,
}
/// XOR-popcount Hamming distance over the packed 64-bit pHash.
struct HammingMetric;
impl Metric<HashKey> for HammingMetric {
    /// Number of differing bits between the two hashes (0..=64).
    fn distance(&self, a: &HashKey, b: &HashKey) -> u32 {
        (a.phash ^ b.phash).count_ones()
    }
    /// Honour the trait contract: return `None` when the distance
    /// exceeds `threshold`, letting the BK-tree prune. The previous
    /// impl returned `Some(distance)` unconditionally, ignoring the
    /// threshold parameter and forfeiting that optimisation hook.
    fn threshold_distance(&self, a: &HashKey, b: &HashKey, threshold: u32) -> Option<u32> {
        let d = self.distance(a, b);
        if d <= threshold { Some(d) } else { None }
    }
}
/// Disjoint-set forest (union-by-rank + path compression) used to
/// merge validated near-duplicate pairs into clusters.
struct UnionFind {
    parent: Vec<usize>,
    rank: Vec<u8>,
}
impl UnionFind {
    /// `n` singleton sets, each element its own parent.
    fn new(n: usize) -> Self {
        Self {
            parent: (0..n).collect(),
            rank: vec![0; n],
        }
    }
    /// Root of `x`'s set. Iterative two-pass compression: locate the
    /// root, then repoint every node on the walk directly at it —
    /// same effect as the recursive form, without the call stack.
    fn find(&mut self, x: usize) -> usize {
        let mut root = x;
        while self.parent[root] != root {
            root = self.parent[root];
        }
        let mut node = x;
        while self.parent[node] != root {
            let next = self.parent[node];
            self.parent[node] = root;
            node = next;
        }
        root
    }
    /// Merge the sets containing `a` and `b`, attaching the shallower
    /// tree under the deeper one so find paths stay short.
    fn union(&mut self, a: usize, b: usize) {
        let root_a = self.find(a);
        let root_b = self.find(b);
        if root_a == root_b {
            return;
        }
        if self.rank[root_a] < self.rank[root_b] {
            self.parent[root_a] = root_b;
        } else {
            if self.rank[root_a] == self.rank[root_b] {
                self.rank[root_a] += 1;
            }
            self.parent[root_b] = root_a;
        }
    }
}
// ── Routing ──────────────────────────────────────────────────────────────
/// Register the four duplicate-detection routes on `app`. The generic
/// bound mirrors the other `add_*_services` feature hooks so this can
/// be chained from `main`'s app builder.
pub fn add_duplicate_services<T>(app: App<T>) -> App<T>
where
    T: ServiceFactory<
        actix_web::dev::ServiceRequest,
        Config = (),
        Error = actix_web::Error,
        InitError = (),
    >,
{
    app.service(web::resource("/duplicates/exact").route(web::get().to(list_exact_handler)))
        .service(
            web::resource("/duplicates/perceptual").route(web::get().to(list_perceptual_handler)),
        )
        .service(web::resource("/duplicates/resolve").route(web::post().to(resolve_handler)))
        .service(web::resource("/duplicates/unresolve").route(web::post().to(unresolve_handler)))
}
// ── Tests ────────────────────────────────────────────────────────────────
#[cfg(test)]
mod tests {
    use super::*;
    /// Minimal row with no dHash — sufficient for the exact-grouping
    /// and entropy-filter tests.
    fn row(library_id: i32, rel: &str, hash: &str, phash: Option<i64>) -> DuplicateRow {
        DuplicateRow {
            library_id,
            rel_path: rel.into(),
            content_hash: hash.into(),
            size_bytes: Some(1000),
            date_taken: None,
            width: None,
            height: None,
            phash_64: phash,
            dhash_64: None,
            duplicate_of_hash: None,
            duplicate_decided_at: None,
        }
    }
    #[test]
    fn group_exact_collapses_by_hash() {
        let rows = vec![
            row(1, "a.jpg", "h1", None),
            row(1, "b.jpg", "h1", None),
            row(2, "c.jpg", "h1", None),
            row(1, "lonely.jpg", "h2", None),
        ];
        let groups = group_exact(rows);
        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].representative_hash, "h1");
        assert_eq!(groups[0].members.len(), 3);
    }
    /// All hashes used below have popcount in the "informative"
    /// 16..=48 band (`MIN_INFORMATIVE_POPCOUNT`..=`MAX_INFORMATIVE_POPCOUNT`)
    /// so they survive the entropy filter that keeps solid-colour
    /// images out of the cluster graph. (A previous comment said
    /// 8..=56 — that was the first-shipped band, since tightened.)
    const INFORMATIVE_BASE: i64 = 0x55AA_55AA_55AA_55AA; // popcount = 32
    const INFORMATIVE_NEAR: i64 = 0x55AA_55AA_55AA_55AB; // 1-bit away from BASE
    const INFORMATIVE_FAR: i64 = 0x6996_6996_6996_6996; // 32-bits away from BASE
    /// Row builder with an explicit dHash, for the two-signal tests.
    fn row_with_dhash(
        library_id: i32,
        rel: &str,
        hash: &str,
        phash: Option<i64>,
        dhash: Option<i64>,
    ) -> DuplicateRow {
        DuplicateRow {
            library_id,
            rel_path: rel.into(),
            content_hash: hash.into(),
            size_bytes: Some(1000),
            date_taken: None,
            width: None,
            height: None,
            phash_64: phash,
            dhash_64: dhash,
            duplicate_of_hash: None,
            duplicate_decided_at: None,
        }
    }
    #[test]
    fn cluster_perceptual_unites_close_hashes() {
        // Two rows near each other on both pHash and dHash; one far
        // on pHash. Threshold 4 should merge the close pair.
        let rows = vec![
            row_with_dhash(
                1,
                "a.jpg",
                "h1",
                Some(INFORMATIVE_BASE),
                Some(INFORMATIVE_BASE),
            ),
            row_with_dhash(
                1,
                "b.jpg",
                "h2",
                Some(INFORMATIVE_NEAR),
                Some(INFORMATIVE_NEAR),
            ),
            row_with_dhash(
                1,
                "c.jpg",
                "h3",
                Some(INFORMATIVE_FAR),
                Some(INFORMATIVE_FAR),
            ),
        ];
        let groups = cluster_perceptual(rows, 4);
        assert_eq!(groups.len(), 1);
        assert_eq!(groups[0].members.len(), 2);
        let paths: Vec<&str> = groups[0]
            .members
            .iter()
            .map(|m| m.rel_path.as_str())
            .collect();
        assert!(paths.contains(&"a.jpg"));
        assert!(paths.contains(&"b.jpg"));
    }
    #[test]
    fn cluster_perceptual_threshold_zero_drops_distinct() {
        // BASE and NEAR differ by 1 bit, so threshold 0 forms no edge.
        let rows = vec![
            row_with_dhash(
                1,
                "a.jpg",
                "h1",
                Some(INFORMATIVE_BASE),
                Some(INFORMATIVE_BASE),
            ),
            row_with_dhash(
                1,
                "b.jpg",
                "h2",
                Some(INFORMATIVE_NEAR),
                Some(INFORMATIVE_NEAR),
            ),
        ];
        let groups = cluster_perceptual(rows, 0);
        assert!(groups.is_empty());
    }
    #[test]
    fn cluster_perceptual_skips_singletons() {
        let rows = vec![row(1, "alone.jpg", "h1", Some(INFORMATIVE_BASE))];
        assert!(cluster_perceptual(rows, 8).is_empty());
    }
    #[test]
    fn cluster_perceptual_filters_low_entropy_hashes() {
        // Both 0 (popcount 0) and i64::MAX (popcount 63) fall outside
        // the informative band. A pair of these would trivially match
        // (Hamming distance to each other small or zero) without the
        // entropy filter — that's exactly the regression that was
        // producing a giant first cluster of solid-colour images.
        let rows = vec![
            row(1, "blank-a.jpg", "h1", Some(0)),
            row(1, "blank-b.jpg", "h2", Some(0)),
            row(1, "white-a.jpg", "h3", Some(i64::MAX)),
            row(1, "white-b.jpg", "h4", Some(i64::MAX)),
        ];
        assert!(cluster_perceptual(rows, 8).is_empty());
    }
    #[test]
    fn cluster_perceptual_requires_dhash_agreement() {
        // pHash within threshold but dHash far apart — the candidate
        // edge from the BK-tree must be rejected. Without the dHash
        // double-check this would form a 2-member cluster.
        let rows = vec![
            row_with_dhash(
                1,
                "a.jpg",
                "h1",
                Some(INFORMATIVE_BASE),
                Some(INFORMATIVE_BASE),
            ),
            row_with_dhash(
                1,
                "b.jpg",
                "h2",
                Some(INFORMATIVE_NEAR),
                Some(INFORMATIVE_FAR),
            ),
        ];
        assert!(cluster_perceptual(rows, 4).is_empty());
    }
    #[test]
    fn cluster_perceptual_breaks_long_chain_at_medoid() {
        // 4-link chain at threshold=2 with pairwise distances chosen
        // so single-link unions all four but the endpoints sit past
        // the medoid's neighbourhood. Bit positions hop by exactly 2
        // bits per step, in non-overlapping nibbles, so consecutive
        // hops compose into wider distant-pair distances:
        //   A↔B = 2, B↔C = 2, C↔D = 2,
        //   A↔C = 4, B↔D = 4, A↔D = 6.
        // Medoid (B or C) keeps Δ ≤ 2 of itself; the far endpoint
        // gets chopped, leaving exactly 3 members.
        const A: i64 = 0x55AA_55AA_55AA_55AA;
        const B: i64 = 0x55AA_55AA_55AA_55A9; // ^0x03 last byte
        const C: i64 = 0x55AA_55AA_55AA_55A5; // ^0x0C from B
        const D: i64 = 0x55AA_55AA_55AA_5595; // ^0x30 from C
        let rows = vec![
            row_with_dhash(1, "a.jpg", "h1", Some(A), Some(A)),
            row_with_dhash(1, "b.jpg", "h2", Some(B), Some(B)),
            row_with_dhash(1, "c.jpg", "h3", Some(C), Some(C)),
            row_with_dhash(1, "d.jpg", "h4", Some(D), Some(D)),
        ];
        let groups = cluster_perceptual(rows, 2);
        assert_eq!(groups.len(), 1);
        assert_eq!(
            groups[0].members.len(),
            3,
            "medoid pass should chop one chain endpoint past Δ=2"
        );
    }
    /// Sanity-check the BK-tree's metric, which is what the duplicates
    /// path actually clusters on.
    #[test]
    fn hamming_metric_is_symmetric() {
        let m = HammingMetric;
        let a = HashKey {
            phash: 0b1010,
            idx: 0,
        };
        let b = HashKey {
            phash: 0b0101,
            idx: 1,
        };
        let d1 = m.distance(&a, &b);
        let d2 = m.distance(&b, &a);
        assert_eq!(d1, d2);
        assert_eq!(d1, 4);
    }
}

View File

@@ -583,9 +583,10 @@ pub async fn list_photos<TagD: TagDao, FS: FileSystemAccess>(
} else {
Some(trimmed)
};
let include_duplicates = req.include_duplicates.unwrap_or(false);
let rows = {
let mut dao = exif_dao.lock().expect("Unable to get ExifDao");
dao.list_rel_paths_for_libraries(&span_context, &lib_ids, prefix)
dao.list_rel_paths_for_libraries(&span_context, &lib_ids, prefix, include_duplicates)
.unwrap_or_else(|e| {
warn!("list_rel_paths_for_libraries failed: {:?}", e);
Vec::new()
@@ -1503,6 +1504,10 @@ mod tests {
last_modified: data.last_modified,
content_hash: data.content_hash.clone(),
size_bytes: data.size_bytes,
phash_64: data.phash_64,
dhash_64: data.dhash_64,
duplicate_of_hash: None,
duplicate_decided_at: None,
})
}
@@ -1542,6 +1547,10 @@ mod tests {
last_modified: data.last_modified,
content_hash: data.content_hash.clone(),
size_bytes: data.size_bytes,
phash_64: data.phash_64,
dhash_64: data.dhash_64,
duplicate_of_hash: None,
duplicate_decided_at: None,
})
}
@@ -1689,6 +1698,7 @@ mod tests {
_context: &opentelemetry::Context,
_library_ids: &[i32],
_path_prefix: Option<&str>,
_include_duplicates: bool,
) -> Result<Vec<(i32, String)>, DbError> {
Ok(vec![])
}
@@ -1719,6 +1729,82 @@ mod tests {
) -> Result<Vec<(i32, String)>, DbError> {
Ok(Vec::new())
}
fn get_rows_missing_perceptual_hash(
&mut self,
_context: &opentelemetry::Context,
_limit: i64,
) -> Result<Vec<(i32, String)>, DbError> {
Ok(Vec::new())
}
fn backfill_perceptual_hash(
&mut self,
_context: &opentelemetry::Context,
_library_id: i32,
_rel_path: &str,
_phash_64: Option<i64>,
_dhash_64: Option<i64>,
) -> Result<(), DbError> {
Ok(())
}
fn list_duplicates_exact(
&mut self,
_context: &opentelemetry::Context,
_library_id: Option<i32>,
_include_resolved: bool,
) -> Result<Vec<crate::database::DuplicateRow>, DbError> {
Ok(Vec::new())
}
fn list_perceptual_candidates(
&mut self,
_context: &opentelemetry::Context,
_library_id: Option<i32>,
_include_resolved: bool,
) -> Result<Vec<crate::database::DuplicateRow>, DbError> {
Ok(Vec::new())
}
fn lookup_duplicate_row(
&mut self,
_context: &opentelemetry::Context,
_library_id: i32,
_rel_path: &str,
) -> Result<Option<crate::database::DuplicateRow>, DbError> {
Ok(None)
}
fn set_duplicate_of(
&mut self,
_context: &opentelemetry::Context,
_library_id: i32,
_rel_path: &str,
_survivor_hash: &str,
_decided_at: i64,
) -> Result<(), DbError> {
Ok(())
}
fn clear_duplicate_of(
&mut self,
_context: &opentelemetry::Context,
_library_id: i32,
_rel_path: &str,
) -> Result<(), DbError> {
Ok(())
}
fn union_perceptual_tags(
&mut self,
_context: &opentelemetry::Context,
_survivor_hash: &str,
_demoted_hash: &str,
_survivor_rel_path: &str,
) -> Result<(), DbError> {
Ok(())
}
}
mod api {

View File

@@ -10,6 +10,7 @@ pub mod cleanup;
pub mod content_hash;
pub mod data;
pub mod database;
pub mod duplicates;
pub mod error;
pub mod exif;
pub mod face_watch;
@@ -23,6 +24,7 @@ pub mod library_maintenance;
pub mod memories;
pub mod otel;
pub mod parsers;
pub mod perceptual_hash;
pub mod service;
pub mod state;
pub mod tags;

View File

@@ -64,6 +64,7 @@ mod auth;
mod content_hash;
mod data;
mod database;
mod duplicates;
mod error;
mod exif;
mod face_watch;
@@ -73,6 +74,7 @@ mod files;
mod geo;
mod libraries;
mod library_maintenance;
mod perceptual_hash;
mod state;
mod tags;
mod utils;
@@ -530,6 +532,11 @@ async fn set_image_gps(
.ok()
.map(|c| c.content_hash),
size_bytes: content_hash::compute(&full_path).ok().map(|c| c.size_bytes),
// GPS-update path doesn't touch perceptual hashes either; columns
// ignored by update_exif. Compute best-effort so a new file lands
// with a usable signal; failure just leaves prior values in place.
phash_64: perceptual_hash::compute(&full_path).map(|h| h.phash_64),
dhash_64: perceptual_hash::compute(&full_path).map(|h| h.dhash_64),
};
let updated = {
@@ -652,6 +659,37 @@ async fn upload_image(
&full_path.to_str().unwrap().to_string(),
true,
) {
// Pre-write content-hash check: if these exact bytes already
// exist anywhere in any library (and aren't themselves
// soft-marked as duplicates), don't write the file. Return
// 409 with the canonical sibling so the mobile app can show
// a friendly "already in your library" toast.
let upload_hash = blake3::Hasher::new()
.update(&file_content)
.finalize()
.to_hex()
.to_string();
{
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
if let Ok(Some(existing)) = dao.find_by_content_hash(&span_context, &upload_hash)
&& existing.duplicate_of_hash.is_none()
{
let library_name = libraries::load_all(&mut crate::database::connect())
.into_iter()
.find(|l| l.id == existing.library_id)
.map(|l| l.name);
span.set_status(Status::Ok);
return HttpResponse::Conflict().json(serde_json::json!({
"duplicate_of": {
"library_id": existing.library_id,
"rel_path": existing.file_path,
},
"content_hash": upload_hash,
"library_name": library_name,
}));
}
}
let context =
opentelemetry::Context::new().with_remote_span_context(span.span_context().clone());
tracer
@@ -710,6 +748,7 @@ async fn upload_image(
(None, None)
}
};
let perceptual = perceptual_hash::compute(&uploaded_path);
let insert_exif = InsertImageExif {
library_id: target_library.id,
file_path: relative_path.clone(),
@@ -731,6 +770,8 @@ async fn upload_image(
last_modified: timestamp,
content_hash,
size_bytes,
phash_64: perceptual.map(|h| h.phash_64),
dhash_64: perceptual.map(|h| h.dhash_64),
};
if let Ok(mut dao) = exif_dao.lock() {
@@ -1661,6 +1702,7 @@ fn main() -> std::io::Result<()> {
.add_feature(add_tag_services::<_, SqliteTagDao>)
.add_feature(knowledge::add_knowledge_services::<_, SqliteKnowledgeDao>)
.add_feature(faces::add_face_services::<_, faces::SqliteFaceDao>)
.add_feature(duplicates::add_duplicate_services)
.app_data(app_data.clone())
.app_data::<Data<RealFileSystem>>(Data::new(RealFileSystem::new(
app_data.base_path.clone(),
@@ -2309,6 +2351,12 @@ fn process_new_files(
}
};
// Perceptual hashes (pHash + dHash). Best-effort — None for
// videos and decode failures. Drives near-duplicate detection
// in the Apollo duplicates surface; failure here is non-fatal
// and never blocks indexing.
let perceptual = perceptual_hash::compute(&file_path);
// EXIF is best-effort enrichment. When extraction fails (or the
// file type doesn't support EXIF) we still store a row with all
// EXIF fields NULL; the file remains visible to sort-by-date
@@ -2360,6 +2408,8 @@ fn process_new_files(
last_modified: timestamp,
content_hash,
size_bytes,
phash_64: perceptual.map(|h| h.phash_64),
dhash_64: perceptual.map(|h| h.dhash_64),
};
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");

159
src/perceptual_hash.rs Normal file
View File

@@ -0,0 +1,159 @@
//! Perceptual image hashing for near-duplicate detection.
//!
//! Two 64-bit signals per image, packed into i64 for storage and fast
//! Hamming distance via XOR + popcount:
//!
//! - **pHash (DCT)** — robust to lossy recompression, format conversion,
//! moderate brightness/contrast shifts. The primary signal.
//! - **dHash (gradient)** — much cheaper to compute, robust to scaling
//! and small crops. Acts as a fallback / corroboration when pHash is
//! ambiguous (very flat images can collide).
//!
//! Image-only by design. Videos, decode failures, and any image we
//! can't open all return `None` — perceptual hash failure is non-fatal
//! and must not block the indexer; the file is still hashed by blake3
//! and exact-match dedup keeps working.
use std::path::Path;
use image_hasher::{HashAlg, HasherConfig};
/// 64-bit perceptual fingerprint pair.
///
/// Both hashes are packed into `i64` (see `bytes_to_i64`) so storage is a
/// plain integer column and similarity is a single XOR + popcount via
/// [`hamming_distance`].
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct PerceptualIdentity {
    // DCT-based pHash — the primary near-duplicate signal (see module docs).
    pub phash_64: i64,
    // Gradient-based dHash — cheaper fallback / corroboration signal.
    pub dhash_64: i64,
}
/// Compute pHash + dHash for an image at `path`. Returns `None` on
/// decode failure (unsupported format, corrupt bytes, video, etc.) —
/// callers should treat that as "no perceptual signal available" and
/// proceed with exact-match dedup only.
pub fn compute(path: &Path) -> Option<PerceptualIdentity> {
    let decoded = image::open(path).ok()?;

    // 8x8 = 64 bits, the standard size for pHash/dHash. Larger sizes
    // give more discriminative power but no longer fit in i64 and the
    // marginal robustness isn't worth the storage / index cost for a
    // personal-scale library.
    //
    // Mean hash + DCT preprocessing is how image_hasher spells the
    // classic DCT pHash.
    let dct_hasher = HasherConfig::new()
        .hash_alg(HashAlg::Mean)
        .hash_size(8, 8)
        .preproc_dct()
        .to_hasher();
    let gradient_hasher = HasherConfig::new()
        .hash_alg(HashAlg::Gradient)
        .hash_size(8, 8)
        .to_hasher();

    let phash_64 = bytes_to_i64(dct_hasher.hash_image(&decoded).as_bytes())?;
    let dhash_64 = bytes_to_i64(gradient_hasher.hash_image(&decoded).as_bytes())?;
    Some(PerceptualIdentity { phash_64, dhash_64 })
}
/// Hamming distance between two 64-bit perceptual hashes. The primary
/// query primitive: two images are "near-duplicates" when this is below
/// a threshold (default 8 for pHash, ~12% similarity tolerance). The
/// duplicates module clusters via a BK-tree which uses its own copy of
/// this calculation; this helper is kept for ad-hoc tools and tests.
#[allow(dead_code)]
#[inline]
pub fn hamming_distance(a: i64, b: i64) -> u32 {
    // XOR leaves a 1 bit exactly where the hashes disagree; popcount
    // then counts the disagreements.
    let differing_bits = a ^ b;
    differing_bits.count_ones()
}
/// Pack the first 8 bytes of a hash into a big-endian `i64`.
///
/// Returns `None` when fewer than 8 bytes are available — not expected
/// for the 8x8 hash size used by `compute`, but kept defensive so a
/// config change can't panic here. Extra bytes beyond the first 8 are
/// ignored.
fn bytes_to_i64(bytes: &[u8]) -> Option<i64> {
    // `get(..8)?` does the length check; the slice-to-array `try_into`
    // is then infallible, so this is the idiomatic panic-free form of
    // the manual buffer + copy_from_slice dance.
    let head: [u8; 8] = bytes.get(..8)?.try_into().ok()?;
    Some(i64::from_be_bytes(head))
}
#[cfg(test)]
mod tests {
    use super::*;
    use image::{ImageBuffer, Rgb};

    /// Write a 64x64 PNG whose pixels follow a simple gradient offset by
    /// `seed`: deterministic per seed, distinct across seeds. A gradient
    /// (rather than a flat fill) gives pHash/dHash real structure to
    /// latch onto — a uniform image collapses to an all-zero hash.
    fn write_test_image(path: &Path, seed: u32) {
        let pixels: ImageBuffer<Rgb<u8>, Vec<u8>> = ImageBuffer::from_fn(64, 64, |x, y| {
            Rgb([
                ((x + seed) & 0xFF) as u8,
                ((y + seed * 2) & 0xFF) as u8,
                ((x ^ y ^ seed) & 0xFF) as u8,
            ])
        });
        pixels.save(path).unwrap();
    }

    #[test]
    fn identical_bytes_yield_identical_hashes() {
        let dir = tempfile::tempdir().unwrap();
        let first = dir.path().join("a.png");
        let second = dir.path().join("b.png");
        write_test_image(&first, 42);
        write_test_image(&second, 42);

        let hash_a = compute(&first).expect("hash a");
        let hash_b = compute(&second).expect("hash b");
        assert_eq!(hash_a, hash_b);
        assert_eq!(hamming_distance(hash_a.phash_64, hash_b.phash_64), 0);
    }

    #[test]
    fn distinct_images_have_distinct_hashes() {
        let dir = tempfile::tempdir().unwrap();
        let first = dir.path().join("a.png");
        let second = dir.path().join("b.png");
        write_test_image(&first, 42);
        write_test_image(&second, 123);

        let hash_a = compute(&first).expect("hash a");
        let hash_b = compute(&second).expect("hash b");
        assert_ne!(hash_a.phash_64, hash_b.phash_64);
    }

    #[test]
    fn resized_copy_is_near_duplicate_under_threshold() {
        // The whole point of perceptual hashing: a resized copy of the
        // same source image should land within a small Hamming distance
        // of the original. dHash is checked specifically because it's
        // the more resize-robust of the two signals; pHash is also
        // tight, but gradient-based dHash is the most reliable here.
        let dir = tempfile::tempdir().unwrap();
        let original = dir.path().join("a.png");
        write_test_image(&original, 7);

        let full = image::open(&original).unwrap();
        let downscaled = full.resize_exact(32, 32, image::imageops::FilterType::Lanczos3);
        let resized = dir.path().join("b.png");
        downscaled.save(&resized).unwrap();

        let hash_a = compute(&original).expect("hash a");
        let hash_b = compute(&resized).expect("hash b");
        let d_dhash = hamming_distance(hash_a.dhash_64, hash_b.dhash_64);
        assert!(
            d_dhash <= 8,
            "expected dhash Hamming distance <= 8 for resized copy, got {}",
            d_dhash
        );
    }

    #[test]
    fn unsupported_path_returns_none() {
        // Non-image bytes must fail soft: None, never a panic.
        let dir = tempfile::tempdir().unwrap();
        let not_image = dir.path().join("notanimage.txt");
        std::fs::write(&not_image, b"hello").unwrap();
        assert!(compute(&not_image).is_none());
    }

    #[test]
    fn missing_file_returns_none() {
        let absent = Path::new("/nonexistent/path/that/does/not/exist.png");
        assert!(compute(absent).is_none());
    }
}