duplicates: perceptual hash + soft-mark resolution + upload 409

Adds pHash + dHash columns alongside the existing blake3 content_hash so
near-duplicates (re-encoded, resized, format-converted copies) become
queryable. /duplicates/{exact,perceptual} return groups; /duplicates/
{resolve,unresolve} flip a duplicate_of_hash soft-mark on losing rows
and union perceptual-only tag sets onto the survivor. The default
/photos listing filters duplicate_of_hash IS NULL so demoted siblings
stop cluttering the grid; include_duplicates=true opts back in for
Apollo's review modal. Upload now hashes bytes pre-write and returns
409 with the canonical sibling when a file's bytes already exist.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Cameron Cordes
2026-05-03 17:36:01 -04:00
parent 4340b164eb
commit 7584cd8792
14 changed files with 1852 additions and 1 deletions

View File

@@ -9,6 +9,25 @@ use crate::database::models::{
};
use crate::otel::trace_db_call;
/// Wire shape for a single member of a duplicate group, returned by
/// `list_duplicates_*`, `list_perceptual_candidates` and
/// `lookup_duplicate_row`. Carries everything the Apollo modal needs to
/// render a member tile and its meta line — thumbnails are derived from
/// `(library_id, rel_path)` upstream.
///
/// Every query that produces this struct filters `content_hash IS NOT NULL`,
/// which is why `content_hash` is a plain `String` here while the
/// underlying column is nullable.
#[derive(Debug, Clone, serde::Serialize)]
pub struct DuplicateRow {
/// Library the file lives in; `(library_id, rel_path)` is the lookup key
/// used by the resolve/unresolve DAO methods.
pub library_id: i32,
/// Path of the file as stored in `image_exif.rel_path`.
pub rel_path: String,
/// Content hash of the file's bytes (blake3 per the commit description).
/// Rows sharing this value are exact duplicates.
pub content_hash: String,
/// File size; presumably in bytes, going by the name — NULL when not recorded.
pub size_bytes: Option<i64>,
/// Capture timestamp. NOTE(review): unit (seconds vs. millis) is not
/// visible from this file — confirm against the EXIF ingest path.
pub date_taken: Option<i64>,
/// Image width; presumably pixels.
pub width: Option<i32>,
/// Image height; presumably pixels.
pub height: Option<i32>,
/// 64-bit perceptual hash packed into an `i64`; `None` until computed.
pub phash_64: Option<i64>,
/// 64-bit difference hash packed into an `i64`; `None` until computed.
pub dhash_64: Option<i64>,
/// When set, this row has been soft-marked as a duplicate of the file
/// whose `content_hash` equals this value.
pub duplicate_of_hash: Option<String>,
/// Timestamp written together with `duplicate_of_hash` when the
/// soft-mark was recorded; cleared together with it.
pub duplicate_decided_at: Option<i64>,
}
pub mod calendar_dao;
pub mod daily_summary_dao;
pub mod insights_dao;
@@ -377,6 +396,104 @@ pub trait ExifDao: Sync + Send {
size_bytes: i64,
) -> Result<(), DbError>;
/// Return image rows that have a `content_hash` but no `phash_64`,
/// oldest first. Used by the `backfill_perceptual_hash` binary.
/// Filters by image extension at the DB layer to avoid ever asking
/// `image_hasher` to decode a video. Returns `(library_id, rel_path)`.
///
/// * `context` — OpenTelemetry context the DB call is traced under.
/// * `limit` — maximum number of rows to return in this batch.
///
/// Note that a row only leaves this result set once `phash_64` is
/// written, so callers batching with `limit` should expect rows whose
/// hash could not be stored to be returned again on the next call.
fn get_rows_missing_perceptual_hash(
&mut self,
context: &opentelemetry::Context,
limit: i64,
) -> Result<Vec<(i32, String)>, DbError>;
/// Persist computed perceptual hashes (pHash + dHash) for an
/// existing image_exif row. Either column may be left NULL by
/// passing `None`, but in practice the binary computes both or
/// neither — `image_hasher` either decodes the image and produces
/// both signals, or fails entirely.
///
/// The row is addressed by `(library_id, rel_path)`. Both hash columns
/// are always written, so passing `None` overwrites a previously stored
/// value with NULL.
fn backfill_perceptual_hash(
&mut self,
context: &opentelemetry::Context,
library_id: i32,
rel_path: &str,
phash_64: Option<i64>,
dhash_64: Option<i64>,
) -> Result<(), DbError>;
/// Group exact-hash duplicates: rows whose `content_hash` appears
/// more than once across the (optionally library-scoped) corpus.
/// Returns one [`DuplicateRow`] per member; callers group by
/// `content_hash`. When `include_resolved=false`, rows already
/// soft-marked (`duplicate_of_hash IS NOT NULL`) are excluded so
/// the modal doesn't re-surface decisions the user already made.
///
/// * `library_id` — `Some(id)` restricts both the duplicate detection
///   and the returned members to that library; `None` spans every library.
/// * `include_resolved` — `true` also returns soft-marked members.
///
/// Rows with a NULL `content_hash` are never returned.
fn list_duplicates_exact(
&mut self,
context: &opentelemetry::Context,
library_id: Option<i32>,
include_resolved: bool,
) -> Result<Vec<DuplicateRow>, DbError>;
/// Return all rows with a non-null `phash_64` (optionally library-
/// scoped), used by the perceptual-cluster routine in
/// [`crate::main`] to single-link cluster via Hamming distance.
/// Each returned row is a *distinct content_hash* — exact duplicates
/// are collapsed at the DB layer so the in-memory clusterer doesn't
/// rediscover them.
///
/// * `library_id` — `Some(id)` restricts candidates to that library.
/// * `include_resolved` — when `false`, rows with a non-null
///   `duplicate_of_hash` are dropped before the per-hash collapse.
fn list_perceptual_candidates(
&mut self,
context: &opentelemetry::Context,
library_id: Option<i32>,
include_resolved: bool,
) -> Result<Vec<DuplicateRow>, DbError>;
/// Look up a single row's metadata by `(library_id, rel_path)`. Used
/// by the resolve endpoint to map the request payload to the
/// underlying `content_hash` before writing the soft-mark. Returns
/// `Ok(None)` if the file doesn't exist in `image_exif`.
///
/// The SQLite implementation additionally requires `content_hash` to be
/// non-null, so a row that exists but has not been hashed yet is also
/// reported as `Ok(None)`.
fn lookup_duplicate_row(
&mut self,
context: &opentelemetry::Context,
library_id: i32,
rel_path: &str,
) -> Result<Option<DuplicateRow>, DbError>;
/// Soft-mark a file as a duplicate of `survivor_hash`. Sets
/// `duplicate_of_hash` and `duplicate_decided_at` on the row(s)
/// matching `(library_id, rel_path)`. The file stays on disk; the
/// default `/photos` listing hides it because of the
/// `duplicate_of_hash IS NULL` filter.
///
/// * `survivor_hash` — `content_hash` of the file being kept.
/// * `decided_at` — timestamp stored in `duplicate_decided_at`
///   (Unix seconds per the `ImageExif` field documentation).
///
/// The SQLite implementation does not report how many rows were
/// updated, so a non-matching `(library_id, rel_path)` still yields `Ok(())`.
fn set_duplicate_of(
&mut self,
context: &opentelemetry::Context,
library_id: i32,
rel_path: &str,
survivor_hash: &str,
decided_at: i64,
) -> Result<(), DbError>;
/// Reverse a soft-mark: clears `duplicate_of_hash` and
/// `duplicate_decided_at`. Used by the modal's UNRESOLVE chip.
///
/// The row is addressed by `(library_id, rel_path)`; both columns are
/// set back to NULL. Clearing a row that was never soft-marked is a no-op.
fn clear_duplicate_of(
&mut self,
context: &opentelemetry::Context,
library_id: i32,
rel_path: &str,
) -> Result<(), DbError>;
/// Union the tags from `demoted_hash` onto `survivor_hash`. Used at
/// resolve time for *perceptual* duplicates (different content_hashes,
/// independent tag sets) so the user doesn't lose their tagging work
/// when promoting a survivor. Idempotent: a tag already on the survivor
/// is left alone. Exact duplicates (same content_hash) don't need this
/// because their tag rows are already shared.
///
/// * `survivor_hash` — `content_hash` of the file being kept; copied
///   tag rows are stored under this hash.
/// * `demoted_hash` — `content_hash` whose tags are copied. Its own
///   tag rows are left in place.
/// * `survivor_rel_path` — `rel_path` written on the copied tag rows.
fn union_perceptual_tags(
&mut self,
context: &opentelemetry::Context,
survivor_hash: &str,
demoted_hash: &str,
survivor_rel_path: &str,
) -> Result<(), DbError>;
/// Return the first EXIF row with the given content hash (any library).
/// Used by thumbnail/HLS generation to detect pre-existing derivatives
/// from another library before regenerating.
@@ -440,11 +557,17 @@ pub trait ExifDao: Sync + Send {
/// `library_ids` is empty, rows from every library are returned. Used by
/// `/photos` recursive listing to skip the filesystem walk — the watcher
/// keeps image_exif in parity with disk via the reconciliation pass.
///
/// `include_duplicates=false` filters out rows soft-marked with
/// `duplicate_of_hash IS NOT NULL` so the default photo listing hides
/// demoted siblings; the Apollo duplicates modal passes `true` to
/// see both survivors and demoted members inside a group.
fn list_rel_paths_for_libraries(
&mut self,
context: &opentelemetry::Context,
library_ids: &[i32],
path_prefix: Option<&str>,
include_duplicates: bool,
) -> Result<Vec<(i32, String)>, DbError>;
/// Delete a single image_exif row scoped to `(library_id, rel_path)`.
@@ -1077,6 +1200,7 @@ impl ExifDao for SqliteExifDao {
context: &opentelemetry::Context,
library_ids: &[i32],
path_prefix: Option<&str>,
include_duplicates: bool,
) -> Result<Vec<(i32, String)>, DbError> {
trace_db_call(context, "query", "list_rel_paths_for_libraries", |_span| {
use schema::image_exif::dsl::*;
@@ -1097,6 +1221,10 @@ impl ExifDao for SqliteExifDao {
query = query.filter(rel_path.like(pattern).escape('\\'));
}
if !include_duplicates {
query = query.filter(duplicate_of_hash.is_null());
}
query
.load::<(i32, String)>(connection.deref_mut())
.map_err(|_| anyhow::anyhow!("Query error"))
@@ -1168,6 +1296,421 @@ impl ExifDao for SqliteExifDao {
)
.map_err(|_| DbError::new(DbErrorKind::QueryError))
}
fn get_rows_missing_perceptual_hash(
&mut self,
context: &opentelemetry::Context,
limit: i64,
) -> Result<Vec<(i32, String)>, DbError> {
trace_db_call(
context,
"query",
"get_rows_missing_perceptual_hash",
|_span| {
use schema::image_exif::dsl::*;
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
// Image-only filter via extension. Videos and decode-failures
// would always come back NULL otherwise and the binary would
// grind through them on every run. The list mirrors the file
// formats `image` 0.25 / `image_hasher` 3.x can decode.
image_exif
.filter(content_hash.is_not_null())
.filter(phash_64.is_null())
.filter(
rel_path
.like("%.jpg")
.or(rel_path.like("%.jpeg"))
.or(rel_path.like("%.JPG"))
.or(rel_path.like("%.JPEG"))
.or(rel_path.like("%.png"))
.or(rel_path.like("%.PNG"))
.or(rel_path.like("%.webp"))
.or(rel_path.like("%.WEBP"))
.or(rel_path.like("%.tif"))
.or(rel_path.like("%.tiff"))
.or(rel_path.like("%.TIF"))
.or(rel_path.like("%.TIFF"))
.or(rel_path.like("%.avif"))
.or(rel_path.like("%.AVIF")),
)
.select((library_id, rel_path))
.order(id.asc())
.limit(limit)
.load::<(i32, String)>(connection.deref_mut())
.map_err(|_| anyhow::anyhow!("Query error"))
},
)
.map_err(|_| DbError::new(DbErrorKind::QueryError))
}
fn backfill_perceptual_hash(
&mut self,
context: &opentelemetry::Context,
library_id_val: i32,
rel_path_val: &str,
phash_val: Option<i64>,
dhash_val: Option<i64>,
) -> Result<(), DbError> {
trace_db_call(context, "update", "backfill_perceptual_hash", |_span| {
use schema::image_exif::dsl::*;
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
diesel::update(
image_exif
.filter(library_id.eq(library_id_val))
.filter(rel_path.eq(rel_path_val)),
)
.set((phash_64.eq(phash_val), dhash_64.eq(dhash_val)))
.execute(connection.deref_mut())
.map(|_| ())
.map_err(|_| anyhow::anyhow!("Update error"))
})
.map_err(|_| DbError::new(DbErrorKind::UpdateError))
}
fn list_duplicates_exact(
&mut self,
context: &opentelemetry::Context,
library_id_filter: Option<i32>,
include_resolved: bool,
) -> Result<Vec<DuplicateRow>, DbError> {
trace_db_call(context, "query", "list_duplicates_exact", |_span| {
// Sub-select the content_hashes that appear more than once
// (optionally library-scoped), then load the full member rows
// for those hashes ordered by hash + library + path so the
// caller can stream-group without buffering the full dataset.
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
// Step 1: hashes with count > 1.
let dup_hashes: Vec<String> = {
use schema::image_exif::dsl::*;
let mut q = image_exif
.filter(content_hash.is_not_null())
.group_by(content_hash)
.select(content_hash.assume_not_null())
.having(diesel::dsl::count_star().gt(1))
.into_boxed();
if let Some(lib) = library_id_filter {
q = q.filter(library_id.eq(lib));
}
q.load::<String>(connection.deref_mut())
.map_err(|_| anyhow::anyhow!("Query error"))?
};
if dup_hashes.is_empty() {
return Ok(Vec::new());
}
// Step 2: every member row for those hashes.
use schema::image_exif::dsl::*;
let mut q = image_exif
.filter(content_hash.eq_any(&dup_hashes))
.select((
library_id,
rel_path,
content_hash.assume_not_null(),
size_bytes,
date_taken,
width,
height,
phash_64,
dhash_64,
duplicate_of_hash,
duplicate_decided_at,
))
.order((content_hash.asc(), library_id.asc(), rel_path.asc()))
.into_boxed();
if let Some(lib) = library_id_filter {
q = q.filter(library_id.eq(lib));
}
if !include_resolved {
q = q.filter(duplicate_of_hash.is_null());
}
let rows: Vec<(
i32,
String,
String,
Option<i64>,
Option<i64>,
Option<i32>,
Option<i32>,
Option<i64>,
Option<i64>,
Option<String>,
Option<i64>,
)> = q
.load(connection.deref_mut())
.map_err(|_| anyhow::anyhow!("Query error"))?;
Ok(rows
.into_iter()
.map(|r| DuplicateRow {
library_id: r.0,
rel_path: r.1,
content_hash: r.2,
size_bytes: r.3,
date_taken: r.4,
width: r.5,
height: r.6,
phash_64: r.7,
dhash_64: r.8,
duplicate_of_hash: r.9,
duplicate_decided_at: r.10,
})
.collect())
})
.map_err(|_| DbError::new(DbErrorKind::QueryError))
}
/// Load the rows that feed perceptual clustering: one representative per
/// distinct `content_hash`, restricted to rows that have both a
/// `content_hash` and a `phash_64`.
///
/// * `library_id_filter` — `Some(id)` restricts candidates to that library.
/// * `include_resolved` — when `false`, soft-marked rows are excluded.
///
/// Any failure is reported as `DbErrorKind::QueryError`.
fn list_perceptual_candidates(
    &mut self,
    context: &opentelemetry::Context,
    library_id_filter: Option<i32>,
    include_resolved: bool,
) -> Result<Vec<DuplicateRow>, DbError> {
    // Column tuple in the order of the `select` below.
    type CandidateTuple = (
        i32,
        String,
        String,
        Option<i64>,
        Option<i64>,
        Option<i32>,
        Option<i32>,
        Option<i64>,
        Option<i64>,
        Option<String>,
        Option<i64>,
    );

    let traced = trace_db_call(context, "query", "list_perceptual_candidates", |_span| {
        use schema::image_exif::dsl::*;

        let mut db_guard = self.connection.lock().expect("Unable to get ExifDao");

        // Exact duplicates are handled by the exact-dup query; feeding them
        // into the perceptual graph would only add zero-distance edges. The
        // collapse to one row per content_hash happens client-side after
        // the load rather than in SQL.
        let mut candidate_query = image_exif
            .filter(content_hash.is_not_null())
            .filter(phash_64.is_not_null())
            .select((
                library_id,
                rel_path,
                content_hash.assume_not_null(),
                size_bytes,
                date_taken,
                width,
                height,
                phash_64,
                dhash_64,
                duplicate_of_hash,
                duplicate_decided_at,
            ))
            .order((content_hash.asc(), library_id.asc(), rel_path.asc()))
            .into_boxed();
        if let Some(lib) = library_id_filter {
            candidate_query = candidate_query.filter(library_id.eq(lib));
        }
        if !include_resolved {
            candidate_query = candidate_query.filter(duplicate_of_hash.is_null());
        }

        let fetched: Vec<CandidateTuple> = candidate_query
            .load(db_guard.deref_mut())
            .map_err(|_| anyhow::anyhow!("Query error"))?;

        // First occurrence per content_hash wins. Thanks to the ORDER BY
        // that is the lowest library_id, then the smallest rel_path.
        let mut seen_hashes = std::collections::HashSet::new();
        let representatives: Vec<DuplicateRow> = fetched
            .into_iter()
            .filter(|cols| seen_hashes.insert(cols.2.clone()))
            .map(|cols| DuplicateRow {
                library_id: cols.0,
                rel_path: cols.1,
                content_hash: cols.2,
                size_bytes: cols.3,
                date_taken: cols.4,
                width: cols.5,
                height: cols.6,
                phash_64: cols.7,
                dhash_64: cols.8,
                duplicate_of_hash: cols.9,
                duplicate_decided_at: cols.10,
            })
            .collect();

        Ok(representatives)
    });

    traced.map_err(|_| DbError::new(DbErrorKind::QueryError))
}
fn lookup_duplicate_row(
&mut self,
context: &opentelemetry::Context,
library_id_val: i32,
rel_path_val: &str,
) -> Result<Option<DuplicateRow>, DbError> {
trace_db_call(context, "query", "lookup_duplicate_row", |_span| {
use schema::image_exif::dsl::*;
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
image_exif
.filter(library_id.eq(library_id_val))
.filter(rel_path.eq(rel_path_val))
.filter(content_hash.is_not_null())
.select((
library_id,
rel_path,
content_hash.assume_not_null(),
size_bytes,
date_taken,
width,
height,
phash_64,
dhash_64,
duplicate_of_hash,
duplicate_decided_at,
))
.first::<(
i32,
String,
String,
Option<i64>,
Option<i64>,
Option<i32>,
Option<i32>,
Option<i64>,
Option<i64>,
Option<String>,
Option<i64>,
)>(connection.deref_mut())
.optional()
.map(|opt| {
opt.map(|r| DuplicateRow {
library_id: r.0,
rel_path: r.1,
content_hash: r.2,
size_bytes: r.3,
date_taken: r.4,
width: r.5,
height: r.6,
phash_64: r.7,
dhash_64: r.8,
duplicate_of_hash: r.9,
duplicate_decided_at: r.10,
})
})
.map_err(|_| anyhow::anyhow!("Query error"))
})
.map_err(|_| DbError::new(DbErrorKind::QueryError))
}
fn set_duplicate_of(
&mut self,
context: &opentelemetry::Context,
library_id_val: i32,
rel_path_val: &str,
survivor_hash: &str,
decided_at: i64,
) -> Result<(), DbError> {
trace_db_call(context, "update", "set_duplicate_of", |_span| {
use schema::image_exif::dsl::*;
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
diesel::update(
image_exif
.filter(library_id.eq(library_id_val))
.filter(rel_path.eq(rel_path_val)),
)
.set((
duplicate_of_hash.eq(survivor_hash),
duplicate_decided_at.eq(decided_at),
))
.execute(connection.deref_mut())
.map(|_| ())
.map_err(|_| anyhow::anyhow!("Update error"))
})
.map_err(|_| DbError::new(DbErrorKind::UpdateError))
}
fn clear_duplicate_of(
&mut self,
context: &opentelemetry::Context,
library_id_val: i32,
rel_path_val: &str,
) -> Result<(), DbError> {
trace_db_call(context, "update", "clear_duplicate_of", |_span| {
use schema::image_exif::dsl::*;
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
diesel::update(
image_exif
.filter(library_id.eq(library_id_val))
.filter(rel_path.eq(rel_path_val)),
)
.set((
duplicate_of_hash.eq::<Option<String>>(None),
duplicate_decided_at.eq::<Option<i64>>(None),
))
.execute(connection.deref_mut())
.map(|_| ())
.map_err(|_| anyhow::anyhow!("Update error"))
})
.map_err(|_| DbError::new(DbErrorKind::UpdateError))
}
/// Copy onto the survivor every tag that exists under `demoted_hash` but
/// not yet under `survivor_hash`.
///
/// New `tagged_photo` rows are written with `rel_path = survivor_rel_path`,
/// `content_hash = survivor_hash` and `created_time` set to the current
/// Unix time. The demoted file's own tag rows are left untouched.
///
/// Re-running the call inserts nothing new: tags already present under
/// `survivor_hash` are excluded by the `NOT IN` sub-select, and
/// `SELECT DISTINCT` collapses a tag that appears on several rows of the
/// demoted hash (e.g. the same bytes tagged under more than one path) so
/// it is copied once, without depending on a table constraint to reject
/// the repeats.
///
/// Any failure is reported as `DbErrorKind::UpdateError`.
fn union_perceptual_tags(
    &mut self,
    context: &opentelemetry::Context,
    survivor_hash: &str,
    demoted_hash: &str,
    survivor_rel_path: &str,
) -> Result<(), DbError> {
    trace_db_call(context, "update", "union_perceptual_tags", |_span| {
        // INSERT OR IGNORE covers the remaining collision path: a row that
        // already exists for (survivor_rel_path, tag_id) under a different
        // or missing content_hash is not caught by the NOT IN filter, and
        // is skipped instead of aborting the statement.
        // NOTE(review): that relies on tagged_photo having a uniqueness
        // constraint on (rel_path, tag_id) — confirm against the schema.
        let mut connection = self.connection.lock().expect("Unable to get ExifDao");

        // Bind order: survivor_rel_path, survivor_hash (inserted values),
        // demoted_hash (source rows), survivor_hash (exclusion sub-select).
        diesel::sql_query(
            "INSERT OR IGNORE INTO tagged_photo (rel_path, tag_id, created_time, content_hash) \
             SELECT DISTINCT ?, tag_id, strftime('%s','now'), ? \
             FROM tagged_photo \
             WHERE content_hash = ? \
             AND tag_id NOT IN ( \
             SELECT tag_id FROM tagged_photo WHERE content_hash = ? \
             )",
        )
        .bind::<diesel::sql_types::Text, _>(survivor_rel_path)
        .bind::<diesel::sql_types::Text, _>(survivor_hash)
        .bind::<diesel::sql_types::Text, _>(demoted_hash)
        .bind::<diesel::sql_types::Text, _>(survivor_hash)
        .execute(connection.deref_mut())
        .map(|_| ())
        .map_err(|_| anyhow::anyhow!("Tag union error"))
    })
    .map_err(|_| DbError::new(DbErrorKind::UpdateError))
}
}
#[cfg(test)]
@@ -1204,6 +1747,8 @@ mod exif_dao_tests {
last_modified: 0,
content_hash: None,
size_bytes: None,
phash_64: None,
dhash_64: None,
},
)
.expect("insert exif row");

View File

@@ -59,6 +59,10 @@ pub struct InsertImageExif {
pub last_modified: i64,
pub content_hash: Option<String>,
pub size_bytes: Option<i64>,
/// 64-bit pHash (DCT) packed as i64. NULL for videos and decode failures.
pub phash_64: Option<i64>,
/// 64-bit dHash (gradient). NULL for videos and decode failures.
pub dhash_64: Option<i64>,
}
// Field order matches the post-migration column order in `image_exif`.
@@ -86,6 +90,14 @@ pub struct ImageExif {
pub last_modified: i64,
pub content_hash: Option<String>,
pub size_bytes: Option<i64>,
pub phash_64: Option<i64>,
pub dhash_64: Option<i64>,
/// When non-null, this row is a soft-marked duplicate of the file
/// whose `content_hash` matches this value. The default `/photos`
/// listing filters such rows out.
pub duplicate_of_hash: Option<String>,
/// Unix seconds at which the resolve was committed.
pub duplicate_decided_at: Option<i64>,
}
#[derive(Insertable)]

View File

@@ -121,6 +121,10 @@ diesel::table! {
last_modified -> BigInt,
content_hash -> Nullable<Text>,
size_bytes -> Nullable<BigInt>,
phash_64 -> Nullable<BigInt>,
dhash_64 -> Nullable<BigInt>,
duplicate_of_hash -> Nullable<Text>,
duplicate_decided_at -> Nullable<BigInt>,
}
}