duplicates: perceptual hash + soft-mark resolution + upload 409
Adds pHash + dHash columns alongside the existing blake3 content_hash so
near-duplicates (re-encoded, resized, format-converted copies) become
queryable. /duplicates/{exact,perceptual} return groups; /duplicates/
{resolve,unresolve} flip a duplicate_of_hash soft-mark on losing rows
and union perceptual-only tag sets onto the survivor. The default
/photos listing filters duplicate_of_hash IS NULL so demoted siblings
stop cluttering the grid; include_duplicates=true opts back in for
Apollo's review modal. Upload now hashes bytes pre-write and returns
409 with the canonical sibling when a file's bytes already exist.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
159
src/perceptual_hash.rs
Normal file
159
src/perceptual_hash.rs
Normal file
@@ -0,0 +1,159 @@
|
||||
//! Perceptual image hashing for near-duplicate detection.
//!
//! Two 64-bit signals per image, packed into i64 for storage and fast
//! Hamming distance via XOR + popcount:
//!
//! - **pHash (DCT)** — robust to lossy recompression, format conversion,
//!   moderate brightness/contrast shifts. The primary signal.
//! - **dHash (gradient)** — much cheaper to compute, robust to scaling
//!   and small crops. Acts as a fallback / corroboration when pHash is
//!   ambiguous (very flat images can collide).
//!
//! Image-only by design. Videos, decode failures, and any image we
//! can't open all return `None` — perceptual hash failure is non-fatal
//! and must not block the indexer; the file is still hashed by blake3
//! and exact-match dedup keeps working.
|
||||
|
||||
use std::path::Path;
|
||||
|
||||
use image_hasher::{HashAlg, HasherConfig};
|
||||
|
||||
/// The two 64-bit perceptual fingerprints recorded for a single image.
///
/// Each fingerprint is kept as an `i64`, so the pair is cheap to copy and
/// two fingerprints of the same kind can be compared bit-by-bit with
/// XOR + popcount (see `hamming_distance`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PerceptualIdentity {
    /// DCT-based pHash; the first eight hash bytes read as a big-endian `i64`.
    pub phash_64: i64,
    /// Gradient-based dHash; the first eight hash bytes read as a big-endian `i64`.
    pub dhash_64: i64,
}
|
||||
|
||||
/// Compute pHash + dHash for an image at `path`. Returns `None` on
|
||||
/// decode failure (unsupported format, corrupt bytes, video, etc.) —
|
||||
/// callers should treat that as "no perceptual signal available" and
|
||||
/// proceed with exact-match dedup only.
|
||||
pub fn compute(path: &Path) -> Option<PerceptualIdentity> {
|
||||
let img = image::open(path).ok()?;
|
||||
|
||||
// 8x8 = 64 bits, the standard size for pHash/dHash. Larger sizes
|
||||
// give more discriminative power but no longer fit in i64 and the
|
||||
// marginal robustness isn't worth the storage / index cost for a
|
||||
// personal-scale library.
|
||||
let phash = HasherConfig::new()
|
||||
.hash_alg(HashAlg::Mean)
|
||||
.hash_size(8, 8)
|
||||
.preproc_dct()
|
||||
.to_hasher()
|
||||
.hash_image(&img);
|
||||
|
||||
let dhash = HasherConfig::new()
|
||||
.hash_alg(HashAlg::Gradient)
|
||||
.hash_size(8, 8)
|
||||
.to_hasher()
|
||||
.hash_image(&img);
|
||||
|
||||
Some(PerceptualIdentity {
|
||||
phash_64: bytes_to_i64(phash.as_bytes())?,
|
||||
dhash_64: bytes_to_i64(dhash.as_bytes())?,
|
||||
})
|
||||
}
|
||||
|
||||
/// Number of bit positions in which two 64-bit perceptual hashes differ.
///
/// This is the basic query primitive: two images count as
/// "near-duplicates" when the distance is below a threshold (default 8
/// for pHash, ~12% similarity tolerance).
///
/// The duplicates module clusters via a BK-tree that carries its own copy
/// of this calculation, so this helper may be unused in the main build —
/// hence `allow(dead_code)`. It is kept for ad-hoc tools and tests.
#[allow(dead_code)]
#[inline]
pub fn hamming_distance(a: i64, b: i64) -> u32 {
    // XOR leaves a 1 exactly where the two inputs disagree.
    let differing_bits = a ^ b;
    differing_bits.count_ones()
}
|
||||
|
||||
/// Read the first eight bytes of `bytes` as a big-endian `i64`.
///
/// Returns `None` when fewer than eight bytes are supplied. Any bytes
/// beyond the eighth are ignored.
fn bytes_to_i64(bytes: &[u8]) -> Option<i64> {
    match *bytes {
        [b0, b1, b2, b3, b4, b5, b6, b7, ..] => {
            Some(i64::from_be_bytes([b0, b1, b2, b3, b4, b5, b6, b7]))
        }
        _ => None,
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use image::{ImageBuffer, Rgb};

    /// Save a 64x64 RGB test pattern to `path`.
    ///
    /// The pattern is a simple gradient shifted by `seed`, so it is
    /// deterministic for a given seed yet differs between seeds. That
    /// gives pHash/dHash a real signal to work with (a uniform image
    /// collapses to all-zero hashes).
    fn write_test_image(path: &Path, seed: u32) {
        let pixel_at = |x: u32, y: u32| {
            let red = ((x + seed) & 0xFF) as u8;
            let green = ((y + seed * 2) & 0xFF) as u8;
            let blue = ((x ^ y ^ seed) & 0xFF) as u8;
            Rgb([red, green, blue])
        };
        let canvas: ImageBuffer<Rgb<u8>, Vec<u8>> = ImageBuffer::from_fn(64, 64, pixel_at);
        canvas.save(path).unwrap();
    }

    /// Write the seeded test pattern as `name` inside `dir` and hand back
    /// the full path of the new file.
    fn seeded_png(dir: &tempfile::TempDir, name: &str, seed: u32) -> std::path::PathBuf {
        let target = dir.path().join(name);
        write_test_image(&target, seed);
        target
    }

    #[test]
    fn identical_bytes_yield_identical_hashes() {
        let scratch = tempfile::tempdir().unwrap();
        let first = seeded_png(&scratch, "a.png", 42);
        let second = seeded_png(&scratch, "b.png", 42);

        let first_id = compute(&first).expect("hash a");
        let second_id = compute(&second).expect("hash b");

        assert_eq!(first_id, second_id);
        assert_eq!(hamming_distance(first_id.phash_64, second_id.phash_64), 0);
    }

    #[test]
    fn distinct_images_have_distinct_hashes() {
        let scratch = tempfile::tempdir().unwrap();
        let first = seeded_png(&scratch, "a.png", 42);
        let second = seeded_png(&scratch, "b.png", 123);

        let first_id = compute(&first).expect("hash a");
        let second_id = compute(&second).expect("hash b");

        assert_ne!(first_id.phash_64, second_id.phash_64);
    }

    #[test]
    fn resized_copy_is_near_duplicate_under_threshold() {
        // The reason perceptual hashing exists: a resized copy of the same
        // source image should sit within a small Hamming distance of the
        // original. dHash is the one asserted on because it is the more
        // resize-robust of the two; pHash is also tight, but the
        // gradient-based dHash gives the most reliable signal here.
        let scratch = tempfile::tempdir().unwrap();
        let original = seeded_png(&scratch, "a.png", 7);

        let shrunk = image::open(&original).unwrap().resize_exact(
            32,
            32,
            image::imageops::FilterType::Lanczos3,
        );
        let resized = scratch.path().join("b.png");
        shrunk.save(&resized).unwrap();

        let original_id = compute(&original).expect("hash a");
        let resized_id = compute(&resized).expect("hash b");

        let d_dhash = hamming_distance(original_id.dhash_64, resized_id.dhash_64);
        assert!(
            d_dhash <= 8,
            "expected dhash Hamming distance <= 8 for resized copy, got {}",
            d_dhash
        );
    }

    #[test]
    fn unsupported_path_returns_none() {
        let scratch = tempfile::tempdir().unwrap();
        let text_file = scratch.path().join("notanimage.txt");
        std::fs::write(&text_file, b"hello").unwrap();

        assert!(compute(&text_file).is_none());
    }

    #[test]
    fn missing_file_returns_none() {
        let absent = Path::new("/nonexistent/path/that/does/not/exist.png");
        assert!(compute(absent).is_none());
    }
}
|
||||
Reference in New Issue
Block a user