Adds pHash + dHash columns alongside the existing blake3 content_hash so
near-duplicates (re-encoded, resized, format-converted copies) become
queryable. /duplicates/{exact,perceptual} return groups; /duplicates/
{resolve,unresolve} flip a duplicate_of_hash soft-mark on losing rows
and union perceptual-only tag sets onto the survivor. The default
/photos listing filters duplicate_of_hash IS NULL so demoted siblings
stop cluttering the grid; include_duplicates=true opts back in for
Apollo's review modal. Upload now hashes bytes pre-write and returns
409 with the canonical sibling when a file's bytes already exist.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
160 lines
5.6 KiB
Rust
160 lines
5.6 KiB
Rust
//! Perceptual image hashing for near-duplicate detection.
//!
//! Two 64-bit signals per image, each packed into an `i64` for storage and
//! fast Hamming-distance comparison via XOR + popcount:
//!
//! - **pHash (DCT)** — robust to lossy recompression, format conversion,
//!   and moderate brightness/contrast shifts. The primary signal.
//! - **dHash (gradient)** — much cheaper to compute, robust to scaling
//!   and small crops. Acts as a fallback / corroboration when pHash is
//!   ambiguous (very flat images can collide).
//!
//! Image-only by design. Videos, decode failures, and any image we
//! can't open all return `None` — perceptual hash failure is non-fatal
//! and must not block the indexer; the file is still hashed by blake3
//! and exact-match dedup keeps working.
|
|
|
|
use std::path::Path;
|
|
|
|
use image_hasher::{HashAlg, HasherConfig};
|
|
|
|
/// The pair of 64-bit perceptual fingerprints computed for a single image.
///
/// Each fingerprint is kept as an `i64`; two values of the same kind are
/// compared by Hamming distance (XOR followed by a popcount), not by
/// numeric ordering.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct PerceptualIdentity {
    /// pHash: the mean hash taken after DCT preprocessing on an 8x8 grid.
    pub phash_64: i64,
    /// dHash: the gradient hash taken on an 8x8 grid.
    pub dhash_64: i64,
}
|
|
|
|
/// Compute pHash + dHash for an image at `path`. Returns `None` on
|
|
/// decode failure (unsupported format, corrupt bytes, video, etc.) —
|
|
/// callers should treat that as "no perceptual signal available" and
|
|
/// proceed with exact-match dedup only.
|
|
pub fn compute(path: &Path) -> Option<PerceptualIdentity> {
|
|
let img = image::open(path).ok()?;
|
|
|
|
// 8x8 = 64 bits, the standard size for pHash/dHash. Larger sizes
|
|
// give more discriminative power but no longer fit in i64 and the
|
|
// marginal robustness isn't worth the storage / index cost for a
|
|
// personal-scale library.
|
|
let phash = HasherConfig::new()
|
|
.hash_alg(HashAlg::Mean)
|
|
.hash_size(8, 8)
|
|
.preproc_dct()
|
|
.to_hasher()
|
|
.hash_image(&img);
|
|
|
|
let dhash = HasherConfig::new()
|
|
.hash_alg(HashAlg::Gradient)
|
|
.hash_size(8, 8)
|
|
.to_hasher()
|
|
.hash_image(&img);
|
|
|
|
Some(PerceptualIdentity {
|
|
phash_64: bytes_to_i64(phash.as_bytes())?,
|
|
dhash_64: bytes_to_i64(dhash.as_bytes())?,
|
|
})
|
|
}
|
|
|
|
/// Count the bit positions in which two 64-bit perceptual hashes differ.
///
/// This is the primary query primitive: two images count as
/// "near-duplicates" when the result is below a threshold (default 8 for
/// pHash, ~12% similarity tolerance). The duplicates module clusters via a
/// BK-tree that carries its own copy of this calculation; this helper is
/// kept for ad-hoc tools and tests.
#[allow(dead_code)]
#[inline]
pub fn hamming_distance(a: i64, b: i64) -> u32 {
    // XOR leaves a 1 exactly where the inputs disagree; popcount tallies them.
    let differing_bits = a ^ b;
    differing_bits.count_ones()
}
|
|
|
|
/// Pack the first eight bytes of `bytes` into an `i64`, reading them as
/// big-endian.
///
/// Returns `None` when fewer than eight bytes are supplied. Any bytes past
/// the eighth are ignored.
fn bytes_to_i64(bytes: &[u8]) -> Option<i64> {
    match *bytes {
        // The trailing `..` accepts (and discards) anything after byte 8.
        [b0, b1, b2, b3, b4, b5, b6, b7, ..] => {
            Some(i64::from_be_bytes([b0, b1, b2, b3, b4, b5, b6, b7]))
        }
        _ => None,
    }
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use image::{ImageBuffer, Rgb};

    /// Save a 64x64 RGB image to `path` whose pixels depend on `seed`.
    ///
    /// The content is deterministic for a given seed and differs between
    /// seeds: a simple gradient with a per-seed offset. That gives
    /// pHash/dHash a real signal to work with (a uniform image collapses
    /// to all-zero hashes).
    fn write_test_image(path: &Path, seed: u32) {
        let canvas: ImageBuffer<Rgb<u8>, Vec<u8>> = ImageBuffer::from_fn(64, 64, |x, y| {
            Rgb([
                ((x + seed) & 0xFF) as u8,
                ((y + seed * 2) & 0xFF) as u8,
                ((x ^ y ^ seed) & 0xFF) as u8,
            ])
        });
        canvas.save(path).unwrap();
    }

    #[test]
    fn identical_bytes_yield_identical_hashes() {
        let scratch = tempfile::tempdir().unwrap();
        let first = scratch.path().join("a.png");
        let second = scratch.path().join("b.png");
        write_test_image(&first, 42);
        write_test_image(&second, 42);

        let first_hash = compute(&first).expect("hash a");
        let second_hash = compute(&second).expect("hash b");

        assert_eq!(first_hash, second_hash);
        assert_eq!(
            hamming_distance(first_hash.phash_64, second_hash.phash_64),
            0
        );
    }

    #[test]
    fn distinct_images_have_distinct_hashes() {
        let scratch = tempfile::tempdir().unwrap();
        let first = scratch.path().join("a.png");
        let second = scratch.path().join("b.png");
        write_test_image(&first, 42);
        write_test_image(&second, 123);

        let first_hash = compute(&first).expect("hash a");
        let second_hash = compute(&second).expect("hash b");

        assert_ne!(first_hash.phash_64, second_hash.phash_64);
    }

    #[test]
    fn resized_copy_is_near_duplicate_under_threshold() {
        // This is what perceptual hashing is for: a resized copy of the
        // same source image should sit within a small Hamming distance of
        // the original. Only the dHash is asserted, as the more
        // resize-robust of the two signals; pHash is also tight, but the
        // gradient-based dHash is the most reliable one here.
        let scratch = tempfile::tempdir().unwrap();
        let original_path = scratch.path().join("a.png");
        write_test_image(&original_path, 7);

        let shrunk = image::open(&original_path)
            .unwrap()
            .resize_exact(32, 32, image::imageops::FilterType::Lanczos3);
        let shrunk_path = scratch.path().join("b.png");
        shrunk.save(&shrunk_path).unwrap();

        let original_hash = compute(&original_path).expect("hash a");
        let shrunk_hash = compute(&shrunk_path).expect("hash b");

        let d_dhash = hamming_distance(original_hash.dhash_64, shrunk_hash.dhash_64);
        assert!(
            d_dhash <= 8,
            "expected dhash Hamming distance <= 8 for resized copy, got {}",
            d_dhash
        );
    }

    #[test]
    fn unsupported_path_returns_none() {
        let scratch = tempfile::tempdir().unwrap();
        let text_file = scratch.path().join("notanimage.txt");
        std::fs::write(&text_file, b"hello").unwrap();

        assert!(compute(&text_file).is_none());
    }

    #[test]
    fn missing_file_returns_none() {
        let absent = Path::new("/nonexistent/path/that/does/not/exist.png");

        assert!(compute(absent).is_none());
    }
}
|