//! Perceptual image hashing for near-duplicate detection. //! //! Two 64-bit signals per image, packed into i64 for storage and fast //! Hamming distance via XOR + popcount: //! //! - **pHash (DCT)** — robust to lossy recompression, format conversion, //! moderate brightness/contrast shifts. The primary signal. //! - **dHash (gradient)** — much cheaper to compute, robust to scaling //! and small crops. Acts as a fallback / corroboration when pHash is //! ambiguous (very flat images can collide). //! //! Image-only by design. Videos, decode failures, and any image we //! can't open all return `None` — perceptual hash failure is non-fatal //! and must not block the indexer; the file is still hashed by blake3 //! and exact-match dedup keeps working. use std::path::Path; use image_hasher::{HashAlg, HasherConfig}; /// 64-bit perceptual fingerprint pair. #[derive(Clone, Copy, Debug, PartialEq, Eq)] pub struct PerceptualIdentity { pub phash_64: i64, pub dhash_64: i64, } /// Compute pHash + dHash for an image at `path`. Returns `None` on /// decode failure (unsupported format, corrupt bytes, video, etc.) — /// callers should treat that as "no perceptual signal available" and /// proceed with exact-match dedup only. pub fn compute(path: &Path) -> Option { let img = image::open(path).ok()?; // 8x8 = 64 bits, the standard size for pHash/dHash. Larger sizes // give more discriminative power but no longer fit in i64 and the // marginal robustness isn't worth the storage / index cost for a // personal-scale library. let phash = HasherConfig::new() .hash_alg(HashAlg::Mean) .hash_size(8, 8) .preproc_dct() .to_hasher() .hash_image(&img); let dhash = HasherConfig::new() .hash_alg(HashAlg::Gradient) .hash_size(8, 8) .to_hasher() .hash_image(&img); Some(PerceptualIdentity { phash_64: bytes_to_i64(phash.as_bytes())?, dhash_64: bytes_to_i64(dhash.as_bytes())?, }) } /// Hamming distance between two 64-bit perceptual hashes. 
/// Hamming distance between two 64-bit perceptual hashes.
///
/// The primary query primitive: two images are "near-duplicates" when
/// this is below a threshold (default 8 for pHash, ~12% similarity
/// tolerance). The duplicates module clusters via a BK-tree which uses
/// its own copy of this calculation; this helper is kept for ad-hoc
/// tools and tests.
#[allow(dead_code)]
#[inline]
pub fn hamming_distance(a: i64, b: i64) -> u32 {
    // XOR leaves a 1 in every bit position where the hashes differ.
    (a ^ b).count_ones()
}

/// Pack the first 8 bytes of `bytes` into an `i64`, big-endian.
///
/// Returns `None` when fewer than 8 bytes are supplied; any bytes past
/// the eighth are ignored.
fn bytes_to_i64(bytes: &[u8]) -> Option<i64> {
    if bytes.len() < 8 {
        return None;
    }
    let mut buf = [0u8; 8];
    buf.copy_from_slice(&bytes[..8]);
    Some(i64::from_be_bytes(buf))
}

#[cfg(test)]
mod tests {
    use super::*;
    use image::{ImageBuffer, Rgb};

    /// Write a deterministic 64x64 RGB test image to `path`.
    fn write_test_image(path: &Path, seed: u32) {
        // Deterministic-but-distinct image content: simple gradient with
        // a per-seed offset. Gives pHash/dHash a real signal to work
        // with (a uniform image collapses to all-zero hashes).
        let img: ImageBuffer<Rgb<u8>, Vec<u8>> = ImageBuffer::from_fn(64, 64, |x, y| {
            let r = ((x + seed) & 0xFF) as u8;
            let g = ((y + seed * 2) & 0xFF) as u8;
            let b = ((x ^ y ^ seed) & 0xFF) as u8;
            Rgb([r, g, b])
        });
        img.save(path).unwrap();
    }

    #[test]
    fn identical_bytes_yield_identical_hashes() {
        let dir = tempfile::tempdir().unwrap();
        let a = dir.path().join("a.png");
        let b = dir.path().join("b.png");
        write_test_image(&a, 42);
        write_test_image(&b, 42);

        let ha = compute(&a).expect("hash a");
        let hb = compute(&b).expect("hash b");
        assert_eq!(ha, hb);
        assert_eq!(hamming_distance(ha.phash_64, hb.phash_64), 0);
    }

    #[test]
    fn distinct_images_have_distinct_hashes() {
        let dir = tempfile::tempdir().unwrap();
        let a = dir.path().join("a.png");
        let b = dir.path().join("b.png");
        write_test_image(&a, 42);
        write_test_image(&b, 123);

        let ha = compute(&a).expect("hash a");
        let hb = compute(&b).expect("hash b");
        assert_ne!(ha.phash_64, hb.phash_64);
    }

    #[test]
    fn resized_copy_is_near_duplicate_under_threshold() {
        // The whole point of perceptual hashing: a resized copy of the
        // same source image should land within a small Hamming distance
        // of the original. We check the dHash specifically because it's
        // the more resize-robust of the two; pHash is also tight but
        // gradient-based dHash gives the most reliable signal here.
        let dir = tempfile::tempdir().unwrap();
        let a = dir.path().join("a.png");
        write_test_image(&a, 7);

        let img = image::open(&a).unwrap();
        let small = img.resize_exact(32, 32, image::imageops::FilterType::Lanczos3);
        let b = dir.path().join("b.png");
        small.save(&b).unwrap();

        let ha = compute(&a).expect("hash a");
        let hb = compute(&b).expect("hash b");
        let d_dhash = hamming_distance(ha.dhash_64, hb.dhash_64);
        assert!(
            d_dhash <= 8,
            "expected dhash Hamming distance <= 8 for resized copy, got {}",
            d_dhash
        );
    }

    #[test]
    fn unsupported_path_returns_none() {
        let dir = tempfile::tempdir().unwrap();
        let p = dir.path().join("notanimage.txt");
        std::fs::write(&p, b"hello").unwrap();
        assert!(compute(&p).is_none());
    }

    #[test]
    fn missing_file_returns_none() {
        let p = Path::new("/nonexistent/path/that/does/not/exist.png");
        assert!(compute(p).is_none());
    }
}