Adds blake3 content hashing as the basis for derivative dedup (thumbnails, HLS) across libraries. Hashes are computed inline by the watcher on ingest and by a new `backfill_hashes` binary for historical rows. Key changes: - `content_hash` and `size_bytes` are now populated on new image_exif rows; a new ExifDao surface (`get_rows_missing_hash`, `backfill_content_hash`, `find_by_content_hash`) supports backfill and future hash-keyed lookups. - The watcher now registers every image/video in image_exif, not just files with parseable EXIF. EXIF becomes optional enrichment; videos and other non-EXIF files still get a hashed row. This also makes DB-indexed sort/filter cover the full library. - The `/image` thumbnail endpoint looks up the hash-keyed path first, then falls back to the legacy mirrored layout. - The upload flow accepts a `?library=` query param and hashes uploaded files. - `store_exif` logs the underlying Diesel error on insert failure so constraint violations surface instead of hiding behind a generic InsertError. - A new migration normalizes rel_path separators to forward slashes across all tables, deduplicating any rows that collide after normalization. This fixes spurious UNIQUE violations from mixed backslash/forward-slash paths on Windows ingest. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
104 lines
3.4 KiB
Rust
104 lines
3.4 KiB
Rust
//! Content-based file identity used to dedup derivative outputs
|
|
//! (thumbnails, HLS segments) across libraries.
|
|
//!
|
|
//! Hashes are computed with blake3 streaming so that network-mounted
|
|
//! libraries don't need to load whole files into memory. The result is
|
|
//! a 64-character hex string; we shard derivative directories on the
|
|
//! first two characters to keep any single directory's fanout bounded.
|
|
|
|
use std::fs::File;
|
|
use std::io::{self, Read};
|
|
use std::path::{Path, PathBuf};
|
|
|
|
/// Chunk size, in bytes, for each read while a file is streamed through blake3.
/// 1 MiB (2^20) spends a little extra memory to cut the number of read
/// syscalls, which matters most on slow network mounts.
const HASH_BUFFER_SIZE: usize = 1 << 20;
|
|
|
|
/// Content-derived identity of a file: its digest plus its length in bytes.
///
/// Two values compare equal only when both the hash and the size match.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct FileIdentity {
    /// Hex-encoded content digest of the file.
    pub content_hash: String,
    /// Length of the file in bytes.
    // NOTE(review): signed, presumably to match the database integer column — confirm.
    pub size_bytes: i64,
}
|
|
|
|
/// Stream a file through blake3 and return the hex-encoded digest + size.
|
|
pub fn compute(path: &Path) -> io::Result<FileIdentity> {
|
|
let mut file = File::open(path)?;
|
|
let size_bytes = file.metadata()?.len() as i64;
|
|
|
|
let mut hasher = blake3::Hasher::new();
|
|
let mut buf = vec![0u8; HASH_BUFFER_SIZE];
|
|
loop {
|
|
let n = file.read(&mut buf)?;
|
|
if n == 0 {
|
|
break;
|
|
}
|
|
hasher.update(&buf[..n]);
|
|
}
|
|
|
|
Ok(FileIdentity {
|
|
content_hash: hasher.finalize().to_hex().to_string(),
|
|
size_bytes,
|
|
})
|
|
}
|
|
|
|
/// Hash-keyed thumbnail path: `<thumbs_dir>/<hash[..2]>/<hash>.jpg`.
|
|
/// Generation and serving both consult this first; the legacy mirrored
|
|
/// path acts as a fallback for pre-backfill rows.
|
|
pub fn thumbnail_path(thumbs_dir: &Path, hash: &str) -> PathBuf {
|
|
let shard = shard_prefix(hash);
|
|
thumbs_dir.join(shard).join(format!("{}.jpg", hash))
|
|
}
|
|
|
|
/// Hash-keyed HLS output directory: `<video_dir>/<hash[..2]>/<hash>/`.
|
|
/// The playlist lives at `playlist.m3u8` inside this directory and its
|
|
/// segments are co-located so HLS relative references Just Work.
|
|
pub fn hls_dir(video_dir: &Path, hash: &str) -> PathBuf {
|
|
let shard = shard_prefix(hash);
|
|
video_dir.join(shard).join(hash)
|
|
}
|
|
|
|
/// Returns the first two characters of `hash`, or the whole string when it
/// has two characters or fewer.
///
/// The cut is taken at the byte offset of the third character, so slicing
/// always lands on a char boundary and cannot panic on multi-byte input.
fn shard_prefix(hash: &str) -> &str {
    match hash.char_indices().nth(2) {
        Some((cut, _)) => &hash[..cut],
        None => hash,
    }
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Two files with the same bytes must produce the same identity, and the
    /// reported size must match the number of bytes written.
    #[test]
    fn identical_content_yields_identical_hash() {
        let scratch = tempfile::tempdir().unwrap();
        let first = scratch.path().join("a.bin");
        let second = scratch.path().join("b.bin");
        std::fs::write(&first, b"hello world").unwrap();
        std::fs::write(&second, b"hello world").unwrap();

        let first_id = compute(&first).unwrap();
        let second_id = compute(&second).unwrap();

        assert_eq!(first_id, second_id);
        assert_eq!(first_id.size_bytes, 11);
    }

    /// Files of equal length but different bytes must not share an identity.
    #[test]
    fn different_content_yields_different_hash() {
        let scratch = tempfile::tempdir().unwrap();
        let first = scratch.path().join("a.bin");
        let second = scratch.path().join("b.bin");
        std::fs::write(&first, b"aaa").unwrap();
        std::fs::write(&second, b"bbb").unwrap();

        let first_id = compute(&first).unwrap();
        let second_id = compute(&second).unwrap();

        assert_ne!(first_id, second_id);
    }

    /// Both derivative layouts place output under a directory named after
    /// the first two characters of the hash.
    #[test]
    fn derivative_paths_shard_by_first_two_hex() {
        let thumb = thumbnail_path(Path::new("/tmp/thumbs"), "abcdef0123");
        let expected_thumb = PathBuf::from("/tmp/thumbs/ab/abcdef0123.jpg");
        assert_eq!(thumb, expected_thumb);

        let hls = hls_dir(Path::new("/tmp/video"), "1234deadbeef");
        let expected_hls = PathBuf::from("/tmp/video/12/1234deadbeef");
        assert_eq!(hls, expected_hls);
    }
}
|