feat: add content_hash backfill + register every media file
Adds blake3 content hashing as the basis for derivative dedup (thumbnails, HLS) across libraries. Hashes are computed inline by the watcher on ingest and by a new `backfill_hashes` binary for historical rows.

Key changes:
- `content_hash` and `size_bytes` are now populated on new image_exif rows; a new ExifDao surface (`get_rows_missing_hash`, `backfill_content_hash`, `find_by_content_hash`) supports backfill and future hash-keyed lookups.
- The watcher now registers every image/video in image_exif, not just files with parseable EXIF. EXIF becomes optional enrichment; videos and other non-EXIF files still get a hashed row. This also makes DB-indexed sort/filter cover the full library.
- `/image` thumbnail serving looks up the hash-keyed path first, then falls back to the legacy mirrored layout.
- Upload flow accepts a `?library=` query param and hashes uploaded files.
- Store_exif logs the underlying Diesel error on insert failure so constraint violations surface instead of hiding behind a generic InsertError.
- New migration normalizes rel_path separators to forward slash across all tables, deduplicating any rows that collide after normalization. Fixes spurious UNIQUE violations from mixed backslash/forward-slash paths on Windows ingest.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
103
src/content_hash.rs
Normal file
103
src/content_hash.rs
Normal file
@@ -0,0 +1,103 @@
|
||||
//! Content-based file identity used to dedup derivative outputs
|
||||
//! (thumbnails, HLS segments) across libraries.
|
||||
//!
|
||||
//! Hashes are computed with blake3 streaming so that network-mounted
|
||||
//! libraries don't need to load whole files into memory. The result is
|
||||
//! a 64-character hex string; we shard derivative directories on the
|
||||
//! first two characters to keep any single directory's fanout bounded.
|
||||
|
||||
use std::fs::File;
|
||||
use std::io::{self, Read};
|
||||
use std::path::{Path, PathBuf};
|
||||
|
||||
/// Capacity, in bytes, of the scratch buffer a file is read into while it is
/// streamed through blake3 (1 MiB).
///
/// A larger buffer costs a little RSS but means fewer syscalls on slow
/// network mounts.
const HASH_BUFFER_SIZE: usize = 1 << 20;
|
||||
|
||||
/// Content-derived identity of a file: its digest paired with its length.
///
/// Two values compare equal only when both the hash and the size match.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct FileIdentity {
    /// Hex-encoded content digest of the file.
    pub content_hash: String,
    /// Length of the file in bytes.
    // NOTE(review): signed rather than `u64`, presumably to match the
    // database column type — confirm against the schema.
    pub size_bytes: i64,
}
|
||||
|
||||
/// Stream a file through blake3 and return the hex-encoded digest + size.
|
||||
pub fn compute(path: &Path) -> io::Result<FileIdentity> {
|
||||
let mut file = File::open(path)?;
|
||||
let size_bytes = file.metadata()?.len() as i64;
|
||||
|
||||
let mut hasher = blake3::Hasher::new();
|
||||
let mut buf = vec![0u8; HASH_BUFFER_SIZE];
|
||||
loop {
|
||||
let n = file.read(&mut buf)?;
|
||||
if n == 0 {
|
||||
break;
|
||||
}
|
||||
hasher.update(&buf[..n]);
|
||||
}
|
||||
|
||||
Ok(FileIdentity {
|
||||
content_hash: hasher.finalize().to_hex().to_string(),
|
||||
size_bytes,
|
||||
})
|
||||
}
|
||||
|
||||
/// Hash-keyed thumbnail path: `<thumbs_dir>/<hash[..2]>/<hash>.jpg`.
|
||||
/// Generation and serving both consult this first; the legacy mirrored
|
||||
/// path acts as a fallback for pre-backfill rows.
|
||||
pub fn thumbnail_path(thumbs_dir: &Path, hash: &str) -> PathBuf {
|
||||
let shard = shard_prefix(hash);
|
||||
thumbs_dir.join(shard).join(format!("{}.jpg", hash))
|
||||
}
|
||||
|
||||
/// Hash-keyed HLS output directory: `<video_dir>/<hash[..2]>/<hash>/`.
|
||||
/// The playlist lives at `playlist.m3u8` inside this directory and its
|
||||
/// segments are co-located so HLS relative references Just Work.
|
||||
pub fn hls_dir(video_dir: &Path, hash: &str) -> PathBuf {
|
||||
let shard = shard_prefix(hash);
|
||||
video_dir.join(shard).join(hash)
|
||||
}
|
||||
|
||||
/// Return the leading two characters of `hash`, used as the shard directory
/// name. Inputs shorter than two characters are returned unchanged.
///
/// The cut is made on a character boundary, so slicing never panics on
/// multi-byte input.
fn shard_prefix(hash: &str) -> &str {
    match hash.char_indices().nth(2) {
        // Byte offset where the third character starts.
        Some((cut, _)) => &hash[..cut],
        None => hash,
    }
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Write `bytes` to a file called `name` inside `dir` and return its path.
    fn fixture(dir: &Path, name: &str, bytes: &[u8]) -> PathBuf {
        let path = dir.join(name);
        std::fs::write(&path, bytes).unwrap();
        path
    }

    /// Two files with the same bytes must produce the same identity.
    #[test]
    fn identical_content_yields_identical_hash() {
        let tmp = tempfile::tempdir().unwrap();
        let first = fixture(tmp.path(), "a.bin", b"hello world");
        let second = fixture(tmp.path(), "b.bin", b"hello world");

        let first_id = compute(&first).unwrap();
        let second_id = compute(&second).unwrap();

        assert_eq!(first_id, second_id);
        assert_eq!(first_id.size_bytes, 11);
    }

    /// Same-length files with different bytes must not collide.
    #[test]
    fn different_content_yields_different_hash() {
        let tmp = tempfile::tempdir().unwrap();
        let first = fixture(tmp.path(), "a.bin", b"aaa");
        let second = fixture(tmp.path(), "b.bin", b"bbb");

        assert_ne!(compute(&first).unwrap(), compute(&second).unwrap());
    }

    /// Derivative paths are nested under the first two characters of the hash.
    #[test]
    fn derivative_paths_shard_by_first_two_hex() {
        let thumb = thumbnail_path(Path::new("/tmp/thumbs"), "abcdef0123");
        assert_eq!(thumb, PathBuf::from("/tmp/thumbs/ab/abcdef0123.jpg"));

        let hls = hls_dir(Path::new("/tmp/video"), "1234deadbeef");
        assert_eq!(hls, PathBuf::from("/tmp/video/12/1234deadbeef"));
    }
}
|
||||
Reference in New Issue
Block a user