Files
ImageApi/src/content_hash.rs
Cameron 524f00b068 feat: add content_hash backfill + register every media file
Adds blake3 content hashing as the basis for derivative dedup
(thumbnails, HLS) across libraries. Computed inline by the watcher on
ingest and by a new `backfill_hashes` binary for historical rows.

Key changes:
- `content_hash` and `size_bytes` are now populated on new image_exif
  rows; a new ExifDao surface (`get_rows_missing_hash`,
  `backfill_content_hash`, `find_by_content_hash`) supports backfill and
  future hash-keyed lookups.
- The watcher now registers every image/video in image_exif, not just
  files with parseable EXIF. EXIF becomes optional enrichment; videos
  and other non-EXIF files still get a hashed row. This also makes
  DB-indexed sort/filter cover the full library.
- `/image` thumbnail serving looks up the hash-keyed path first, then
  falls back to the legacy mirrored layout.
- Upload flow accepts `?library=` query param + hashes uploaded files.
- Store_exif logs the underlying Diesel error on insert failure so
  constraint violations surface instead of hiding behind a generic
  InsertError.
- New migration normalizes rel_path separators to forward slash across
  all tables, deduplicating any rows that collide after normalization.
  Fixes spurious UNIQUE violations from mixed backslash/forward-slash
  paths on Windows ingest.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-04-17 16:25:39 -04:00

104 lines
3.4 KiB
Rust

//! Content-based file identity used to dedup derivative outputs
//! (thumbnails, HLS segments) across libraries.
//!
//! Hashes are computed with blake3 streaming so that network-mounted
//! libraries don't need to load whole files into memory. The result is
//! a 64-character hex string; we shard derivative directories on the
//! first two characters to keep any single directory's fanout bounded.
use std::fs::File;
use std::io::{self, Read};
use std::path::{Path, PathBuf};
/// Capacity, in bytes, of the scratch buffer that file contents are read
/// into while hashing: 1 MiB (2^20). A larger buffer costs some resident
/// memory but means fewer read syscalls, which matters on slow network
/// mounts.
const HASH_BUFFER_SIZE: usize = 1 << 20;
/// A file's content digest paired with its length in bytes.
///
/// Two identities compare equal only when both the digest and the size
/// match.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct FileIdentity {
    /// Hex-encoded digest of the file's contents.
    pub content_hash: String,
    /// Length of the file in bytes.
    // NOTE(review): presumably signed so it maps onto a database integer
    // column — confirm against the schema.
    pub size_bytes: i64,
}
/// Stream a file through blake3 and return its hex-encoded digest and size.
///
/// The file is read in `HASH_BUFFER_SIZE`-byte chunks, so it is never held
/// in memory as a whole. `size_bytes` is the number of bytes actually fed to
/// the hasher, so the digest and the size always describe the same content
/// even if the file's length changes while it is being read.
///
/// # Errors
///
/// Returns any I/O error raised while opening or reading `path`, except
/// `ErrorKind::Interrupted`, which is retried. Returns
/// `ErrorKind::InvalidData` if the byte count does not fit in an `i64`.
pub fn compute(path: &Path) -> io::Result<FileIdentity> {
    let mut file = File::open(path)?;
    let mut hasher = blake3::Hasher::new();
    let mut buf = vec![0u8; HASH_BUFFER_SIZE];
    let mut size_bytes: i64 = 0;
    loop {
        let n = match file.read(&mut buf) {
            // A zero-length read means end of file.
            Ok(0) => break,
            Ok(n) => n,
            // `Read::read` documents `Interrupted` as non-fatal: retry
            // rather than failing the whole hash.
            Err(e) if e.kind() == io::ErrorKind::Interrupted => continue,
            Err(e) => return Err(e),
        };
        hasher.update(&buf[..n]);
        // `n` never exceeds `buf.len()`, so the cast is lossless; the
        // checked add keeps the running total from silently wrapping.
        size_bytes = size_bytes.checked_add(n as i64).ok_or_else(|| {
            io::Error::new(io::ErrorKind::InvalidData, "file size exceeds i64::MAX")
        })?;
    }
    Ok(FileIdentity {
        content_hash: hasher.finalize().to_hex().to_string(),
        size_bytes,
    })
}
/// Build the hash-keyed location of a thumbnail:
/// `<thumbs_dir>/<hash[..2]>/<hash>.jpg`.
///
/// This is the first place both thumbnail generation and serving look;
/// rows that have not been backfilled yet fall back to the legacy
/// mirrored path.
///
/// `hash` is used verbatim as a path component and is not validated here.
pub fn thumbnail_path(thumbs_dir: &Path, hash: &str) -> PathBuf {
    let file_name = format!("{}.jpg", hash);
    let mut out = thumbs_dir.join(shard_prefix(hash));
    out.push(file_name);
    out
}
/// Build the hash-keyed HLS output directory:
/// `<video_dir>/<hash[..2]>/<hash>/`.
///
/// The playlist is stored as `playlist.m3u8` inside this directory with
/// its segments alongside it, so the playlist's relative references
/// resolve without rewriting.
///
/// `hash` is used verbatim as a path component and is not validated here.
pub fn hls_dir(video_dir: &Path, hash: &str) -> PathBuf {
    let mut out = video_dir.join(shard_prefix(hash));
    out.push(hash);
    out
}
/// Return the first two characters of `hash`, used as the shard directory
/// name. A `hash` shorter than three characters is returned unchanged.
///
/// The cut is made on a character boundary, so slicing cannot panic on
/// multi-byte input.
fn shard_prefix(hash: &str) -> &str {
    // The byte offset of the third character (if any) is where the
    // two-character prefix ends.
    match hash.char_indices().nth(2) {
        Some((cut, _)) => &hash[..cut],
        None => hash,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Two files with the same bytes must produce the same identity.
    #[test]
    fn identical_content_yields_identical_hash() {
        let dir = tempfile::tempdir().unwrap();
        let a = dir.path().join("a.bin");
        let b = dir.path().join("b.bin");
        std::fs::write(&a, b"hello world").unwrap();
        std::fs::write(&b, b"hello world").unwrap();
        let ha = compute(&a).unwrap();
        let hb = compute(&b).unwrap();
        assert_eq!(ha, hb);
        assert_eq!(ha.size_bytes, 11);
    }

    /// Files of equal length but different bytes must not collide.
    #[test]
    fn different_content_yields_different_hash() {
        let dir = tempfile::tempdir().unwrap();
        let a = dir.path().join("a.bin");
        let b = dir.path().join("b.bin");
        std::fs::write(&a, b"aaa").unwrap();
        std::fs::write(&b, b"bbb").unwrap();
        assert_ne!(compute(&a).unwrap(), compute(&b).unwrap());
    }

    /// An empty file still hashes successfully, reports size zero, and
    /// yields the 64-character hex digest the module documents.
    #[test]
    fn empty_file_yields_zero_size_and_full_length_digest() {
        let dir = tempfile::tempdir().unwrap();
        let empty = dir.path().join("empty.bin");
        std::fs::write(&empty, b"").unwrap();
        let id = compute(&empty).unwrap();
        assert_eq!(id.size_bytes, 0);
        assert_eq!(id.content_hash.len(), 64);
        assert!(id.content_hash.chars().all(|c| c.is_ascii_hexdigit()));
    }

    /// Derivative paths are sharded on the first two characters of the hash.
    #[test]
    fn derivative_paths_shard_by_first_two_hex() {
        let thumbs = Path::new("/tmp/thumbs");
        let p = thumbnail_path(thumbs, "abcdef0123");
        assert_eq!(p, PathBuf::from("/tmp/thumbs/ab/abcdef0123.jpg"));
        let video = Path::new("/tmp/video");
        let d = hls_dir(video, "1234deadbeef");
        assert_eq!(d, PathBuf::from("/tmp/video/12/1234deadbeef"));
    }

    /// A hash shorter than three characters is used whole as its own
    /// shard instead of panicking on the slice.
    #[test]
    fn short_hash_is_its_own_shard() {
        let thumbs = Path::new("/tmp/thumbs");
        assert_eq!(
            thumbnail_path(thumbs, "a"),
            PathBuf::from("/tmp/thumbs/a/a.jpg")
        );
        let video = Path::new("/tmp/video");
        assert_eq!(hls_dir(video, "ab"), PathBuf::from("/tmp/video/ab/ab"));
    }
}