Files
ImageApi/src/content_hash.rs
Cameron Cordes fb4df4b195 style: cargo fmt sweep
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 19:01:00 -04:00

144 lines
5.0 KiB
Rust

//! Content-based file identity used to dedup derivative outputs
//! (thumbnails, HLS segments) across libraries.
//!
//! Hashes are computed with blake3 streaming so that network-mounted
//! libraries don't need to load whole files into memory. The result is
//! a 64-character hex string; we shard derivative directories on the
//! first two characters to keep any single directory's fanout bounded.
use std::fs::File;
use std::io::{self, Read};
use std::path::{Path, PathBuf};
/// Capacity, in bytes, of the scratch buffer that file contents are read
/// into while being hashed. At 1 MiB it costs a little resident memory in
/// exchange for fewer read syscalls on slow network mounts.
const HASH_BUFFER_SIZE: usize = 1 << 20;
/// A file's content fingerprint paired with its length in bytes.
///
/// Two values compare equal only when both the hash and the size match.
#[derive(Debug, Clone, Eq, PartialEq)]
pub struct FileIdentity {
    /// Hex-encoded digest of the file's contents.
    pub content_hash: String,
    /// Length of the file in bytes.
    pub size_bytes: i64,
}
/// Stream a file through blake3 and return the hex-encoded digest + size.
///
/// The file is read in `HASH_BUFFER_SIZE` chunks, so its contents are never
/// held in memory all at once. The reported size is taken from the open
/// handle's metadata before hashing starts.
///
/// # Errors
///
/// Returns any I/O error raised while opening the file, querying its
/// metadata, or reading it. Reads that fail with
/// `io::ErrorKind::Interrupted` are retried instead of being reported. A
/// length that does not fit in `i64` is reported as
/// `io::ErrorKind::InvalidData`.
pub fn compute(path: &Path) -> io::Result<FileIdentity> {
    let mut file = File::open(path)?;

    // Bounds-check before narrowing: a bare `as i64` would silently wrap an
    // oversized length into a negative size.
    let len = file.metadata()?.len();
    if len > i64::MAX as u64 {
        return Err(io::Error::new(
            io::ErrorKind::InvalidData,
            "file length does not fit in i64",
        ));
    }
    let size_bytes = len as i64;

    let mut hasher = blake3::Hasher::new();
    let mut buf = vec![0u8; HASH_BUFFER_SIZE];
    loop {
        let n = match file.read(&mut buf) {
            // A zero-length read signals end of file.
            Ok(0) => break,
            Ok(n) => n,
            // `Read::read` documents `Interrupted` as retryable; failing the
            // whole hash over a stray signal would be a spurious error.
            Err(err) if err.kind() == io::ErrorKind::Interrupted => continue,
            Err(err) => return Err(err),
        };
        hasher.update(&buf[..n]);
    }

    Ok(FileIdentity {
        content_hash: hasher.finalize().to_hex().to_string(),
        size_bytes,
    })
}
/// Location of the thumbnail for content `hash`:
/// `<thumbs_dir>/<hash[..2]>/<hash>.jpg`.
///
/// Generation and serving both look here first; the legacy mirrored path
/// is only a fallback for rows that predate the hash backfill.
pub fn thumbnail_path(thumbs_dir: &Path, hash: &str) -> PathBuf {
    let file_name = format!("{}.jpg", hash);
    let mut target = thumbs_dir.join(shard_prefix(hash));
    target.push(file_name);
    target
}
/// Directory holding the HLS output for content `hash`:
/// `<video_dir>/<hash[..2]>/<hash>/`.
///
/// The playlist is stored as `playlist.m3u8` inside this directory with
/// its segments alongside it, so the relative references inside the
/// playlist resolve without any rewriting.
///
/// Marked allow-dead until Branch B/C rewires the HLS pipeline to call
/// it; the helper already lives here so that Branch A's path layout
/// decisions sit next to the thumbnail and legacy ones.
#[allow(dead_code)]
pub fn hls_dir(video_dir: &Path, hash: &str) -> PathBuf {
    let mut out_dir = video_dir.to_path_buf();
    out_dir.push(shard_prefix(hash));
    out_dir.push(hash);
    out_dir
}
/// Legacy mirrored path namespaced by library:
/// `<derivative_dir>/<library_id>/<rel_path>`.
///
/// This is the fallback used when no `content_hash` is available. The
/// library prefix rules out the failure mode where lib1 writes
/// `vacation/IMG.jpg` first, lib2 then sees `thumb_path.exists()` and
/// serves the wrong image.
///
/// Existing single-library deployments may already hold thumbnails at the
/// bare-legacy `<derivative_dir>/<rel_path>` shape; serving code is
/// expected to check both this scoped path and the bare-legacy one so that
/// nothing 404s during the transition.
pub fn library_scoped_legacy_path(
    derivative_dir: &Path,
    library_id: i32,
    rel_path: impl AsRef<Path>,
) -> PathBuf {
    let mut scoped = derivative_dir.to_path_buf();
    scoped.push(library_id.to_string());
    scoped.push(rel_path);
    scoped
}
/// Returns the leading two characters of `hash`, or all of `hash` when it
/// has fewer than three characters.
///
/// The cut is made at the byte offset of the third character, so the
/// slice always lands on a character boundary.
fn shard_prefix(hash: &str) -> &str {
    match hash.char_indices().nth(2) {
        Some((cut, _)) => &hash[..cut],
        None => hash,
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Writes `contents` to a file called `name` under `dir` and returns
    /// the path of the new file.
    fn write_fixture(dir: &Path, name: &str, contents: &[u8]) -> PathBuf {
        let path = dir.join(name);
        std::fs::write(&path, contents).unwrap();
        path
    }

    /// Two files with the same bytes hash to the same identity, and the
    /// reported size matches the number of bytes written.
    #[test]
    fn identical_content_yields_identical_hash() {
        let scratch = tempfile::tempdir().unwrap();
        let first = write_fixture(scratch.path(), "a.bin", b"hello world");
        let second = write_fixture(scratch.path(), "b.bin", b"hello world");

        let first_id = compute(&first).unwrap();
        let second_id = compute(&second).unwrap();

        assert_eq!(first_id, second_id);
        assert_eq!(first_id.size_bytes, 11);
    }

    /// Files of equal length but different bytes get different identities.
    #[test]
    fn different_content_yields_different_hash() {
        let scratch = tempfile::tempdir().unwrap();
        let first = write_fixture(scratch.path(), "a.bin", b"aaa");
        let second = write_fixture(scratch.path(), "b.bin", b"bbb");

        assert_ne!(compute(&first).unwrap(), compute(&second).unwrap());
    }

    /// Thumbnail and HLS paths both shard on the first two hash characters.
    #[test]
    fn derivative_paths_shard_by_first_two_hex() {
        let thumb = thumbnail_path(Path::new("/tmp/thumbs"), "abcdef0123");
        assert_eq!(thumb, PathBuf::from("/tmp/thumbs/ab/abcdef0123.jpg"));

        let hls = hls_dir(Path::new("/tmp/video"), "1234deadbeef");
        assert_eq!(hls, PathBuf::from("/tmp/video/12/1234deadbeef"));
    }

    /// The scoped legacy path embeds the library id as a directory level.
    #[test]
    fn library_scoped_legacy_path_prefixes_with_library_id() {
        let root = Path::new("/tmp/thumbs");
        let rel = "vacation/IMG.jpg";

        let scoped = library_scoped_legacy_path(root, 7, rel);
        assert_eq!(scoped, PathBuf::from("/tmp/thumbs/7/vacation/IMG.jpg"));

        // Same rel_path under two libraries must map to two outputs, so
        // that lib 1 and lib 2 never clobber each other.
        let in_lib_one = library_scoped_legacy_path(root, 1, rel);
        let in_lib_two = library_scoped_legacy_path(root, 2, rel);
        assert_ne!(in_lib_one, in_lib_two);
    }
}