//! Content-based file identity used to dedup derivative outputs //! (thumbnails, HLS segments) across libraries. //! //! Hashes are computed with blake3 streaming so that network-mounted //! libraries don't need to load whole files into memory. The result is //! a 64-character hex string; we shard derivative directories on the //! first two characters to keep any single directory's fanout bounded. use std::fs::File; use std::io::{self, Read}; use std::path::{Path, PathBuf}; /// Size of the read buffer used when streaming a file through blake3. /// 1 MiB trades a bit of RSS for fewer syscalls on slow network mounts. const HASH_BUFFER_SIZE: usize = 1024 * 1024; /// Hash identity of a file, together with its byte length. #[derive(Clone, Debug, PartialEq, Eq)] pub struct FileIdentity { pub content_hash: String, pub size_bytes: i64, } /// Stream a file through blake3 and return the hex-encoded digest + size. pub fn compute(path: &Path) -> io::Result { let mut file = File::open(path)?; let size_bytes = file.metadata()?.len() as i64; let mut hasher = blake3::Hasher::new(); let mut buf = vec![0u8; HASH_BUFFER_SIZE]; loop { let n = file.read(&mut buf)?; if n == 0 { break; } hasher.update(&buf[..n]); } Ok(FileIdentity { content_hash: hasher.finalize().to_hex().to_string(), size_bytes, }) } /// Hash-keyed thumbnail path: `//.jpg`. /// Generation and serving both consult this first; the legacy mirrored /// path acts as a fallback for pre-backfill rows. pub fn thumbnail_path(thumbs_dir: &Path, hash: &str) -> PathBuf { let shard = shard_prefix(hash); thumbs_dir.join(shard).join(format!("{}.jpg", hash)) } /// Hash-keyed HLS output directory: `///`. /// The playlist lives at `playlist.m3u8` inside this directory and its /// segments are co-located so HLS relative references Just Work. 
// NOTE(review): `dead_code` is presumably allowed because nothing calls this
// outside the tests yet — confirm, and drop the attribute once a caller exists.
#[allow(dead_code)]
pub fn hls_dir(video_dir: &Path, hash: &str) -> PathBuf {
    // Shard directory first, then the per-hash directory beneath it.
    let mut dir = video_dir.join(shard_prefix(hash));
    dir.push(hash);
    dir
}

/// Return the first two characters of `hash`, or all of `hash` when it has
/// two characters or fewer. The cut is made on a character boundary.
fn shard_prefix(hash: &str) -> &str {
    // The byte offset of the third character is where the prefix ends.
    match hash.char_indices().nth(2) {
        Some((cut, _)) => &hash[..cut],
        None => hash,
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn identical_content_yields_identical_hash() {
        let scratch = tempfile::tempdir().unwrap();
        let first = scratch.path().join("a.bin");
        let second = scratch.path().join("b.bin");
        std::fs::write(&first, b"hello world").unwrap();
        std::fs::write(&second, b"hello world").unwrap();

        let first_id = compute(&first).unwrap();
        let second_id = compute(&second).unwrap();

        // Same bytes must give the same digest and the same length.
        assert_eq!(first_id, second_id);
        assert_eq!(first_id.size_bytes, 11);
    }

    #[test]
    fn different_content_yields_different_hash() {
        let scratch = tempfile::tempdir().unwrap();
        let first = scratch.path().join("a.bin");
        let second = scratch.path().join("b.bin");
        std::fs::write(&first, b"aaa").unwrap();
        std::fs::write(&second, b"bbb").unwrap();

        let first_id = compute(&first).unwrap();
        let second_id = compute(&second).unwrap();
        assert_ne!(first_id, second_id);
    }

    #[test]
    fn derivative_paths_shard_by_first_two_hex() {
        let thumb = thumbnail_path(Path::new("/tmp/thumbs"), "abcdef0123");
        assert_eq!(thumb, PathBuf::from("/tmp/thumbs/ab/abcdef0123.jpg"));

        let hls = hls_dir(Path::new("/tmp/video"), "1234deadbeef");
        assert_eq!(hls, PathBuf::from("/tmp/video/12/1234deadbeef"));
    }
}