//! Content-based file identity used to dedup derivative outputs //! (thumbnails, HLS segments) across libraries. //! //! Hashes are computed with blake3 streaming so that network-mounted //! libraries don't need to load whole files into memory. The result is //! a 64-character hex string; we shard derivative directories on the //! first two characters to keep any single directory's fanout bounded. use std::fs::File; use std::io::{self, Read}; use std::path::{Path, PathBuf}; /// Size of the read buffer used when streaming a file through blake3. /// 1 MiB trades a bit of RSS for fewer syscalls on slow network mounts. const HASH_BUFFER_SIZE: usize = 1024 * 1024; /// Hash identity of a file, together with its byte length. #[derive(Clone, Debug, PartialEq, Eq)] pub struct FileIdentity { pub content_hash: String, pub size_bytes: i64, } /// Stream a file through blake3 and return the hex-encoded digest + size. pub fn compute(path: &Path) -> io::Result { let mut file = File::open(path)?; let size_bytes = file.metadata()?.len() as i64; let mut hasher = blake3::Hasher::new(); let mut buf = vec![0u8; HASH_BUFFER_SIZE]; loop { let n = file.read(&mut buf)?; if n == 0 { break; } hasher.update(&buf[..n]); } Ok(FileIdentity { content_hash: hasher.finalize().to_hex().to_string(), size_bytes, }) } /// Hash-keyed thumbnail path: `//.jpg`. /// Generation and serving both consult this first; the legacy mirrored /// path acts as a fallback for pre-backfill rows. pub fn thumbnail_path(thumbs_dir: &Path, hash: &str) -> PathBuf { let shard = shard_prefix(hash); thumbs_dir.join(shard).join(format!("{}.jpg", hash)) } /// Hash-keyed HLS output directory: `///`. /// The playlist lives at `playlist.m3u8` inside this directory and its /// segments are co-located so HLS relative references Just Work. /// /// Allow-dead until Branch B/C rewires the HLS pipeline to use it; the /// helper lives here today so Branch A's path layout decisions stay /// adjacent to thumbnail/legacy ones. #[allow(dead_code)] pub fn hls_dir(video_dir: &Path, hash: &str) -> PathBuf { let shard = shard_prefix(hash); video_dir.join(shard).join(hash) } /// Library-scoped legacy mirrored path: /// `//`. Used as the fallback when /// `content_hash` isn't available — the library prefix prevents the /// "lib1 wrote `vacation/IMG.jpg` first, lib2 sees thumb_path.exists() /// and serves the wrong image" failure mode. /// /// Existing single-library deployments may already have thumbnails at the /// bare-legacy `/` shape; serving code is /// expected to check both this scoped path and the bare-legacy path so /// nothing 404s during the transition. pub fn library_scoped_legacy_path( derivative_dir: &Path, library_id: i32, rel_path: impl AsRef, ) -> PathBuf { derivative_dir .join(library_id.to_string()) .join(rel_path) } fn shard_prefix(hash: &str) -> &str { let end = hash .char_indices() .nth(2) .map(|(i, _)| i) .unwrap_or(hash.len()); &hash[..end] } #[cfg(test)] mod tests { use super::*; #[test] fn identical_content_yields_identical_hash() { let dir = tempfile::tempdir().unwrap(); let a = dir.path().join("a.bin"); let b = dir.path().join("b.bin"); std::fs::write(&a, b"hello world").unwrap(); std::fs::write(&b, b"hello world").unwrap(); let ha = compute(&a).unwrap(); let hb = compute(&b).unwrap(); assert_eq!(ha, hb); assert_eq!(ha.size_bytes, 11); } #[test] fn different_content_yields_different_hash() { let dir = tempfile::tempdir().unwrap(); let a = dir.path().join("a.bin"); let b = dir.path().join("b.bin"); std::fs::write(&a, b"aaa").unwrap(); std::fs::write(&b, b"bbb").unwrap(); assert_ne!(compute(&a).unwrap(), compute(&b).unwrap()); } #[test] fn derivative_paths_shard_by_first_two_hex() { let thumbs = Path::new("/tmp/thumbs"); let p = thumbnail_path(thumbs, "abcdef0123"); assert_eq!(p, PathBuf::from("/tmp/thumbs/ab/abcdef0123.jpg")); let video = Path::new("/tmp/video"); let d = hls_dir(video, "1234deadbeef"); assert_eq!(d, PathBuf::from("/tmp/video/12/1234deadbeef")); } #[test] fn library_scoped_legacy_path_prefixes_with_library_id() { let thumbs = Path::new("/tmp/thumbs"); let p = library_scoped_legacy_path(thumbs, 7, "vacation/IMG.jpg"); assert_eq!(p, PathBuf::from("/tmp/thumbs/7/vacation/IMG.jpg")); // Same rel_path, different library — different output. This is // the whole point: lib 1 and lib 2 don't clobber each other. let p1 = library_scoped_legacy_path(thumbs, 1, "vacation/IMG.jpg"); let p2 = library_scoped_legacy_path(thumbs, 2, "vacation/IMG.jpg"); assert_ne!(p1, p2); } }