diff --git a/Cargo.lock b/Cargo.lock index 3e1e750..e301577 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -474,6 +474,12 @@ dependencies = [ "syn", ] +[[package]] +name = "arrayref" +version = "0.3.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb" + [[package]] name = "arrayvec" version = "0.7.6" @@ -572,6 +578,20 @@ version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6099cdc01846bc367c4e7dd630dc5966dccf36b652fae7a74e17b640411a91b2" +[[package]] +name = "blake3" +version = "1.8.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4d2d5991425dfd0785aed03aedcf0b321d61975c9b5b3689c774a2610ae0b51e" +dependencies = [ + "arrayref", + "arrayvec", + "cc", + "cfg-if", + "constant_time_eq", + "cpufeatures 0.3.0", +] + [[package]] name = "block-buffer" version = "0.10.4" @@ -766,6 +786,12 @@ version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75" +[[package]] +name = "constant_time_eq" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b" + [[package]] name = "convert_case" version = "0.4.0" @@ -808,6 +834,15 @@ dependencies = [ "libc", ] +[[package]] +name = "cpufeatures" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201" +dependencies = [ + "libc", +] + [[package]] name = "crc32fast" version = "1.5.0" @@ -1810,6 +1845,7 @@ dependencies = [ "anyhow", "base64", "bcrypt", + "blake3", "chrono", "clap", "diesel", @@ -3365,7 +3401,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba" dependencies = [ "cfg-if", - "cpufeatures", + "cpufeatures 0.2.17", "digest", ] diff --git a/Cargo.toml b/Cargo.toml index 88b9f09..4dd9da2 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -55,3 +55,4 @@ zerocopy = "0.8" ical = "0.11" scraper = "0.20" base64 = "0.22" +blake3 = "1.5" diff --git a/migrations/2026-04-17-000100_normalize_path_separators/down.sql b/migrations/2026-04-17-000100_normalize_path_separators/down.sql new file mode 100644 index 0000000..4f3169c --- /dev/null +++ b/migrations/2026-04-17-000100_normalize_path_separators/down.sql @@ -0,0 +1,4 @@ +-- No-op: there's no sensible way to recover which rows originally used +-- backslashes, and there's no reason to want backslashes back. The +-- deleted duplicates are also gone. +SELECT 1; diff --git a/migrations/2026-04-17-000100_normalize_path_separators/up.sql b/migrations/2026-04-17-000100_normalize_path_separators/up.sql new file mode 100644 index 0000000..fc3bcdf --- /dev/null +++ b/migrations/2026-04-17-000100_normalize_path_separators/up.sql @@ -0,0 +1,85 @@ +-- Normalize `rel_path` columns to forward slashes. Windows ingest +-- historically produced a mix of `\` and `/`, which broke lookups and +-- caused spurious UNIQUE-constraint violations on re-registration. +-- +-- SQLite enforces UNIQUE per-row during UPDATE, so we have to drop +-- losing duplicates BEFORE normalizing. For each table that has a +-- UNIQUE on rel_path, we delete rows whose normalized form already +-- exists in canonical (forward-slash) form — keeping the existing +-- forward-slash row as the survivor. 
Then a flat UPDATE finishes the +-- job for remaining backslash rows. + +-- image_exif: UNIQUE(library_id, rel_path) +DELETE FROM image_exif + WHERE rel_path LIKE '%\%' + AND EXISTS ( + SELECT 1 FROM image_exif AS other + WHERE other.library_id = image_exif.library_id + AND other.rel_path = REPLACE(image_exif.rel_path, '\', '/') + AND other.id != image_exif.id + ); +UPDATE image_exif + SET rel_path = REPLACE(rel_path, '\', '/') + WHERE rel_path LIKE '%\%'; + +-- favorites: UNIQUE(userid, rel_path) +DELETE FROM favorites + WHERE rel_path LIKE '%\%' + AND EXISTS ( + SELECT 1 FROM favorites AS other + WHERE other.userid = favorites.userid + AND other.rel_path = REPLACE(favorites.rel_path, '\', '/') + AND other.id != favorites.id + ); +UPDATE favorites + SET rel_path = REPLACE(rel_path, '\', '/') + WHERE rel_path LIKE '%\%'; + +-- tagged_photo: UNIQUE(rel_path, tag_id) +DELETE FROM tagged_photo + WHERE rel_path LIKE '%\%' + AND EXISTS ( + SELECT 1 FROM tagged_photo AS other + WHERE other.tag_id = tagged_photo.tag_id + AND other.rel_path = REPLACE(tagged_photo.rel_path, '\', '/') + AND other.id != tagged_photo.id + ); +UPDATE tagged_photo + SET rel_path = REPLACE(rel_path, '\', '/') + WHERE rel_path LIKE '%\%'; + +-- entity_photo_links: UNIQUE(entity_id, library_id, rel_path, role) +DELETE FROM entity_photo_links + WHERE rel_path LIKE '%\%' + AND EXISTS ( + SELECT 1 FROM entity_photo_links AS other + WHERE other.entity_id = entity_photo_links.entity_id + AND other.library_id = entity_photo_links.library_id + AND other.role = entity_photo_links.role + AND other.rel_path = REPLACE(entity_photo_links.rel_path, '\', '/') + AND other.id != entity_photo_links.id + ); +UPDATE entity_photo_links + SET rel_path = REPLACE(rel_path, '\', '/') + WHERE rel_path LIKE '%\%'; + +-- video_preview_clips: UNIQUE(library_id, rel_path) +DELETE FROM video_preview_clips + WHERE rel_path LIKE '%\%' + AND EXISTS ( + SELECT 1 FROM video_preview_clips AS other + WHERE other.library_id = video_preview_clips.library_id + AND other.rel_path = REPLACE(video_preview_clips.rel_path, '\', '/') + AND other.id != video_preview_clips.id + ); +UPDATE video_preview_clips + SET rel_path = REPLACE(rel_path, '\', '/') + WHERE rel_path LIKE '%\%'; + +-- photo_insights has no UNIQUE on rel_path (history table), so a plain +-- normalize is safe. +UPDATE photo_insights + SET rel_path = REPLACE(rel_path, '\', '/') + WHERE rel_path LIKE '%\%'; + +ANALYZE; diff --git a/src/bin/backfill_hashes.rs b/src/bin/backfill_hashes.rs new file mode 100644 index 0000000..807c386 --- /dev/null +++ b/src/bin/backfill_hashes.rs @@ -0,0 +1,184 @@ +//! Backfill `image_exif.content_hash` + `size_bytes` for rows that were +//! ingested before hash computation was wired into the watcher. +//! +//! The watcher computes hashes for new files as they're ingested, so this +//! binary is a one-shot tool for the historical backlog. Safe to re-run; +//! only rows with NULL content_hash are processed. + +use std::path::Path; +use std::sync::{Arc, Mutex}; +use std::time::Instant; + +use clap::Parser; +use rayon::prelude::*; + +use image_api::content_hash; +use image_api::database::{ExifDao, SqliteExifDao, connect}; +use image_api::libraries::{self, Library}; + +#[derive(Parser, Debug)] +#[command(name = "backfill_hashes")] +#[command(about = "Compute content_hash for image_exif rows missing one")] +struct Args { + /// Max rows to hash per batch. The process loops until no rows remain. 
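+    /// For example, `backfill_hashes --batch-size 2000` holds at most 2000
+    /// hash results in memory per pass; the default of 500 is conservative.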
+    #[arg(long, default_value_t = 500)]
+    batch_size: i64,
+
+    /// Rayon parallelism override. 0 uses the default thread pool size.
+    #[arg(long, default_value_t = 0)]
+    parallelism: usize,
+
+    /// Dry-run: log what would be hashed without writing to the DB.
+    #[arg(long)]
+    dry_run: bool,
+}
+
+fn main() -> anyhow::Result<()> {
+    env_logger::init();
+    dotenv::dotenv().ok();
+
+    let args = Args::parse();
+    if args.parallelism > 0 {
+        rayon::ThreadPoolBuilder::new()
+            .num_threads(args.parallelism)
+            .build_global()
+            .expect("Unable to configure rayon thread pool");
+    }
+
+    // Resolve libraries (patch placeholder if still unset) so we can map
+    // library_id back to a root_path on disk.
+    let base_path = dotenv::var("BASE_PATH").ok();
+    let mut seed_conn = connect();
+    if let Some(base) = base_path.as_deref() {
+        libraries::seed_or_patch_from_env(&mut seed_conn, base);
+    }
+    let libs = libraries::load_all(&mut seed_conn);
+    drop(seed_conn);
+    if libs.is_empty() {
+        anyhow::bail!("No libraries configured; cannot backfill hashes");
+    }
+    let libs_by_id: std::collections::HashMap<i32, Library> =
+        libs.into_iter().map(|lib| (lib.id, lib)).collect();
+    println!(
+        "Configured libraries: {}",
+        libs_by_id
+            .values()
+            .map(|l| format!("{} -> {}", l.name, l.root_path))
+            .collect::<Vec<_>>()
+            .join(", ")
+    );
+
+    let dao: Arc<Mutex<Box<dyn ExifDao>>> =
+        Arc::new(Mutex::new(Box::new(SqliteExifDao::new())));
+    let ctx = opentelemetry::Context::new();
+
+    let mut total_hashed = 0u64;
+    let mut total_missing = 0u64;
+    let mut total_errors = 0u64;
+    let start = Instant::now();
+
+    loop {
+        let rows = {
+            let mut guard = dao.lock().expect("Unable to lock ExifDao");
+            guard
+                .get_rows_missing_hash(&ctx, args.batch_size)
+                .map_err(|e| anyhow::anyhow!("DB error: {:?}", e))?
+        };
+        if rows.is_empty() {
+            break;
+        }
+        println!("Processing batch of {} rows", rows.len());
+
+        // Compute hashes in parallel (I/O-bound; rayon helps on local disks,
+        // throttled by network on SMB mounts — use --parallelism to tune).
+        let results: Vec<(i32, String, Option<content_hash::FileIdentity>)> = rows
+            .into_par_iter()
+            .map(|(library_id, rel_path)| {
+                let abs = libs_by_id
+                    .get(&library_id)
+                    .map(|lib| Path::new(&lib.root_path).join(&rel_path));
+                match abs {
+                    Some(abs_path) if abs_path.exists() => {
+                        match content_hash::compute(&abs_path) {
+                            Ok(id) => (library_id, rel_path, Some(id)),
+                            Err(e) => {
+                                eprintln!("hash error for {}: {:?}", abs_path.display(), e);
+                                (library_id, rel_path, None)
+                            }
+                        }
+                    }
+                    Some(_) => (library_id, rel_path, None), // file missing on disk
+                    None => {
+                        eprintln!("Row refers to unknown library_id {}", library_id);
+                        (library_id, rel_path, None)
+                    }
+                }
+            })
+            .collect();
+
+        let hashed_before = total_hashed;
+
+        // Persist sequentially — SQLite writes serialize anyway.
+        if !args.dry_run {
+            let mut guard = dao.lock().expect("Unable to lock ExifDao");
+            for (library_id, rel_path, ident) in &results {
+                match ident {
+                    Some(id) => {
+                        match guard.backfill_content_hash(
+                            &ctx,
+                            *library_id,
+                            rel_path,
+                            &id.content_hash,
+                            id.size_bytes,
+                        ) {
+                            Ok(_) => total_hashed += 1,
+                            Err(e) => {
+                                eprintln!("persist error for {}: {:?}", rel_path, e);
+                                total_errors += 1;
+                            }
+                        }
+                    }
+                    None => {
+                        total_missing += 1;
+                    }
+                }
+            }
+            // The fetch is oldest-first and limited, so a batch in which
+            // nothing was persisted would be returned verbatim on the next
+            // iteration. Stop instead of looping forever on permanently
+            // unhashable rows (files gone from disk, unknown library).
+            if total_hashed == hashed_before {
+                eprintln!(
+                    "No progress in batch of {} rows; stopping early.",
+                    results.len()
+                );
+                break;
+            }
+        } else {
+            for (_, rel_path, ident) in &results {
+                match ident {
+                    Some(id) => {
+                        println!(
+                            "[dry-run] {} -> {} ({} bytes)",
+                            rel_path, id.content_hash, id.size_bytes
+                        );
+                        total_hashed += 1;
+                    }
+                    None => {
+                        total_missing += 1;
+                    }
+                }
+            }
+            println!(
+                "[dry-run] processed one batch of {}. Stopping — a real run would continue \
+                 until no NULL content_hash rows remain.",
+                results.len()
+            );
+            break;
+        }
+
+        let elapsed = start.elapsed().as_secs_f64().max(0.001);
+        let rate = total_hashed as f64 / elapsed;
+        println!(
+            "  hashed={} missing={} errors={} ({:.1} files/sec)",
+            total_hashed, total_missing, total_errors, rate
+        );
+    }
+
+    println!();
+    println!(
+        "Done. hashed={}, skipped (missing or unhashable)={}, errors={}, elapsed={:.1}s",
+        total_hashed,
+        total_missing,
+        total_errors,
+        start.elapsed().as_secs_f64()
+    );
+    Ok(())
+}
diff --git a/src/bin/migrate_exif.rs b/src/bin/migrate_exif.rs
index 3266a89..2f8f868 100644
--- a/src/bin/migrate_exif.rs
+++ b/src/bin/migrate_exif.rs
@@ -67,7 +67,7 @@ fn main() -> anyhow::Result<()> {
     let context = opentelemetry::Context::new();
 
     let relative_path = match path.strip_prefix(&base) {
-        Ok(p) => p.to_str().unwrap().to_string(),
+        Ok(p) => p.to_str().unwrap().replace('\\', "/"),
         Err(_) => {
             eprintln!(
                 "Error: Could not create relative path for {}",
diff --git a/src/content_hash.rs b/src/content_hash.rs
new file mode 100644
index 0000000..63be295
--- /dev/null
+++ b/src/content_hash.rs
@@ -0,0 +1,103 @@
+//! Content-based file identity used to dedup derivative outputs
+//! (thumbnails, HLS segments) across libraries.
+//!
+//! Hashes are computed with blake3 streaming so that network-mounted
+//! libraries don't need to load whole files into memory. The result is
+//! a 64-character hex string; we shard derivative directories on the
+//! first two characters to keep any single directory's fanout bounded.
+
+use std::fs::File;
+use std::io::{self, Read};
+use std::path::{Path, PathBuf};
+
+/// Size of the read buffer used when streaming a file through blake3.
+/// 1 MiB trades a bit of RSS for fewer syscalls on slow network mounts.
+const HASH_BUFFER_SIZE: usize = 1024 * 1024;
+
+/// Hash identity of a file, together with its byte length.
+#[derive(Clone, Debug, PartialEq, Eq)]
+pub struct FileIdentity {
+    pub content_hash: String,
+    pub size_bytes: i64,
+}
+
+/// Stream a file through blake3 and return the hex-encoded digest + size.
+pub fn compute(path: &Path) -> io::Result<FileIdentity> {
+    let mut file = File::open(path)?;
+    let size_bytes = file.metadata()?.len() as i64;
+
+    let mut hasher = blake3::Hasher::new();
+    let mut buf = vec![0u8; HASH_BUFFER_SIZE];
+    loop {
+        let n = file.read(&mut buf)?;
+        if n == 0 {
+            break;
+        }
+        hasher.update(&buf[..n]);
+    }
+
+    Ok(FileIdentity {
+        content_hash: hasher.finalize().to_hex().to_string(),
+        size_bytes,
+    })
+}
+
+/// Hash-keyed thumbnail path: `<thumbs_dir>/<shard>/<hash>.jpg`.
+/// Generation and serving both consult this first; the legacy mirrored
+/// path acts as a fallback for pre-backfill rows.
+pub fn thumbnail_path(thumbs_dir: &Path, hash: &str) -> PathBuf {
+    let shard = shard_prefix(hash);
+    thumbs_dir.join(shard).join(format!("{}.jpg", hash))
+}
+
+/// Hash-keyed HLS output directory: `<video_dir>/<shard>/<hash>/`.
+/// The playlist lives at `playlist.m3u8` inside this directory and its
+/// segments are co-located so HLS relative references Just Work.
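+/// For example, hash `1234deadbeef` maps to
+/// `<video_dir>/12/1234deadbeef/playlist.m3u8` (see the sharding test below).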
+pub fn hls_dir(video_dir: &Path, hash: &str) -> PathBuf {
+    let shard = shard_prefix(hash);
+    video_dir.join(shard).join(hash)
+}
+
+fn shard_prefix(hash: &str) -> &str {
+    let end = hash.char_indices().nth(2).map(|(i, _)| i).unwrap_or(hash.len());
+    &hash[..end]
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn identical_content_yields_identical_hash() {
+        let dir = tempfile::tempdir().unwrap();
+        let a = dir.path().join("a.bin");
+        let b = dir.path().join("b.bin");
+        std::fs::write(&a, b"hello world").unwrap();
+        std::fs::write(&b, b"hello world").unwrap();
+        let ha = compute(&a).unwrap();
+        let hb = compute(&b).unwrap();
+        assert_eq!(ha, hb);
+        assert_eq!(ha.size_bytes, 11);
+    }
+
+    #[test]
+    fn different_content_yields_different_hash() {
+        let dir = tempfile::tempdir().unwrap();
+        let a = dir.path().join("a.bin");
+        let b = dir.path().join("b.bin");
+        std::fs::write(&a, b"aaa").unwrap();
+        std::fs::write(&b, b"bbb").unwrap();
+        assert_ne!(compute(&a).unwrap(), compute(&b).unwrap());
+    }
+
+    #[test]
+    fn derivative_paths_shard_by_first_two_hex() {
+        let thumbs = Path::new("/tmp/thumbs");
+        let p = thumbnail_path(thumbs, "abcdef0123");
+        assert_eq!(p, PathBuf::from("/tmp/thumbs/ab/abcdef0123.jpg"));
+
+        let video = Path::new("/tmp/video");
+        let d = hls_dir(video, "1234deadbeef");
+        assert_eq!(d, PathBuf::from("/tmp/video/12/1234deadbeef"));
+    }
+}
diff --git a/src/database/mod.rs b/src/database/mod.rs
index e1c1c01..f5fe56a 100644
--- a/src/database/mod.rs
+++ b/src/database/mod.rs
@@ -312,6 +312,35 @@ pub trait ExifDao: Sync + Send {
         base_path: &str,
         recursive: bool,
     ) -> Result)>, DbError>;
+
+    /// Return rows that still lack a `content_hash`, oldest first. Used by
+    /// the `backfill_hashes` binary to batch through the historical
+    /// backlog. Returns `(library_id, rel_path)` tuples so the caller can
+    /// resolve each file on disk.
+    fn get_rows_missing_hash(
+        &mut self,
+        context: &opentelemetry::Context,
+        limit: i64,
+    ) -> Result<Vec<(i32, String)>, DbError>;
+
+    /// Persist the computed blake3 hash + file size for an existing row.
+    fn backfill_content_hash(
+        &mut self,
+        context: &opentelemetry::Context,
+        library_id: i32,
+        rel_path: &str,
+        hash: &str,
+        size_bytes: i64,
+    ) -> Result<(), DbError>;
+
+    /// Return the first EXIF row with the given content hash (any library).
+    /// Used by thumbnail/HLS generation to detect pre-existing derivatives
+    /// from another library before regenerating.
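+    /// No ordering is applied, so "first" means whichever matching row the
+    /// database returns; any row sharing the hash identifies the same bytes,
+    /// so their derivatives are interchangeable.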
+    fn find_by_content_hash(
+        &mut self,
+        context: &opentelemetry::Context,
+        hash: &str,
+    ) -> Result<Option<ImageExif>, DbError>;
 }
 
 pub struct SqliteExifDao {
@@ -346,13 +375,21 @@ impl ExifDao for SqliteExifDao {
         diesel::insert_into(image_exif)
             .values(&exif_data)
             .execute(connection.deref_mut())
-            .map_err(|_| anyhow::anyhow!("Insert error"))?;
+            .map_err(|e| {
+                log::warn!(
+                    "image_exif insert failed (lib={}, rel_path={:?}): {}",
+                    exif_data.library_id,
+                    exif_data.file_path,
+                    e
+                );
+                anyhow::anyhow!("Insert error: {}", e)
+            })?;
 
         image_exif
             .filter(library_id.eq(exif_data.library_id))
             .filter(rel_path.eq(&exif_data.file_path))
             .first::<ImageExif>(connection.deref_mut())
-            .map_err(|_| anyhow::anyhow!("Query error"))
+            .map_err(|e| anyhow::anyhow!("Post-insert lookup failed: {}", e))
         })
         .map_err(|_| DbError::new(DbErrorKind::InsertError))
     }
@@ -672,4 +709,70 @@
         })
         .map_err(|_| DbError::new(DbErrorKind::QueryError))
     }
+
+    fn get_rows_missing_hash(
+        &mut self,
+        context: &opentelemetry::Context,
+        limit: i64,
+    ) -> Result<Vec<(i32, String)>, DbError> {
+        trace_db_call(context, "query", "get_rows_missing_hash", |_span| {
+            use schema::image_exif::dsl::*;
+
+            let mut connection = self.connection.lock().expect("Unable to get ExifDao");
+
+            image_exif
+                .filter(content_hash.is_null())
+                .select((library_id, rel_path))
+                .order(id.asc())
+                .limit(limit)
+                .load::<(i32, String)>(connection.deref_mut())
+                .map_err(|_| anyhow::anyhow!("Query error"))
+        })
+        .map_err(|_| DbError::new(DbErrorKind::QueryError))
+    }
+
+    fn backfill_content_hash(
+        &mut self,
+        context: &opentelemetry::Context,
+        library_id_val: i32,
+        rel_path_val: &str,
+        hash: &str,
+        size_val: i64,
+    ) -> Result<(), DbError> {
+        trace_db_call(context, "update", "backfill_content_hash", |_span| {
+            use schema::image_exif::dsl::*;
+
+            let mut connection = self.connection.lock().expect("Unable to get ExifDao");
+
+            diesel::update(
+                image_exif
+                    .filter(library_id.eq(library_id_val))
+                    .filter(rel_path.eq(rel_path_val)),
+            )
+            .set((content_hash.eq(hash), size_bytes.eq(size_val)))
+            .execute(connection.deref_mut())
+            .map(|_| ())
+            .map_err(|_| anyhow::anyhow!("Update error"))
+        })
+        .map_err(|_| DbError::new(DbErrorKind::UpdateError))
+    }
+
+    fn find_by_content_hash(
+        &mut self,
+        context: &opentelemetry::Context,
+        hash: &str,
+    ) -> Result<Option<ImageExif>, DbError> {
+        trace_db_call(context, "query", "find_by_content_hash", |_span| {
+            use schema::image_exif::dsl::*;
+
+            let mut connection = self.connection.lock().expect("Unable to get ExifDao");
+
+            image_exif
+                .filter(content_hash.eq(hash))
+                .first::<ImageExif>(connection.deref_mut())
+                .optional()
+                .map_err(|_| anyhow::anyhow!("Query error"))
+        })
+        .map_err(|_| DbError::new(DbErrorKind::QueryError))
+    }
 }
diff --git a/src/files.rs b/src/files.rs
index 3c25597..acb8fc5 100644
--- a/src/files.rs
+++ b/src/files.rs
@@ -1360,6 +1360,33 @@
         ) -> Result)>, DbError> {
             todo!()
         }
+
+        fn get_rows_missing_hash(
+            &mut self,
+            _context: &opentelemetry::Context,
+            _limit: i64,
+        ) -> Result<Vec<(i32, String)>, DbError> {
+            Ok(Vec::new())
+        }
+
+        fn backfill_content_hash(
+            &mut self,
+            _context: &opentelemetry::Context,
+            _library_id: i32,
+            _rel_path: &str,
+            _hash: &str,
+            _size_bytes: i64,
+        ) -> Result<(), DbError> {
+            Ok(())
+        }
+
+        fn find_by_content_hash(
+            &mut self,
+            _context: &opentelemetry::Context,
+            _hash: &str,
+        ) -> Result<Option<ImageExif>, DbError> {
+            Ok(None)
+        }
     }
 
 mod api {
diff --git a/src/lib.rs b/src/lib.rs
index 12e0bc0..d74fc2b 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -4,6 +4,7 @@ extern crate diesel;
 pub mod ai;
 pub mod auth;
 pub mod cleanup;
+pub mod content_hash;
 pub mod data;
 pub mod database;
 pub mod error;
diff --git a/src/main.rs b/src/main.rs
index cec0474..7305bae 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -61,6 +61,7 @@ mod error;
 mod exif;
 mod file_types;
 mod files;
+mod content_hash;
 mod geo;
 mod libraries;
 mod state;
@@ -96,6 +97,7 @@ async fn get_image(
     request: HttpRequest,
     req: web::Query,
     app_state: Data,
+    exif_dao: Data<Mutex<Box<dyn ExifDao>>>,
 ) -> impl Responder {
     let tracer = global_tracer();
     let context = extract_context_from_request(&request);
@@ -108,16 +110,45 @@
     let relative_path = path
         .strip_prefix(&app_state.base_path)
         .expect("Error stripping base path prefix from thumbnail");
+    let relative_path_str = relative_path.to_string_lossy().replace('\\', "/");
 
     let thumbs = &app_state.thumbnail_path;
-    let mut thumb_path = Path::new(&thumbs).join(relative_path);
+    let legacy_thumb_path = Path::new(&thumbs).join(relative_path);
 
-    // If it's a video and GIF format is requested, try to serve GIF thumbnail
+    // Gif thumbnails are a separate lookup (video GIF previews).
+    // Dual-lookup for gif is out of scope; preserve existing flow.
     if req.format == Some(ThumbnailFormat::Gif) && is_video_file(&path) {
-        thumb_path = Path::new(&app_state.gif_path).join(relative_path);
-        thumb_path.set_extension("gif");
+        let mut gif_path = Path::new(&app_state.gif_path).join(relative_path);
+        gif_path.set_extension("gif");
+        trace!("Gif thumbnail path: {:?}", gif_path);
+        if let Ok(file) = NamedFile::open(&gif_path) {
+            span.set_status(Status::Ok);
+            return file
+                .use_etag(true)
+                .use_last_modified(true)
+                .prefer_utf8(true)
+                .into_response(&request);
+        }
     }
 
+    // Resolve the hash-keyed thumbnail (if the row already has a
+    // content_hash) and fall back to the legacy mirrored path.
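+    // Rows that predate the hash backfill have content_hash = NULL and
+    // fall through to the legacy path; the .exists() filter below also
+    // covers hashes whose derivative hasn't been generated yet.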
+    let hash_thumb_path: Option<PathBuf> = {
+        let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
+        match dao.get_exif(&context, &relative_path_str) {
+            Ok(Some(row)) => row
+                .content_hash
+                .as_deref()
+                .map(|h| content_hash::thumbnail_path(Path::new(thumbs), h)),
+            _ => None,
+        }
+    };
+    let thumb_path = hash_thumb_path
+        .as_ref()
+        .filter(|p| p.exists())
+        .cloned()
+        .unwrap_or_else(|| legacy_thumb_path.clone());
+
     // Handle circular thumbnail request
     if req.shape == Some(ThumbnailShape::Circle) {
         match create_circular_thumbnail(&thumb_path, thumbs).await {
@@ -141,8 +172,6 @@
     trace!("Thumbnail path: {:?}", thumb_path);
     if let Ok(file) = NamedFile::open(&thumb_path) {
         span.set_status(Status::Ok);
-        // The NamedFile will automatically set the correct content-type
-        // Enable ETag and set cache headers for thumbnails (1 day cache)
         return file
             .use_etag(true)
             .use_last_modified(true)
@@ -406,11 +435,23 @@
         .expect("Error stripping library root prefix")
         .to_str()
         .unwrap()
-        .to_string();
+        .replace('\\', "/");
 
     match exif::extract_exif_from_path(&uploaded_path) {
         Ok(exif_data) => {
             let timestamp = Utc::now().timestamp();
+            let (content_hash, size_bytes) =
+                match content_hash::compute(&uploaded_path) {
+                    Ok(id) => (Some(id.content_hash), Some(id.size_bytes)),
+                    Err(e) => {
+                        warn!(
+                            "Failed to hash uploaded {}: {:?}",
+                            uploaded_path.display(),
+                            e
+                        );
+                        (None, None)
+                    }
+                };
             let insert_exif = InsertImageExif {
                 library_id: target_library.id,
                 file_path: relative_path.clone(),
@@ -430,8 +471,8 @@
                 date_taken: exif_data.date_taken,
                 created_time: timestamp,
                 last_modified: timestamp,
-                content_hash: None,
-                size_bytes: None,
+                content_hash,
+                size_bytes,
             };
 
             if let Ok(mut dao) = exif_dao.lock() {
@@ -1566,11 +1607,13 @@ fn process_new_files(
         .filter(|entry| is_image(entry) || is_video(entry))
         .filter_map(|entry| {
             let file_path = entry.path().to_path_buf();
+            // Canonical rel_path is forward-slash regardless of OS so DB
+            // comparisons against the batch EXIF lookup line up.
             let relative_path = file_path
                 .strip_prefix(base_path)
                 .ok()?
                 .to_str()?
-                .to_string();
+                .replace('\\', "/");
             Some((file_path, relative_path))
         })
         .collect();
@@ -1600,82 +1643,107 @@
     };
 
     let mut new_files_found = false;
-    let mut files_needing_exif = Vec::new();
+    let mut files_needing_row = Vec::new();
 
-    // Check each file for missing thumbnail or EXIF data
+    // Register every image/video file in image_exif. Rows without EXIF
+    // still carry library_id, rel_path, content_hash, and size_bytes so
+    // derivative dedup and DB-indexed sort/filter work for every file,
+    // not just photos with parseable EXIF.
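+    // Thumbnail presence and row presence are independent conditions: a
+    // file can need either one, or both.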
     for (file_path, relative_path) in &files {
-        // Check if thumbnail exists
         let thumb_path = thumbnail_directory.join(relative_path);
         let needs_thumbnail = !thumb_path.exists();
+        let needs_row = !existing_exif_paths.contains_key(relative_path);
 
-        // Check if EXIF data exists (for supported files)
-        let needs_exif = if exif::supports_exif(file_path) {
-            !existing_exif_paths.contains_key(relative_path)
-        } else {
-            false
-        };
-
-        if needs_thumbnail || needs_exif {
+        if needs_thumbnail || needs_row {
             new_files_found = true;
             if needs_thumbnail {
                 info!("New file detected (missing thumbnail): {}", relative_path);
             }
-            if needs_exif {
-                files_needing_exif.push((file_path.clone(), relative_path.clone()));
+            if needs_row {
+                files_needing_row.push((file_path.clone(), relative_path.clone()));
             }
         }
     }
 
-    // Process EXIF data for files that need it
-    if !files_needing_exif.is_empty() {
+    if !files_needing_row.is_empty() {
         info!(
-            "Processing EXIF data for {} files",
-            files_needing_exif.len()
+            "Registering {} new files in image_exif",
+            files_needing_row.len()
        );
 
-        for (file_path, relative_path) in files_needing_exif {
-            match exif::extract_exif_from_path(&file_path) {
-                Ok(exif_data) => {
-                    let timestamp = Utc::now().timestamp();
-                    let insert_exif = InsertImageExif {
-                        library_id: library.id,
-                        file_path: relative_path.clone(),
-                        camera_make: exif_data.camera_make,
-                        camera_model: exif_data.camera_model,
-                        lens_model: exif_data.lens_model,
-                        width: exif_data.width,
-                        height: exif_data.height,
-                        orientation: exif_data.orientation,
-                        gps_latitude: exif_data.gps_latitude.map(|v| v as f32),
-                        gps_longitude: exif_data.gps_longitude.map(|v| v as f32),
-                        gps_altitude: exif_data.gps_altitude.map(|v| v as f32),
-                        focal_length: exif_data.focal_length.map(|v| v as f32),
-                        aperture: exif_data.aperture.map(|v| v as f32),
-                        shutter_speed: exif_data.shutter_speed,
-                        iso: exif_data.iso,
-                        date_taken: exif_data.date_taken,
-                        created_time: timestamp,
-                        last_modified: timestamp,
-                        content_hash: None,
-                        size_bytes: None,
-                    };
+        for (file_path, relative_path) in files_needing_row {
+            let timestamp = Utc::now().timestamp();
 
-                    let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
-                    if let Err(e) = dao.store_exif(&context, insert_exif) {
-                        error!("Failed to store EXIF data for {}: {:?}", relative_path, e);
-                    } else {
-                        debug!("EXIF data stored for {}", relative_path);
+            // Content hash + size are computed up front for every file so
+            // derivative dedup works even when EXIF is absent.
+            let (content_hash, size_bytes) = match content_hash::compute(&file_path) {
+                Ok(id) => (Some(id.content_hash), Some(id.size_bytes)),
+                Err(e) => {
+                    warn!("Failed to hash {}: {:?}", file_path.display(), e);
+                    (None, None)
+                }
+            };
+
+            // EXIF is best-effort enrichment. When extraction fails (or the
+            // file type doesn't support EXIF) we still store a row with all
+            // EXIF fields NULL; the file remains visible to sort-by-date
+            // and tag queries via its rel_path and registration timestamps.
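+            // (created_time/last_modified below record registration time,
+            // not the file's mtime.)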
+ let exif_fields = if exif::supports_exif(&file_path) { + match exif::extract_exif_from_path(&file_path) { + Ok(data) => Some(data), + Err(e) => { + debug!( + "No EXIF or parse error for {}: {:?}", + file_path.display(), + e + ); + None } } - Err(e) => { - debug!( - "No EXIF data or error extracting from {}: {:?}", - file_path.display(), - e - ); - } + } else { + None + }; + + let insert_exif = InsertImageExif { + library_id: library.id, + file_path: relative_path.clone(), + camera_make: exif_fields.as_ref().and_then(|e| e.camera_make.clone()), + camera_model: exif_fields.as_ref().and_then(|e| e.camera_model.clone()), + lens_model: exif_fields.as_ref().and_then(|e| e.lens_model.clone()), + width: exif_fields.as_ref().and_then(|e| e.width), + height: exif_fields.as_ref().and_then(|e| e.height), + orientation: exif_fields.as_ref().and_then(|e| e.orientation), + gps_latitude: exif_fields + .as_ref() + .and_then(|e| e.gps_latitude.map(|v| v as f32)), + gps_longitude: exif_fields + .as_ref() + .and_then(|e| e.gps_longitude.map(|v| v as f32)), + gps_altitude: exif_fields + .as_ref() + .and_then(|e| e.gps_altitude.map(|v| v as f32)), + focal_length: exif_fields + .as_ref() + .and_then(|e| e.focal_length.map(|v| v as f32)), + aperture: exif_fields + .as_ref() + .and_then(|e| e.aperture.map(|v| v as f32)), + shutter_speed: exif_fields.as_ref().and_then(|e| e.shutter_speed.clone()), + iso: exif_fields.as_ref().and_then(|e| e.iso), + date_taken: exif_fields.as_ref().and_then(|e| e.date_taken), + created_time: timestamp, + last_modified: timestamp, + content_hash, + size_bytes, + }; + + let mut dao = exif_dao.lock().expect("Unable to lock ExifDao"); + if let Err(e) = dao.store_exif(&context, insert_exif) { + error!("Failed to register {} in image_exif: {:?}", relative_path, e); + } else { + debug!("Registered {} in image_exif", relative_path); } } }