004 Multi-library Support #54

Merged
cameron merged 19 commits from 004-multi-library into master 2026-04-21 01:55:23 +00:00
11 changed files with 681 additions and 69 deletions
Showing only changes of commit 0aaea91cc2

Cargo.lock generated (38 changed lines)
View File

@@ -474,6 +474,12 @@ dependencies = [
"syn",
]
[[package]]
name = "arrayref"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76a2e8124351fda1ef8aaaa3bbd7ebbcb486bbcd4225aca0aa0d84bb2db8fecb"
[[package]]
name = "arrayvec"
version = "0.7.6"
@@ -572,6 +578,20 @@ version = "2.6.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6099cdc01846bc367c4e7dd630dc5966dccf36b652fae7a74e17b640411a91b2"
[[package]]
name = "blake3"
version = "1.8.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4d2d5991425dfd0785aed03aedcf0b321d61975c9b5b3689c774a2610ae0b51e"
dependencies = [
"arrayref",
"arrayvec",
"cc",
"cfg-if",
"constant_time_eq",
"cpufeatures 0.3.0",
]
[[package]]
name = "block-buffer"
version = "0.10.4"
@@ -766,6 +786,12 @@ version = "1.0.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b05b61dc5112cbb17e4b6cd61790d9845d13888356391624cbe7e41efeac1e75"
[[package]]
name = "constant_time_eq"
version = "0.4.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "3d52eff69cd5e647efe296129160853a42795992097e8af39800e1060caeea9b"
[[package]]
name = "convert_case"
version = "0.4.0"
@@ -808,6 +834,15 @@ dependencies = [
"libc",
]
[[package]]
name = "cpufeatures"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8b2a41393f66f16b0823bb79094d54ac5fbd34ab292ddafb9a0456ac9f87d201"
dependencies = [
"libc",
]
[[package]]
name = "crc32fast"
version = "1.5.0"
@@ -1810,6 +1845,7 @@ dependencies = [
"anyhow",
"base64",
"bcrypt",
"blake3",
"chrono",
"clap",
"diesel",
@@ -3365,7 +3401,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e3bf829a2d51ab4a5ddf1352d8470c140cadc8301b2ae1789db023f01cedd6ba"
dependencies = [
"cfg-if",
"cpufeatures",
"cpufeatures 0.2.17",
"digest",
]

View File

@@ -55,3 +55,4 @@ zerocopy = "0.8"
ical = "0.11"
scraper = "0.20"
base64 = "0.22"
blake3 = "1.5"

View File

@@ -0,0 +1,4 @@
-- No-op: there's no sensible way to recover which rows originally used
-- backslashes, and there's no reason to want backslashes back. The
-- deleted duplicates are also gone.
SELECT 1;

View File

@@ -0,0 +1,85 @@
-- Normalize `rel_path` columns to forward slashes. Windows ingest
-- historically produced a mix of `\` and `/`, which broke lookups and
-- caused spurious UNIQUE-constraint violations on re-registration.
--
-- SQLite enforces UNIQUE per-row during UPDATE, so we have to drop
-- losing duplicates BEFORE normalizing. For each table that has a
-- UNIQUE on rel_path, we delete rows whose normalized form already
-- exists in canonical (forward-slash) form — keeping the existing
-- forward-slash row as the survivor. Then a flat UPDATE finishes the
-- job for remaining backslash rows.
-- image_exif: UNIQUE(library_id, rel_path)
DELETE FROM image_exif
WHERE rel_path LIKE '%\%'
AND EXISTS (
SELECT 1 FROM image_exif AS other
WHERE other.library_id = image_exif.library_id
AND other.rel_path = REPLACE(image_exif.rel_path, '\', '/')
AND other.id != image_exif.id
);
UPDATE image_exif
SET rel_path = REPLACE(rel_path, '\', '/')
WHERE rel_path LIKE '%\%';
-- favorites: UNIQUE(userid, rel_path)
DELETE FROM favorites
WHERE rel_path LIKE '%\%'
AND EXISTS (
SELECT 1 FROM favorites AS other
WHERE other.userid = favorites.userid
AND other.rel_path = REPLACE(favorites.rel_path, '\', '/')
AND other.id != favorites.id
);
UPDATE favorites
SET rel_path = REPLACE(rel_path, '\', '/')
WHERE rel_path LIKE '%\%';
-- tagged_photo: UNIQUE(rel_path, tag_id)
DELETE FROM tagged_photo
WHERE rel_path LIKE '%\%'
AND EXISTS (
SELECT 1 FROM tagged_photo AS other
WHERE other.tag_id = tagged_photo.tag_id
AND other.rel_path = REPLACE(tagged_photo.rel_path, '\', '/')
AND other.id != tagged_photo.id
);
UPDATE tagged_photo
SET rel_path = REPLACE(rel_path, '\', '/')
WHERE rel_path LIKE '%\%';
-- entity_photo_links: UNIQUE(entity_id, library_id, rel_path, role)
DELETE FROM entity_photo_links
WHERE rel_path LIKE '%\%'
AND EXISTS (
SELECT 1 FROM entity_photo_links AS other
WHERE other.entity_id = entity_photo_links.entity_id
AND other.library_id = entity_photo_links.library_id
AND other.role = entity_photo_links.role
AND other.rel_path = REPLACE(entity_photo_links.rel_path, '\', '/')
AND other.id != entity_photo_links.id
);
UPDATE entity_photo_links
SET rel_path = REPLACE(rel_path, '\', '/')
WHERE rel_path LIKE '%\%';
-- video_preview_clips: UNIQUE(library_id, rel_path)
DELETE FROM video_preview_clips
WHERE rel_path LIKE '%\%'
AND EXISTS (
SELECT 1 FROM video_preview_clips AS other
WHERE other.library_id = video_preview_clips.library_id
AND other.rel_path = REPLACE(video_preview_clips.rel_path, '\', '/')
AND other.id != video_preview_clips.id
);
UPDATE video_preview_clips
SET rel_path = REPLACE(rel_path, '\', '/')
WHERE rel_path LIKE '%\%';
-- photo_insights has no UNIQUE on rel_path (history table), so a plain
-- normalize is safe.
UPDATE photo_insights
SET rel_path = REPLACE(rel_path, '\', '/')
WHERE rel_path LIKE '%\%';
ANALYZE;
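
The ordering constraint in the header comment (dedupe before normalize) is easy to reproduce in isolation. A minimal sketch, assuming the `rusqlite` crate (not a dependency of this PR) and a pared-down `favorites` fixture:

use rusqlite::Connection;

fn main() -> rusqlite::Result<()> {
    let conn = Connection::open_in_memory()?;
    conn.execute_batch(
        "CREATE TABLE favorites (
             id INTEGER PRIMARY KEY,
             userid INTEGER NOT NULL,
             rel_path TEXT NOT NULL,
             UNIQUE (userid, rel_path)
         );
         INSERT INTO favorites (userid, rel_path) VALUES
             (1, 'trips/beach.jpg'),
             (1, 'trips\\beach.jpg');",
    )?;

    // Normalizing first fails: SQLite checks UNIQUE per updated row, and
    // the normalized form collides with the existing forward-slash row.
    let direct = conn.execute(
        "UPDATE favorites SET rel_path = REPLACE(rel_path, '\\', '/')
         WHERE rel_path LIKE '%\\%'",
        [],
    );
    assert!(direct.is_err());

    // Migration order: drop the losing duplicate first, then normalize.
    conn.execute_batch(
        "DELETE FROM favorites
         WHERE rel_path LIKE '%\\%'
           AND EXISTS (
             SELECT 1 FROM favorites AS other
             WHERE other.userid = favorites.userid
               AND other.rel_path = REPLACE(favorites.rel_path, '\\', '/')
               AND other.id != favorites.id
           );
         UPDATE favorites SET rel_path = REPLACE(rel_path, '\\', '/')
         WHERE rel_path LIKE '%\\%';",
    )?;

    let rows: i64 = conn.query_row("SELECT COUNT(*) FROM favorites", [], |r| r.get(0))?;
    assert_eq!(rows, 1); // only the canonical forward-slash row survives
    Ok(())
}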

src/bin/backfill_hashes.rs Normal file (184 added lines)
View File

@@ -0,0 +1,184 @@
//! Backfill `image_exif.content_hash` + `size_bytes` for rows that were
//! ingested before hash computation was wired into the watcher.
//!
//! The watcher computes hashes for new files as they're ingested, so this
//! binary is a one-shot tool for the historical backlog. Safe to re-run;
//! only rows with NULL content_hash are processed.
use std::path::Path;
use std::sync::{Arc, Mutex};
use std::time::Instant;
use clap::Parser;
use rayon::prelude::*;
use image_api::content_hash;
use image_api::database::{ExifDao, SqliteExifDao, connect};
use image_api::libraries::{self, Library};
#[derive(Parser, Debug)]
#[command(name = "backfill_hashes")]
#[command(about = "Compute content_hash for image_exif rows missing one")]
struct Args {
/// Max rows to hash per batch. The process loops until no rows remain.
#[arg(long, default_value_t = 500)]
batch_size: i64,
/// Rayon parallelism override. 0 uses the default thread pool size.
#[arg(long, default_value_t = 0)]
parallelism: usize,
/// Dry-run: log what would be hashed without writing to the DB.
#[arg(long)]
dry_run: bool,
}
fn main() -> anyhow::Result<()> {
env_logger::init();
dotenv::dotenv().ok();
let args = Args::parse();
if args.parallelism > 0 {
rayon::ThreadPoolBuilder::new()
.num_threads(args.parallelism)
.build_global()
.expect("Unable to configure rayon thread pool");
}
// Resolve libraries (patch placeholder if still unset) so we can map
// library_id back to a root_path on disk.
let base_path = dotenv::var("BASE_PATH").ok();
let mut seed_conn = connect();
if let Some(base) = base_path.as_deref() {
libraries::seed_or_patch_from_env(&mut seed_conn, base);
}
let libs = libraries::load_all(&mut seed_conn);
drop(seed_conn);
if libs.is_empty() {
anyhow::bail!("No libraries configured; cannot backfill hashes");
}
let libs_by_id: std::collections::HashMap<i32, Library> =
libs.into_iter().map(|lib| (lib.id, lib)).collect();
println!(
"Configured libraries: {}",
libs_by_id
.values()
.map(|l| format!("{} -> {}", l.name, l.root_path))
.collect::<Vec<_>>()
.join(", ")
);
let dao: Arc<Mutex<Box<dyn ExifDao>>> =
Arc::new(Mutex::new(Box::new(SqliteExifDao::new())));
let ctx = opentelemetry::Context::new();
let mut total_hashed = 0u64;
let mut total_missing = 0u64;
let mut total_errors = 0u64;
let start = Instant::now();
loop {
let rows = {
let mut guard = dao.lock().expect("Unable to lock ExifDao");
guard
.get_rows_missing_hash(&ctx, args.batch_size)
.map_err(|e| anyhow::anyhow!("DB error: {:?}", e))?
};
if rows.is_empty() {
break;
}
println!("Processing batch of {} rows", rows.len());
// Compute hashes in parallel. Hashing is I/O-bound: rayon helps on
// local disks but is throttled by the network on SMB mounts; use
// --parallelism to tune.
let results: Vec<(i32, String, Option<content_hash::FileIdentity>)> = rows
.into_par_iter()
.map(|(library_id, rel_path)| {
let abs = libs_by_id
.get(&library_id)
.map(|lib| Path::new(&lib.root_path).join(&rel_path));
match abs {
Some(abs_path) if abs_path.exists() => {
match content_hash::compute(&abs_path) {
Ok(id) => (library_id, rel_path, Some(id)),
Err(e) => {
eprintln!("hash error for {}: {:?}", abs_path.display(), e);
(library_id, rel_path, None)
}
}
}
Some(_) => (library_id, rel_path, None), // file missing on disk
None => {
eprintln!("Row refers to unknown library_id {}", library_id);
(library_id, rel_path, None)
}
}
})
.collect();
// Persist sequentially — SQLite writes serialize anyway.
if !args.dry_run {
let mut guard = dao.lock().expect("Unable to lock ExifDao");
for (library_id, rel_path, ident) in &results {
match ident {
Some(id) => {
match guard.backfill_content_hash(
&ctx,
*library_id,
rel_path,
&id.content_hash,
id.size_bytes,
) {
Ok(_) => total_hashed += 1,
Err(e) => {
eprintln!("persist error for {}: {:?}", rel_path, e);
total_errors += 1;
}
}
}
None => {
total_missing += 1;
}
}
}
} else {
for (_, rel_path, ident) in &results {
match ident {
Some(id) => {
println!(
"[dry-run] {} -> {} ({} bytes)",
rel_path, id.content_hash, id.size_bytes
);
total_hashed += 1;
}
None => {
total_missing += 1;
}
}
}
println!(
"[dry-run] processed one batch of {}. Stopping — a real run would continue \
until no NULL content_hash rows remain.",
results.len()
);
break;
}
let elapsed = start.elapsed().as_secs_f64().max(0.001);
let rate = total_hashed as f64 / elapsed;
println!(
" hashed={} missing={} errors={} ({:.1} files/sec)",
total_hashed, total_missing, total_errors, rate
);
}
println!();
println!(
"Done. hashed={}, skipped (missing on disk)={}, errors={}, elapsed={:.1}s",
total_hashed,
total_missing,
total_errors,
start.elapsed().as_secs_f64()
);
Ok(())
}
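
As a usage note (not part of the diff): given the flags above, a sensible first pass is a bounded dry run before letting the loop run to completion, e.g. `cargo run --release --bin backfill_hashes -- --dry-run --batch-size 100`, assuming the standard cargo layout this repository appears to use.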

View File

@@ -67,7 +67,7 @@ fn main() -> anyhow::Result<()> {
let context = opentelemetry::Context::new();
let relative_path = match path.strip_prefix(&base) {
-        Ok(p) => p.to_str().unwrap().to_string(),
+        Ok(p) => p.to_str().unwrap().replace('\\', "/"),
Err(_) => {
eprintln!(
"Error: Could not create relative path for {}",

src/content_hash.rs Normal file (103 added lines)
View File

@@ -0,0 +1,103 @@
//! Content-based file identity used to dedup derivative outputs
//! (thumbnails, HLS segments) across libraries.
//!
//! Hashes are computed with blake3 streaming so that network-mounted
//! libraries don't need to load whole files into memory. The result is
//! a 64-character hex string; we shard derivative directories on the
//! first two characters to keep any single directory's fanout bounded.
use std::fs::File;
use std::io::{self, Read};
use std::path::{Path, PathBuf};
/// Size of the read buffer used when streaming a file through blake3.
/// 1 MiB trades a bit of RSS for fewer syscalls on slow network mounts.
const HASH_BUFFER_SIZE: usize = 1024 * 1024;
/// Hash identity of a file, together with its byte length.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct FileIdentity {
pub content_hash: String,
pub size_bytes: i64,
}
/// Stream a file through blake3 and return the hex-encoded digest + size.
pub fn compute(path: &Path) -> io::Result<FileIdentity> {
let mut file = File::open(path)?;
let size_bytes = file.metadata()?.len() as i64;
let mut hasher = blake3::Hasher::new();
let mut buf = vec![0u8; HASH_BUFFER_SIZE];
loop {
let n = file.read(&mut buf)?;
if n == 0 {
break;
}
hasher.update(&buf[..n]);
}
Ok(FileIdentity {
content_hash: hasher.finalize().to_hex().to_string(),
size_bytes,
})
}
/// Hash-keyed thumbnail path: `<thumbs_dir>/<hash[..2]>/<hash>.jpg`.
/// Generation and serving both consult this first; the legacy mirrored
/// path acts as a fallback for pre-backfill rows.
pub fn thumbnail_path(thumbs_dir: &Path, hash: &str) -> PathBuf {
let shard = shard_prefix(hash);
thumbs_dir.join(shard).join(format!("{}.jpg", hash))
}
/// Hash-keyed HLS output directory: `<video_dir>/<hash[..2]>/<hash>/`.
/// The playlist lives at `playlist.m3u8` inside this directory and its
/// segments are co-located so HLS relative references Just Work.
pub fn hls_dir(video_dir: &Path, hash: &str) -> PathBuf {
let shard = shard_prefix(hash);
video_dir.join(shard).join(hash)
}
fn shard_prefix(hash: &str) -> &str {
let end = hash.char_indices().nth(2).map(|(i, _)| i).unwrap_or(hash.len());
&hash[..end]
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn identical_content_yields_identical_hash() {
let dir = tempfile::tempdir().unwrap();
let a = dir.path().join("a.bin");
let b = dir.path().join("b.bin");
std::fs::write(&a, b"hello world").unwrap();
std::fs::write(&b, b"hello world").unwrap();
let ha = compute(&a).unwrap();
let hb = compute(&b).unwrap();
assert_eq!(ha, hb);
assert_eq!(ha.size_bytes, 11);
}
#[test]
fn different_content_yields_different_hash() {
let dir = tempfile::tempdir().unwrap();
let a = dir.path().join("a.bin");
let b = dir.path().join("b.bin");
std::fs::write(&a, b"aaa").unwrap();
std::fs::write(&b, b"bbb").unwrap();
assert_ne!(compute(&a).unwrap(), compute(&b).unwrap());
}
#[test]
fn derivative_paths_shard_by_first_two_hex() {
let thumbs = Path::new("/tmp/thumbs");
let p = thumbnail_path(thumbs, "abcdef0123");
assert_eq!(p, PathBuf::from("/tmp/thumbs/ab/abcdef0123.jpg"));
let video = Path::new("/tmp/video");
let d = hls_dir(video, "1234deadbeef");
assert_eq!(d, PathBuf::from("/tmp/video/12/1234deadbeef"));
}
}
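
For orientation, the identity flow through the public API is short. A hedged sketch, assuming the `image_api` crate name used by `backfill_hashes` above, with `tempfile` for scratch space:

use std::path::Path;

fn main() -> std::io::Result<()> {
    let dir = tempfile::tempdir()?;
    let photo = dir.path().join("photo.jpg");
    std::fs::write(&photo, b"not really a jpeg")?;

    // Streaming keeps memory constant regardless of file size.
    let id = image_api::content_hash::compute(&photo)?;
    assert_eq!(id.content_hash.len(), 64); // blake3 hex digest is 64 chars

    // Two hex characters give 256 shard directories, so a million
    // thumbnails works out to roughly 4k files per directory.
    let thumb = image_api::content_hash::thumbnail_path(
        Path::new("/data/thumbs"),
        &id.content_hash,
    );
    println!("{}", thumb.display()); // /data/thumbs/<xy>/<64-hex>.jpg
    Ok(())
}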

View File

@@ -312,6 +312,35 @@ pub trait ExifDao: Sync + Send {
base_path: &str,
recursive: bool,
) -> Result<Vec<(String, f64, f64, Option<i64>)>, DbError>;
/// Return rows that still lack a `content_hash`, oldest first. Used by
/// the `backfill_hashes` binary to batch through the historical
/// backlog. Returns `(library_id, rel_path)` tuples so the caller can
/// resolve each file on disk.
fn get_rows_missing_hash(
&mut self,
context: &opentelemetry::Context,
limit: i64,
) -> Result<Vec<(i32, String)>, DbError>;
/// Persist the computed blake3 hash + file size for an existing row.
fn backfill_content_hash(
&mut self,
context: &opentelemetry::Context,
library_id: i32,
rel_path: &str,
hash: &str,
size_bytes: i64,
) -> Result<(), DbError>;
/// Return the first EXIF row with the given content hash (any library).
/// Used by thumbnail/HLS generation to detect pre-existing derivatives
/// from another library before regenerating.
fn find_by_content_hash(
&mut self,
context: &opentelemetry::Context,
hash: &str,
) -> Result<Option<ImageExif>, DbError>;
}
pub struct SqliteExifDao {
@@ -346,13 +375,21 @@ impl ExifDao for SqliteExifDao {
diesel::insert_into(image_exif)
.values(&exif_data)
.execute(connection.deref_mut())
.map_err(|_| anyhow::anyhow!("Insert error"))?;
.map_err(|e| {
log::warn!(
"image_exif insert failed (lib={}, rel_path={:?}): {}",
exif_data.library_id,
exif_data.file_path,
e
);
anyhow::anyhow!("Insert error: {}", e)
})?;
image_exif
.filter(library_id.eq(exif_data.library_id))
.filter(rel_path.eq(&exif_data.file_path))
.first::<ImageExif>(connection.deref_mut())
.map_err(|_| anyhow::anyhow!("Query error"))
.map_err(|e| anyhow::anyhow!("Post-insert lookup failed: {}", e))
})
.map_err(|_| DbError::new(DbErrorKind::InsertError))
}
@@ -672,4 +709,70 @@ impl ExifDao for SqliteExifDao {
})
.map_err(|_| DbError::new(DbErrorKind::QueryError))
}
fn get_rows_missing_hash(
&mut self,
context: &opentelemetry::Context,
limit: i64,
) -> Result<Vec<(i32, String)>, DbError> {
trace_db_call(context, "query", "get_rows_missing_hash", |_span| {
use schema::image_exif::dsl::*;
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
image_exif
.filter(content_hash.is_null())
.select((library_id, rel_path))
.order(id.asc())
.limit(limit)
.load::<(i32, String)>(connection.deref_mut())
.map_err(|_| anyhow::anyhow!("Query error"))
})
.map_err(|_| DbError::new(DbErrorKind::QueryError))
}
fn backfill_content_hash(
&mut self,
context: &opentelemetry::Context,
library_id_val: i32,
rel_path_val: &str,
hash: &str,
size_val: i64,
) -> Result<(), DbError> {
trace_db_call(context, "update", "backfill_content_hash", |_span| {
use schema::image_exif::dsl::*;
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
diesel::update(
image_exif
.filter(library_id.eq(library_id_val))
.filter(rel_path.eq(rel_path_val)),
)
.set((content_hash.eq(hash), size_bytes.eq(size_val)))
.execute(connection.deref_mut())
.map(|_| ())
.map_err(|_| anyhow::anyhow!("Update error"))
})
.map_err(|_| DbError::new(DbErrorKind::UpdateError))
}
fn find_by_content_hash(
&mut self,
context: &opentelemetry::Context,
hash: &str,
) -> Result<Option<ImageExif>, DbError> {
trace_db_call(context, "query", "find_by_content_hash", |_span| {
use schema::image_exif::dsl::*;
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
image_exif
.filter(content_hash.eq(hash))
.first::<ImageExif>(connection.deref_mut())
.optional()
.map_err(|_| anyhow::anyhow!("Query error"))
})
.map_err(|_| DbError::new(DbErrorKind::QueryError))
}
}
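
The `find_by_content_hash` doc comment points at a generation-side call site that is not part of this commit. As an illustrative sketch only (the `render_thumbnail` helper is hypothetical and the `DbError`-to-`anyhow` mapping is assumed), a generator could consult the hash before re-rendering:

use std::path::{Path, PathBuf};

use image_api::content_hash;
use image_api::database::ExifDao;

// Hypothetical stand-in for the real thumbnail pipeline.
fn render_thumbnail(src: &Path, dst: &Path) -> anyhow::Result<()> {
    std::fs::copy(src, dst)?;
    Ok(())
}

fn ensure_thumbnail(
    dao: &mut dyn ExifDao,
    ctx: &opentelemetry::Context,
    thumbs_dir: &Path,
    source: &Path,
) -> anyhow::Result<PathBuf> {
    let id = content_hash::compute(source)?;
    let out = content_hash::thumbnail_path(thumbs_dir, &id.content_hash);
    // A row from any library with the same bytes means the derivative may
    // already exist on disk; reuse it instead of re-rendering.
    let seen = dao
        .find_by_content_hash(ctx, &id.content_hash)
        .map_err(|e| anyhow::anyhow!("hash lookup failed: {:?}", e))?;
    if seen.is_some() && out.exists() {
        return Ok(out);
    }
    std::fs::create_dir_all(out.parent().expect("sharded path has a parent"))?;
    render_thumbnail(source, &out)?;
    Ok(out)
}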

View File

@@ -1360,6 +1360,33 @@ mod tests {
) -> Result<Vec<(String, f64, f64, Option<i64>)>, DbError> {
todo!()
}
fn get_rows_missing_hash(
&mut self,
_context: &opentelemetry::Context,
_limit: i64,
) -> Result<Vec<(i32, String)>, DbError> {
Ok(Vec::new())
}
fn backfill_content_hash(
&mut self,
_context: &opentelemetry::Context,
_library_id: i32,
_rel_path: &str,
_hash: &str,
_size_bytes: i64,
) -> Result<(), DbError> {
Ok(())
}
fn find_by_content_hash(
&mut self,
_context: &opentelemetry::Context,
_hash: &str,
) -> Result<Option<crate::database::models::ImageExif>, DbError> {
Ok(None)
}
}
mod api {

View File

@@ -4,6 +4,7 @@ extern crate diesel;
pub mod ai;
pub mod auth;
pub mod cleanup;
pub mod content_hash;
pub mod data;
pub mod database;
pub mod error;

View File

@@ -61,6 +61,7 @@ mod error;
mod exif;
mod file_types;
mod files;
mod content_hash;
mod geo;
mod libraries;
mod state;
@@ -96,6 +97,7 @@ async fn get_image(
request: HttpRequest,
req: web::Query<ThumbnailRequest>,
app_state: Data<AppState>,
exif_dao: Data<Mutex<Box<dyn ExifDao>>>,
) -> impl Responder {
let tracer = global_tracer();
let context = extract_context_from_request(&request);
@@ -108,15 +110,44 @@ async fn get_image(
let relative_path = path
.strip_prefix(&app_state.base_path)
.expect("Error stripping base path prefix from thumbnail");
+    let relative_path_str = relative_path.to_string_lossy().replace('\\', "/");
     let thumbs = &app_state.thumbnail_path;
-    let mut thumb_path = Path::new(&thumbs).join(relative_path);
+    let legacy_thumb_path = Path::new(&thumbs).join(relative_path);
-    // If it's a video and GIF format is requested, try to serve GIF thumbnail
+    // GIF thumbnails (video previews) are a separate lookup; the hash-keyed
+    // dual lookup is out of scope for GIFs, so preserve the existing flow.
     if req.format == Some(ThumbnailFormat::Gif) && is_video_file(&path) {
-        thumb_path = Path::new(&app_state.gif_path).join(relative_path);
-        thumb_path.set_extension("gif");
+        let mut gif_path = Path::new(&app_state.gif_path).join(relative_path);
+        gif_path.set_extension("gif");
+        trace!("Gif thumbnail path: {:?}", gif_path);
+        if let Ok(file) = NamedFile::open(&gif_path) {
+            span.set_status(Status::Ok);
+            return file
+                .use_etag(true)
+                .use_last_modified(true)
+                .prefer_utf8(true)
+                .into_response(&request);
+        }
     }
+    // Resolve the hash-keyed thumbnail (if the row already has a
+    // content_hash) and fall back to the legacy mirrored path.
+    let hash_thumb_path: Option<PathBuf> = {
+        let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
+        match dao.get_exif(&context, &relative_path_str) {
+            Ok(Some(row)) => row
+                .content_hash
+                .as_deref()
+                .map(|h| content_hash::thumbnail_path(Path::new(thumbs), h)),
+            _ => None,
+        }
+    };
+    let thumb_path = hash_thumb_path
+        .as_ref()
+        .filter(|p| p.exists())
+        .cloned()
+        .unwrap_or_else(|| legacy_thumb_path.clone());
// Handle circular thumbnail request
if req.shape == Some(ThumbnailShape::Circle) {
@@ -141,8 +172,6 @@ async fn get_image(
trace!("Thumbnail path: {:?}", thumb_path);
if let Ok(file) = NamedFile::open(&thumb_path) {
span.set_status(Status::Ok);
-        // The NamedFile will automatically set the correct content-type
-        // Enable ETag and set cache headers for thumbnails (1 day cache)
return file
.use_etag(true)
.use_last_modified(true)
@@ -406,11 +435,23 @@ async fn upload_image(
.expect("Error stripping library root prefix")
.to_str()
.unwrap()
-        .replace('\\', "/").to_string();
+        .replace('\\', "/");
     match exif::extract_exif_from_path(&uploaded_path) {
         Ok(exif_data) => {
             let timestamp = Utc::now().timestamp();
+            let (content_hash, size_bytes) =
+                match content_hash::compute(&uploaded_path) {
+                    Ok(id) => (Some(id.content_hash), Some(id.size_bytes)),
+                    Err(e) => {
+                        warn!(
+                            "Failed to hash uploaded {}: {:?}",
+                            uploaded_path.display(),
+                            e
+                        );
+                        (None, None)
+                    }
+                };
let insert_exif = InsertImageExif {
library_id: target_library.id,
file_path: relative_path.clone(),
@@ -430,8 +471,8 @@ async fn upload_image(
date_taken: exif_data.date_taken,
created_time: timestamp,
last_modified: timestamp,
-            content_hash: None,
-            size_bytes: None,
+            content_hash,
+            size_bytes,
};
if let Ok(mut dao) = exif_dao.lock() {
@@ -1566,11 +1607,13 @@ fn process_new_files(
.filter(|entry| is_image(entry) || is_video(entry))
.filter_map(|entry| {
let file_path = entry.path().to_path_buf();
+            // Canonical rel_path is forward-slash regardless of OS so DB
+            // comparisons against the batch EXIF lookup line up.
             let relative_path = file_path
                 .strip_prefix(base_path)
                 .ok()?
                 .to_str()?
-                .to_string();
+                .replace('\\', "/");
Some((file_path, relative_path))
})
.collect();
@@ -1600,82 +1643,107 @@ fn process_new_files(
};
let mut new_files_found = false;
-    let mut files_needing_exif = Vec::new();
+    let mut files_needing_row = Vec::new();
-    // Check each file for missing thumbnail or EXIF data
+    // Register every image/video file in image_exif. Rows without EXIF
+    // still carry library_id, rel_path, content_hash, and size_bytes so
+    // derivative dedup and DB-indexed sort/filter work for every file,
+    // not just photos with parseable EXIF.
for (file_path, relative_path) in &files {
// Check if thumbnail exists
let thumb_path = thumbnail_directory.join(relative_path);
let needs_thumbnail = !thumb_path.exists();
+        let needs_row = !existing_exif_paths.contains_key(relative_path);
-        // Check if EXIF data exists (for supported files)
-        let needs_exif = if exif::supports_exif(file_path) {
-            !existing_exif_paths.contains_key(relative_path)
-        } else {
-            false
-        };
-        if needs_thumbnail || needs_exif {
+        if needs_thumbnail || needs_row {
new_files_found = true;
if needs_thumbnail {
info!("New file detected (missing thumbnail): {}", relative_path);
}
-            if needs_exif {
-                files_needing_exif.push((file_path.clone(), relative_path.clone()));
+            if needs_row {
+                files_needing_row.push((file_path.clone(), relative_path.clone()));
}
}
}
-    // Process EXIF data for files that need it
-    if !files_needing_exif.is_empty() {
+    if !files_needing_row.is_empty() {
         info!(
-            "Processing EXIF data for {} files",
-            files_needing_exif.len()
+            "Registering {} new files in image_exif",
+            files_needing_row.len()
);
-        for (file_path, relative_path) in files_needing_exif {
-            match exif::extract_exif_from_path(&file_path) {
-                Ok(exif_data) => {
+        for (file_path, relative_path) in files_needing_row {
let timestamp = Utc::now().timestamp();
+            // Hash the file contents and record the size; always attempted so
+            // every file gets a content_hash, even when EXIF is absent.
+            let (content_hash, size_bytes) = match content_hash::compute(&file_path) {
+                Ok(id) => (Some(id.content_hash), Some(id.size_bytes)),
+                Err(e) => {
+                    warn!("Failed to hash {}: {:?}", file_path.display(), e);
+                    (None, None)
+                }
+            };
+            // EXIF is best-effort enrichment. When extraction fails (or the
+            // file type doesn't support EXIF) we still store a row with all
+            // EXIF fields NULL; the file remains visible to sort-by-date
+            // and tag queries via its rel_path and filesystem timestamps.
+            let exif_fields = if exif::supports_exif(&file_path) {
+                match exif::extract_exif_from_path(&file_path) {
+                    Ok(data) => Some(data),
+                    Err(e) => {
+                        debug!(
+                            "No EXIF or parse error for {}: {:?}",
+                            file_path.display(),
+                            e
+                        );
+                        None
+                    }
+                }
+            } else {
+                None
+            };
let insert_exif = InsertImageExif {
library_id: library.id,
file_path: relative_path.clone(),
-                camera_make: exif_data.camera_make,
-                camera_model: exif_data.camera_model,
-                lens_model: exif_data.lens_model,
-                width: exif_data.width,
-                height: exif_data.height,
-                orientation: exif_data.orientation,
-                gps_latitude: exif_data.gps_latitude.map(|v| v as f32),
-                gps_longitude: exif_data.gps_longitude.map(|v| v as f32),
-                gps_altitude: exif_data.gps_altitude.map(|v| v as f32),
-                focal_length: exif_data.focal_length.map(|v| v as f32),
-                aperture: exif_data.aperture.map(|v| v as f32),
-                shutter_speed: exif_data.shutter_speed,
-                iso: exif_data.iso,
-                date_taken: exif_data.date_taken,
+                camera_make: exif_fields.as_ref().and_then(|e| e.camera_make.clone()),
+                camera_model: exif_fields.as_ref().and_then(|e| e.camera_model.clone()),
+                lens_model: exif_fields.as_ref().and_then(|e| e.lens_model.clone()),
+                width: exif_fields.as_ref().and_then(|e| e.width),
+                height: exif_fields.as_ref().and_then(|e| e.height),
+                orientation: exif_fields.as_ref().and_then(|e| e.orientation),
+                gps_latitude: exif_fields
+                    .as_ref()
+                    .and_then(|e| e.gps_latitude.map(|v| v as f32)),
+                gps_longitude: exif_fields
+                    .as_ref()
+                    .and_then(|e| e.gps_longitude.map(|v| v as f32)),
+                gps_altitude: exif_fields
+                    .as_ref()
+                    .and_then(|e| e.gps_altitude.map(|v| v as f32)),
+                focal_length: exif_fields
+                    .as_ref()
+                    .and_then(|e| e.focal_length.map(|v| v as f32)),
+                aperture: exif_fields
+                    .as_ref()
+                    .and_then(|e| e.aperture.map(|v| v as f32)),
+                shutter_speed: exif_fields.as_ref().and_then(|e| e.shutter_speed.clone()),
+                iso: exif_fields.as_ref().and_then(|e| e.iso),
+                date_taken: exif_fields.as_ref().and_then(|e| e.date_taken),
created_time: timestamp,
last_modified: timestamp,
-                content_hash: None,
-                size_bytes: None,
+                content_hash,
+                size_bytes,
};
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
if let Err(e) = dao.store_exif(&context, insert_exif) {
error!("Failed to store EXIF data for {}: {:?}", relative_path, e);
error!("Failed to register {} in image_exif: {:?}", relative_path, e);
} else {
debug!("EXIF data stored for {}", relative_path);
}
}
-            Err(e) => {
-                debug!(
-                    "No EXIF data or error extracting from {}: {:?}",
-                    file_path.display(),
-                    e
-                );
-            }
+            debug!("Registered {} in image_exif", relative_path);
}
}
}