feat: add content_hash backfill + register every media file

Adds blake3 content hashing as the basis for derivative dedup
(thumbnails, HLS) across libraries. Hashes are computed inline by the
watcher on ingest and by a new `backfill_hashes` binary for historical
rows.

Key changes:
- `content_hash` and `size_bytes` are now populated on new image_exif
  rows; a new ExifDao surface (`get_rows_missing_hash`,
  `backfill_content_hash`, `find_by_content_hash`) supports backfill and
  future hash-keyed lookups.
- The watcher now registers every image/video in image_exif, not just
  files with parseable EXIF. EXIF becomes optional enrichment; videos
  and other non-EXIF files still get a hashed row. This also makes
  DB-indexed sort/filter cover the full library.
- `/image` thumbnail serving does a dual lookup: it tries the hash-keyed
  path first, then falls back to the legacy mirrored layout.
- Upload flow accepts `?library=` query param + hashes uploaded files.
- `store_exif` logs the underlying Diesel error on insert failure so
  constraint violations surface instead of hiding behind a generic
  InsertError.
- New migration normalizes rel_path separators to forward slash across
  all tables, deduplicating any rows that collide after normalization.
  Fixes spurious UNIQUE violations from mixed backslash/forward-slash
  paths on Windows ingest.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Cameron
2026-04-17 16:25:39 -04:00
committed by cameron
parent ce5b337582
commit 0aaea91cc2
11 changed files with 681 additions and 69 deletions

View File

@@ -61,6 +61,7 @@ mod error;
mod exif;
mod file_types;
mod files;
mod content_hash;
mod geo;
mod libraries;
mod state;
@@ -96,6 +97,7 @@ async fn get_image(
request: HttpRequest,
req: web::Query<ThumbnailRequest>,
app_state: Data<AppState>,
exif_dao: Data<Mutex<Box<dyn ExifDao>>>,
) -> impl Responder {
let tracer = global_tracer();
let context = extract_context_from_request(&request);
@@ -108,16 +110,45 @@ async fn get_image(
let relative_path = path
.strip_prefix(&app_state.base_path)
.expect("Error stripping base path prefix from thumbnail");
let relative_path_str = relative_path.to_string_lossy().replace('\\', "/");
let thumbs = &app_state.thumbnail_path;
let mut thumb_path = Path::new(&thumbs).join(relative_path);
let legacy_thumb_path = Path::new(&thumbs).join(relative_path);
// If it's a video and GIF format is requested, try to serve GIF thumbnail
// Gif thumbnails are a separate lookup (video GIF previews).
// Dual-lookup for gif is out of scope; preserve existing flow.
if req.format == Some(ThumbnailFormat::Gif) && is_video_file(&path) {
thumb_path = Path::new(&app_state.gif_path).join(relative_path);
thumb_path.set_extension("gif");
let mut gif_path = Path::new(&app_state.gif_path).join(relative_path);
gif_path.set_extension("gif");
trace!("Gif thumbnail path: {:?}", gif_path);
if let Ok(file) = NamedFile::open(&gif_path) {
span.set_status(Status::Ok);
return file
.use_etag(true)
.use_last_modified(true)
.prefer_utf8(true)
.into_response(&request);
}
}
// Resolve the hash-keyed thumbnail (if the row already has a
// content_hash) and fall back to the legacy mirrored path.
let hash_thumb_path: Option<PathBuf> = {
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
match dao.get_exif(&context, &relative_path_str) {
Ok(Some(row)) => row
.content_hash
.as_deref()
.map(|h| content_hash::thumbnail_path(Path::new(thumbs), h)),
_ => None,
}
};
let thumb_path = hash_thumb_path
.as_ref()
.filter(|p| p.exists())
.cloned()
.unwrap_or_else(|| legacy_thumb_path.clone());
// Handle circular thumbnail request
if req.shape == Some(ThumbnailShape::Circle) {
match create_circular_thumbnail(&thumb_path, thumbs).await {
@@ -141,8 +172,6 @@ async fn get_image(
trace!("Thumbnail path: {:?}", thumb_path);
if let Ok(file) = NamedFile::open(&thumb_path) {
span.set_status(Status::Ok);
// The NamedFile will automatically set the correct content-type
// Enable ETag and set cache headers for thumbnails (1 day cache)
return file
.use_etag(true)
.use_last_modified(true)
@@ -406,11 +435,23 @@ async fn upload_image(
.expect("Error stripping library root prefix")
.to_str()
.unwrap()
.to_string();
.replace('\\', "/");
match exif::extract_exif_from_path(&uploaded_path) {
Ok(exif_data) => {
let timestamp = Utc::now().timestamp();
let (content_hash, size_bytes) =
match content_hash::compute(&uploaded_path) {
Ok(id) => (Some(id.content_hash), Some(id.size_bytes)),
Err(e) => {
warn!(
"Failed to hash uploaded {}: {:?}",
uploaded_path.display(),
e
);
(None, None)
}
};
let insert_exif = InsertImageExif {
library_id: target_library.id,
file_path: relative_path.clone(),
@@ -430,8 +471,8 @@ async fn upload_image(
date_taken: exif_data.date_taken,
created_time: timestamp,
last_modified: timestamp,
content_hash: None,
size_bytes: None,
content_hash,
size_bytes,
};
if let Ok(mut dao) = exif_dao.lock() {
@@ -1566,11 +1607,13 @@ fn process_new_files(
.filter(|entry| is_image(entry) || is_video(entry))
.filter_map(|entry| {
let file_path = entry.path().to_path_buf();
// Canonical rel_path is forward-slash regardless of OS so DB
// comparisons against the batch EXIF lookup line up.
let relative_path = file_path
.strip_prefix(base_path)
.ok()?
.to_str()?
.to_string();
.replace('\\', "/");
Some((file_path, relative_path))
})
.collect();
@@ -1600,82 +1643,107 @@ fn process_new_files(
};
let mut new_files_found = false;
let mut files_needing_exif = Vec::new();
let mut files_needing_row = Vec::new();
// Check each file for missing thumbnail or EXIF data
// Register every image/video file in image_exif. Rows without EXIF
// still carry library_id, rel_path, content_hash, and size_bytes so
// derivative dedup and DB-indexed sort/filter work for every file,
// not just photos with parseable EXIF.
for (file_path, relative_path) in &files {
// Check if thumbnail exists
let thumb_path = thumbnail_directory.join(relative_path);
let needs_thumbnail = !thumb_path.exists();
let needs_row = !existing_exif_paths.contains_key(relative_path);
// Check if EXIF data exists (for supported files)
let needs_exif = if exif::supports_exif(file_path) {
!existing_exif_paths.contains_key(relative_path)
} else {
false
};
if needs_thumbnail || needs_exif {
if needs_thumbnail || needs_row {
new_files_found = true;
if needs_thumbnail {
info!("New file detected (missing thumbnail): {}", relative_path);
}
if needs_exif {
files_needing_exif.push((file_path.clone(), relative_path.clone()));
if needs_row {
files_needing_row.push((file_path.clone(), relative_path.clone()));
}
}
}
// Process EXIF data for files that need it
if !files_needing_exif.is_empty() {
if !files_needing_row.is_empty() {
info!(
"Processing EXIF data for {} files",
files_needing_exif.len()
"Registering {} new files in image_exif",
files_needing_row.len()
);
for (file_path, relative_path) in files_needing_exif {
match exif::extract_exif_from_path(&file_path) {
Ok(exif_data) => {
let timestamp = Utc::now().timestamp();
let insert_exif = InsertImageExif {
library_id: library.id,
file_path: relative_path.clone(),
camera_make: exif_data.camera_make,
camera_model: exif_data.camera_model,
lens_model: exif_data.lens_model,
width: exif_data.width,
height: exif_data.height,
orientation: exif_data.orientation,
gps_latitude: exif_data.gps_latitude.map(|v| v as f32),
gps_longitude: exif_data.gps_longitude.map(|v| v as f32),
gps_altitude: exif_data.gps_altitude.map(|v| v as f32),
focal_length: exif_data.focal_length.map(|v| v as f32),
aperture: exif_data.aperture.map(|v| v as f32),
shutter_speed: exif_data.shutter_speed,
iso: exif_data.iso,
date_taken: exif_data.date_taken,
created_time: timestamp,
last_modified: timestamp,
content_hash: None,
size_bytes: None,
};
for (file_path, relative_path) in files_needing_row {
let timestamp = Utc::now().timestamp();
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
if let Err(e) = dao.store_exif(&context, insert_exif) {
error!("Failed to store EXIF data for {}: {:?}", relative_path, e);
} else {
debug!("EXIF data stored for {}", relative_path);
// Hash + size from filesystem metadata — always attempted so
// every file gets a content_hash, even when EXIF is absent.
let (content_hash, size_bytes) = match content_hash::compute(&file_path) {
Ok(id) => (Some(id.content_hash), Some(id.size_bytes)),
Err(e) => {
warn!("Failed to hash {}: {:?}", file_path.display(), e);
(None, None)
}
};
// EXIF is best-effort enrichment. When extraction fails (or the
// file type doesn't support EXIF) we still store a row with all
// EXIF fields NULL; the file remains visible to sort-by-date
// and tag queries via its rel_path and filesystem timestamps.
let exif_fields = if exif::supports_exif(&file_path) {
match exif::extract_exif_from_path(&file_path) {
Ok(data) => Some(data),
Err(e) => {
debug!(
"No EXIF or parse error for {}: {:?}",
file_path.display(),
e
);
None
}
}
Err(e) => {
debug!(
"No EXIF data or error extracting from {}: {:?}",
file_path.display(),
e
);
}
} else {
None
};
let insert_exif = InsertImageExif {
library_id: library.id,
file_path: relative_path.clone(),
camera_make: exif_fields.as_ref().and_then(|e| e.camera_make.clone()),
camera_model: exif_fields.as_ref().and_then(|e| e.camera_model.clone()),
lens_model: exif_fields.as_ref().and_then(|e| e.lens_model.clone()),
width: exif_fields.as_ref().and_then(|e| e.width),
height: exif_fields.as_ref().and_then(|e| e.height),
orientation: exif_fields.as_ref().and_then(|e| e.orientation),
gps_latitude: exif_fields
.as_ref()
.and_then(|e| e.gps_latitude.map(|v| v as f32)),
gps_longitude: exif_fields
.as_ref()
.and_then(|e| e.gps_longitude.map(|v| v as f32)),
gps_altitude: exif_fields
.as_ref()
.and_then(|e| e.gps_altitude.map(|v| v as f32)),
focal_length: exif_fields
.as_ref()
.and_then(|e| e.focal_length.map(|v| v as f32)),
aperture: exif_fields
.as_ref()
.and_then(|e| e.aperture.map(|v| v as f32)),
shutter_speed: exif_fields.as_ref().and_then(|e| e.shutter_speed.clone()),
iso: exif_fields.as_ref().and_then(|e| e.iso),
date_taken: exif_fields.as_ref().and_then(|e| e.date_taken),
created_time: timestamp,
last_modified: timestamp,
content_hash,
size_bytes,
};
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
if let Err(e) = dao.store_exif(&context, insert_exif) {
error!("Failed to register {} in image_exif: {:?}", relative_path, e);
} else {
debug!("Registered {} in image_exif", relative_path);
}
}
}