feat: add content_hash backfill + register every media file

Adds blake3 content hashing as the basis for derivative dedup
(thumbnails, HLS) across libraries. Hashes are computed inline by the
watcher on ingest and by a new `backfill_hashes` binary for historical
rows.
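
A minimal sketch of the hashing primitive, assuming the `blake3` crate's
streaming `Hasher`; the real `content_hash::compute` in this commit may
differ in naming and error handling:

    use std::{fs::File, io, path::Path};

    pub struct FileIdentity {
        pub content_hash: String, // hex-encoded blake3 digest
        pub size_bytes: i64,
    }

    pub fn compute(path: &Path) -> io::Result<FileIdentity> {
        let mut file = File::open(path)?;
        let mut hasher = blake3::Hasher::new();
        // Stream the file through the hasher so large videos never
        // have to fit in memory; io::copy also yields the byte count.
        let size_bytes = io::copy(&mut file, &mut hasher)? as i64;
        Ok(FileIdentity {
            content_hash: hasher.finalize().to_hex().to_string(),
            size_bytes,
        })
    }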

Key changes:
- `content_hash` and `size_bytes` are now populated on new image_exif
  rows; a new ExifDao surface (`get_rows_missing_hash`,
  `backfill_content_hash`, `find_by_content_hash`) supports backfill and
  future hash-keyed lookups (sketched after this list).
- The watcher now registers every image/video in image_exif, not just
  files with parseable EXIF. EXIF becomes optional enrichment; videos
  and other non-EXIF files still get a hashed row. This also makes
  DB-indexed sort/filter cover the full library.
- The `/image` thumbnail route now does a dual lookup: the hash-keyed
  path first, then the legacy mirrored layout as a fallback (see the
  lookup sketch below).
- The upload flow accepts a `?library=` query param and hashes uploaded
  files.
- `store_exif` now logs the underlying Diesel error on insert failure so
  constraint violations surface instead of hiding behind a generic
  InsertError.
- A new migration normalizes rel_path separators to forward slashes
  across all tables, deduplicating any rows that collide after
  normalization (see the SQL sketch below). This fixes spurious UNIQUE
  violations from mixed backslash/forward-slash paths on Windows ingest.
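
The new ExifDao surface, reconstructed as a sketch from how
`backfill_hashes.rs` (below) calls it; the placeholder error type and the
exact shape of `find_by_content_hash` are assumptions beyond this diff:

    // Placeholder; the crate's real DB error type is not shown here.
    pub type DbError = Box<dyn std::error::Error + Send + Sync>;

    pub trait ExifDao {
        /// One page of rows still missing a hash, as (library_id, rel_path).
        fn get_rows_missing_hash(
            &mut self,
            ctx: &opentelemetry::Context,
            limit: i64,
        ) -> Result<Vec<(i32, String)>, DbError>;

        /// Write a computed hash and size back onto one row.
        fn backfill_content_hash(
            &mut self,
            ctx: &opentelemetry::Context,
            library_id: i32,
            rel_path: &str,
            content_hash: &str,
            size_bytes: i64,
        ) -> Result<(), DbError>;

        /// Hash-keyed lookup for derivative dedup.
        fn find_by_content_hash(
            &mut self,
            ctx: &opentelemetry::Context,
            content_hash: &str,
        ) -> Result<Vec<(i32, String)>, DbError>;
    }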
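
The `/image` dual lookup, as a minimal sketch; `thumb_path_for_hash` and
`legacy_mirror_path` are hypothetical helpers standing in for a cache
layout this diff doesn't show:

    use std::path::PathBuf;

    // Hypothetical hash-keyed layout: thumb_cache/ab/<hash>.jpg.
    fn thumb_path_for_hash(hash: &str) -> PathBuf {
        PathBuf::from("thumb_cache")
            .join(&hash[..2])
            .join(format!("{hash}.jpg"))
    }

    // Hypothetical legacy layout mirroring the library tree.
    fn legacy_mirror_path(rel_path: &str) -> PathBuf {
        PathBuf::from("thumb_mirror").join(rel_path)
    }

    fn resolve_thumbnail(content_hash: Option<&str>, rel_path: &str) -> Option<PathBuf> {
        if let Some(hash) = content_hash {
            let hashed = thumb_path_for_hash(hash);
            if hashed.exists() {
                return Some(hashed); // hash-keyed derivative wins when present
            }
        }
        // Fall back to the pre-hash mirrored layout keyed by rel_path.
        let legacy = legacy_mirror_path(rel_path);
        legacy.exists().then_some(legacy)
    }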
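
And a sketch of the separator normalization for one table, assuming
SQLite with default rowids; the real migration covers every table with a
`rel_path` column, and keep-lowest-rowid is an assumed dedup rule:

    -- Drop duplicates that would collide once separators are normalized,
    -- keeping the lowest rowid in each (library_id, normalized path) group.
    DELETE FROM image_exif
    WHERE rowid NOT IN (
        SELECT MIN(rowid)
        FROM image_exif
        GROUP BY library_id, REPLACE(rel_path, '\', '/')
    );

    -- Then rewrite the surviving paths in place.
    UPDATE image_exif
    SET rel_path = REPLACE(rel_path, '\', '/')
    WHERE rel_path <> REPLACE(rel_path, '\', '/');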

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
Cameron, 2026-04-17 16:25:39 -04:00 (committed by cameron)
commit 0aaea91cc2, parent ce5b337582
11 changed files with 681 additions and 69 deletions

src/bin/backfill_hashes.rs (new file, 184 lines)

@@ -0,0 +1,184 @@
//! Backfill `image_exif.content_hash` + `size_bytes` for rows that were
//! ingested before hash computation was wired into the watcher.
//!
//! The watcher computes hashes for new files as they're ingested, so this
//! binary is a one-shot tool for the historical backlog. Safe to re-run;
//! only rows with NULL content_hash are processed.

use std::path::Path;
use std::sync::{Arc, Mutex};
use std::time::Instant;

use clap::Parser;
use rayon::prelude::*;

use image_api::content_hash;
use image_api::database::{ExifDao, SqliteExifDao, connect};
use image_api::libraries::{self, Library};

#[derive(Parser, Debug)]
#[command(name = "backfill_hashes")]
#[command(about = "Compute content_hash for image_exif rows missing one")]
struct Args {
    /// Max rows to hash per batch. The process loops until no rows remain.
    #[arg(long, default_value_t = 500)]
    batch_size: i64,

    /// Rayon parallelism override. 0 uses the default thread pool size.
    #[arg(long, default_value_t = 0)]
    parallelism: usize,

    /// Dry-run: log what would be hashed without writing to the DB.
    #[arg(long)]
    dry_run: bool,
}

fn main() -> anyhow::Result<()> {
    env_logger::init();
    dotenv::dotenv().ok();
    let args = Args::parse();

    if args.parallelism > 0 {
        rayon::ThreadPoolBuilder::new()
            .num_threads(args.parallelism)
            .build_global()
            .expect("Unable to configure rayon thread pool");
    }

    // Resolve libraries (patch placeholder if still unset) so we can map
    // library_id back to a root_path on disk.
    let base_path = dotenv::var("BASE_PATH").ok();
    let mut seed_conn = connect();
    if let Some(base) = base_path.as_deref() {
        libraries::seed_or_patch_from_env(&mut seed_conn, base);
    }
    let libs = libraries::load_all(&mut seed_conn);
    drop(seed_conn);

    if libs.is_empty() {
        anyhow::bail!("No libraries configured; cannot backfill hashes");
    }

    let libs_by_id: std::collections::HashMap<i32, Library> =
        libs.into_iter().map(|lib| (lib.id, lib)).collect();
    println!(
        "Configured libraries: {}",
        libs_by_id
            .values()
            .map(|l| format!("{} -> {}", l.name, l.root_path))
            .collect::<Vec<_>>()
            .join(", ")
    );

    let dao: Arc<Mutex<Box<dyn ExifDao>>> =
        Arc::new(Mutex::new(Box::new(SqliteExifDao::new())));
    let ctx = opentelemetry::Context::new();

    let mut total_hashed = 0u64;
    let mut total_missing = 0u64;
    let mut total_errors = 0u64;
    let start = Instant::now();

    loop {
        let rows = {
            let mut guard = dao.lock().expect("Unable to lock ExifDao");
            guard
                .get_rows_missing_hash(&ctx, args.batch_size)
                .map_err(|e| anyhow::anyhow!("DB error: {:?}", e))?
        };
        if rows.is_empty() {
            break;
        }
        println!("Processing batch of {} rows", rows.len());

        // Compute hashes in parallel (I/O-bound; rayon helps on local disks,
        // throttled by network on SMB mounts; use --parallelism to tune).
        let results: Vec<(i32, String, Option<content_hash::FileIdentity>)> = rows
            .into_par_iter()
            .map(|(library_id, rel_path)| {
                let abs = libs_by_id
                    .get(&library_id)
                    .map(|lib| Path::new(&lib.root_path).join(&rel_path));
                match abs {
                    Some(abs_path) if abs_path.exists() => {
                        match content_hash::compute(&abs_path) {
                            Ok(id) => (library_id, rel_path, Some(id)),
                            Err(e) => {
                                eprintln!("hash error for {}: {:?}", abs_path.display(), e);
                                (library_id, rel_path, None)
                            }
                        }
                    }
                    Some(_) => (library_id, rel_path, None), // file missing on disk
                    None => {
                        eprintln!("Row refers to unknown library_id {}", library_id);
                        (library_id, rel_path, None)
                    }
                }
            })
            .collect();

        // Persist sequentially; SQLite writes serialize anyway.
        if !args.dry_run {
            let mut guard = dao.lock().expect("Unable to lock ExifDao");
            for (library_id, rel_path, ident) in &results {
                match ident {
                    Some(id) => {
                        match guard.backfill_content_hash(
                            &ctx,
                            *library_id,
                            rel_path,
                            &id.content_hash,
                            id.size_bytes,
                        ) {
                            Ok(_) => total_hashed += 1,
                            Err(e) => {
                                eprintln!("persist error for {}: {:?}", rel_path, e);
                                total_errors += 1;
                            }
                        }
                    }
                    None => {
                        total_missing += 1;
                    }
                }
            }
        } else {
            for (_, rel_path, ident) in &results {
                match ident {
                    Some(id) => {
                        println!(
                            "[dry-run] {} -> {} ({} bytes)",
                            rel_path, id.content_hash, id.size_bytes
                        );
                        total_hashed += 1;
                    }
                    None => {
                        total_missing += 1;
                    }
                }
            }
            println!(
                "[dry-run] processed one batch of {}. Stopping; a real run would continue \
                 until no NULL content_hash rows remain.",
                results.len()
            );
            break;
        }

        let elapsed = start.elapsed().as_secs_f64().max(0.001);
        let rate = total_hashed as f64 / elapsed;
        println!(
            " hashed={} missing={} errors={} ({:.1} files/sec)",
            total_hashed, total_missing, total_errors, rate
        );
    }

    println!();
    println!(
        "Done. hashed={}, skipped (missing on disk)={}, errors={}, elapsed={:.1}s",
        total_hashed,
        total_missing,
        total_errors,
        start.elapsed().as_secs_f64()
    );
    Ok(())
}
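
A dry-run pass can be previewed with `cargo run --release --bin backfill_hashes -- --dry-run`, then the full backlog processed with, e.g., `cargo run --release --bin backfill_hashes -- --batch-size 1000 --parallelism 8`; the kebab-case flag names follow from the clap derive above.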


@@ -67,7 +67,7 @@ fn main() -> anyhow::Result<()> {
     let context = opentelemetry::Context::new();
     let relative_path = match path.strip_prefix(&base) {
-        Ok(p) => p.to_str().unwrap().to_string(),
+        Ok(p) => p.to_str().unwrap().replace('\\', "/"),
         Err(_) => {
             eprintln!(
                 "Error: Could not create relative path for {}",