feat: add content_hash backfill + register every media file
Adds blake3 content hashing as the basis for derivative dedup (thumbnails, HLS) across libraries. Computed inline by the watcher on ingest and by a new `backfill_hashes` binary for historical rows.

Key changes:
- `content_hash` and `size_bytes` are now populated on new image_exif rows; a new ExifDao surface (`get_rows_missing_hash`, `backfill_content_hash`, `find_by_content_hash`) supports backfill and future hash-keyed lookups.
- The watcher now registers every image/video in image_exif, not just files with parseable EXIF. EXIF becomes optional enrichment; videos and other non-EXIF files still get a hashed row. This also makes DB-indexed sort/filter cover the full library.
- `/image` thumbnail serving looks up the hash-keyed path first, then falls back to the legacy mirrored layout.
- Upload flow accepts a `?library=` query param and hashes uploaded files.
- `store_exif` logs the underlying Diesel error on insert failure so constraint violations surface instead of hiding behind a generic InsertError.
- New migration normalizes rel_path separators to forward slash across all tables, deduplicating any rows that collide after normalization. Fixes spurious UNIQUE violations from mixed backslash/forward-slash paths on Windows ingest.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -312,6 +312,35 @@ pub trait ExifDao: Sync + Send {
|
||||
base_path: &str,
|
||||
recursive: bool,
|
||||
) -> Result<Vec<(String, f64, f64, Option<i64>)>, DbError>;
|
||||
|
||||
/// Return rows that still lack a `content_hash`, oldest first. Used by
/// the `backfill_hashes` binary to batch through the historical
/// backlog. Returns `(library_id, rel_path)` tuples so the caller can
/// resolve each file on disk.
///
/// `limit` caps the batch size; an empty result means the backlog is
/// exhausted. NOTE(review): "oldest first" appears to mean ascending
/// row id, not timestamp — confirm against the impl.
fn get_rows_missing_hash(
    &mut self,
    context: &opentelemetry::Context,
    limit: i64,
) -> Result<Vec<(i32, String)>, DbError>;
|
||||
|
||||
/// Persist the computed blake3 hash + file size for an existing row.
///
/// The row is addressed by `(library_id, rel_path)`. Matching zero rows
/// (e.g. the file was deleted between the missing-hash scan and this
/// write) is not reported as an error — callers that need to know must
/// re-check. `hash` is stored verbatim; `size_bytes` is the on-disk
/// file size at hashing time.
fn backfill_content_hash(
    &mut self,
    context: &opentelemetry::Context,
    library_id: i32,
    rel_path: &str,
    hash: &str,
    size_bytes: i64,
) -> Result<(), DbError>;
|
||||
|
||||
/// Return the first EXIF row with the given content hash (any library).
/// Used by thumbnail/HLS generation to detect pre-existing derivatives
/// from another library before regenerating.
///
/// Returns `Ok(None)` when no row matches. If several rows share the
/// hash (duplicate files across libraries), which one is returned is
/// unspecified — callers must not depend on a particular library.
fn find_by_content_hash(
    &mut self,
    context: &opentelemetry::Context,
    hash: &str,
) -> Result<Option<ImageExif>, DbError>;
|
||||
}
|
||||
|
||||
pub struct SqliteExifDao {
|
||||
@@ -346,13 +375,21 @@ impl ExifDao for SqliteExifDao {
|
||||
diesel::insert_into(image_exif)
|
||||
.values(&exif_data)
|
||||
.execute(connection.deref_mut())
|
||||
.map_err(|_| anyhow::anyhow!("Insert error"))?;
|
||||
.map_err(|e| {
|
||||
log::warn!(
|
||||
"image_exif insert failed (lib={}, rel_path={:?}): {}",
|
||||
exif_data.library_id,
|
||||
exif_data.file_path,
|
||||
e
|
||||
);
|
||||
anyhow::anyhow!("Insert error: {}", e)
|
||||
})?;
|
||||
|
||||
image_exif
|
||||
.filter(library_id.eq(exif_data.library_id))
|
||||
.filter(rel_path.eq(&exif_data.file_path))
|
||||
.first::<ImageExif>(connection.deref_mut())
|
||||
.map_err(|_| anyhow::anyhow!("Query error"))
|
||||
.map_err(|e| anyhow::anyhow!("Post-insert lookup failed: {}", e))
|
||||
})
|
||||
.map_err(|_| DbError::new(DbErrorKind::InsertError))
|
||||
}
|
||||
@@ -672,4 +709,70 @@ impl ExifDao for SqliteExifDao {
|
||||
})
|
||||
.map_err(|_| DbError::new(DbErrorKind::QueryError))
|
||||
}
|
||||
|
||||
fn get_rows_missing_hash(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
limit: i64,
|
||||
) -> Result<Vec<(i32, String)>, DbError> {
|
||||
trace_db_call(context, "query", "get_rows_missing_hash", |_span| {
|
||||
use schema::image_exif::dsl::*;
|
||||
|
||||
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
|
||||
|
||||
image_exif
|
||||
.filter(content_hash.is_null())
|
||||
.select((library_id, rel_path))
|
||||
.order(id.asc())
|
||||
.limit(limit)
|
||||
.load::<(i32, String)>(connection.deref_mut())
|
||||
.map_err(|_| anyhow::anyhow!("Query error"))
|
||||
})
|
||||
.map_err(|_| DbError::new(DbErrorKind::QueryError))
|
||||
}
|
||||
|
||||
fn backfill_content_hash(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
library_id_val: i32,
|
||||
rel_path_val: &str,
|
||||
hash: &str,
|
||||
size_val: i64,
|
||||
) -> Result<(), DbError> {
|
||||
trace_db_call(context, "update", "backfill_content_hash", |_span| {
|
||||
use schema::image_exif::dsl::*;
|
||||
|
||||
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
|
||||
|
||||
diesel::update(
|
||||
image_exif
|
||||
.filter(library_id.eq(library_id_val))
|
||||
.filter(rel_path.eq(rel_path_val)),
|
||||
)
|
||||
.set((content_hash.eq(hash), size_bytes.eq(size_val)))
|
||||
.execute(connection.deref_mut())
|
||||
.map(|_| ())
|
||||
.map_err(|_| anyhow::anyhow!("Update error"))
|
||||
})
|
||||
.map_err(|_| DbError::new(DbErrorKind::UpdateError))
|
||||
}
|
||||
|
||||
fn find_by_content_hash(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
hash: &str,
|
||||
) -> Result<Option<ImageExif>, DbError> {
|
||||
trace_db_call(context, "query", "find_by_content_hash", |_span| {
|
||||
use schema::image_exif::dsl::*;
|
||||
|
||||
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
|
||||
|
||||
image_exif
|
||||
.filter(content_hash.eq(hash))
|
||||
.first::<ImageExif>(connection.deref_mut())
|
||||
.optional()
|
||||
.map_err(|_| anyhow::anyhow!("Query error"))
|
||||
})
|
||||
.map_err(|_| DbError::new(DbErrorKind::QueryError))
|
||||
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user