Split main.rs: extract backfill drains and thumbnails into modules
main.rs drops from 3542 → ~2930 lines by moving:
- src/backfill.rs (new): backfill_unhashed_backlog,
backfill_missing_date_taken, backfill_missing_content_hashes,
build_face_candidates, process_face_backlog. Now unit-tested for
the first time — 5 tests covering cap behavior, library-id
filtering, missing-on-disk skip, and the video/unhashed/scanned
filters on face-candidate selection.
- src/thumbnails.rs (new): unsupported_thumbnail_sentinel,
generate_image_thumbnail, create_thumbnails, update_media_counts,
is_image, is_video, plus the IMAGE_GAUGE / VIDEO_GAUGE Prometheus
metrics. Replaces the no-op stubs that used to live in lib.rs.
4 new unit tests for the sentinel path math and the
walker-counts-images-vs-videos smoke path.
Supporting:
- SqliteExifDao::from_shared (test-only) so an SqliteExifDao and
SqliteFaceDao can share one in-memory connection — required to
test build_face_candidates against the real join.
- files.rs / video/{mod,actors}.rs import from crate::thumbnails::*
instead of the now-removed stubs in lib.rs.
cargo test --bin image-api: 325 passing (was 314).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
721
src/backfill.rs
Normal file
721
src/backfill.rs
Normal file
@@ -0,0 +1,721 @@
|
||||
//! Per-tick drains the watcher runs alongside ingest.
//!
//! These passes were previously inlined in `main.rs`. They exist because
//! a quick scan only walks recently-modified files, so any backlog of
//! rows missing a `content_hash`, a `date_taken`, or a face-detection
//! result wouldn't otherwise drain except during the once-an-hour full scan.
//! Each drain is bounded per call by a `*_PER_TICK` env-var cap;
//! `build_face_candidates` reads no cap of its own and is limited by the
//! file list it is handed.
|
||||
|
||||
use std::collections::HashMap;
|
||||
use std::path::PathBuf;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use log::{debug, info, warn};
|
||||
|
||||
use crate::content_hash;
|
||||
use crate::database::ExifDao;
|
||||
use crate::date_resolver;
|
||||
use crate::face_watch;
|
||||
use crate::faces;
|
||||
use crate::file_types;
|
||||
use crate::libraries;
|
||||
use crate::tags;
|
||||
|
||||
/// Compute and persist content_hash for image_exif rows where it's NULL.
|
||||
///
|
||||
/// Bounded per call by `FACE_HASH_BACKFILL_MAX_PER_TICK` (default 2000)
|
||||
/// so a watcher tick on a large legacy library doesn't block for hours
|
||||
/// blake3-ing every photo at once. Subsequent scans pick up the rest.
|
||||
/// For 50k+ libraries the dedicated `cargo run --bin backfill_hashes`
|
||||
/// is still faster (it doesn't fight a watcher loop for the DAO mutex).
|
||||
///
|
||||
/// Drains unhashed image_exif rows by querying them directly, independent
|
||||
/// of the filesystem walk. Quick scans only walk recently-modified files,
|
||||
/// so a backlog of pre-existing unhashed rows never enters
|
||||
/// `process_new_files`'s candidate set — left alone, it would only drain
|
||||
/// on full scans (default once an hour). Calling this every tick keeps
|
||||
/// the face-detection backlog moving regardless.
|
||||
///
|
||||
/// Returns the number of rows successfully backfilled this pass.
|
||||
pub fn backfill_unhashed_backlog(
|
||||
context: &opentelemetry::Context,
|
||||
library: &libraries::Library,
|
||||
exif_dao: &Arc<Mutex<Box<dyn ExifDao>>>,
|
||||
) -> usize {
|
||||
let cap: i64 = dotenv::var("FACE_HASH_BACKFILL_MAX_PER_TICK")
|
||||
.ok()
|
||||
.and_then(|s| s.parse().ok())
|
||||
.filter(|n: &i64| *n > 0)
|
||||
.unwrap_or(2000);
|
||||
|
||||
// Fetch up to cap+1 rows so we can tell "more remain" without a
|
||||
// separate count query. Across libraries — there's no per-library
|
||||
// filter on get_rows_missing_hash today — but we only ever update
|
||||
// rows whose library_id matches the caller's library, so other
|
||||
// libraries' rows just get skipped here and picked up on the next
|
||||
// library's tick. Negligible cost given the cap.
|
||||
let rows: Vec<(i32, String)> = {
|
||||
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
|
||||
dao.get_rows_missing_hash(context, cap + 1)
|
||||
.unwrap_or_default()
|
||||
};
|
||||
if rows.is_empty() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
let more_than_cap = rows.len() as i64 > cap;
|
||||
let base_path = std::path::Path::new(&library.root_path);
|
||||
|
||||
let mut backfilled = 0usize;
|
||||
let mut errors = 0usize;
|
||||
let mut skipped_other_lib = 0usize;
|
||||
for (lib_id, rel_path) in rows.iter().take(cap as usize) {
|
||||
if *lib_id != library.id {
|
||||
skipped_other_lib += 1;
|
||||
continue;
|
||||
}
|
||||
let abs = base_path.join(rel_path);
|
||||
if !abs.exists() {
|
||||
// File walked away — the watcher's reconciliation pass will
|
||||
// remove the orphan exif row eventually.
|
||||
continue;
|
||||
}
|
||||
match content_hash::compute(&abs) {
|
||||
Ok(id) => {
|
||||
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
|
||||
if let Err(e) = dao.backfill_content_hash(
|
||||
context,
|
||||
library.id,
|
||||
rel_path,
|
||||
&id.content_hash,
|
||||
id.size_bytes,
|
||||
) {
|
||||
warn!(
|
||||
"face_watch: backfill_content_hash failed for {}: {:?}",
|
||||
rel_path, e
|
||||
);
|
||||
errors += 1;
|
||||
} else {
|
||||
backfilled += 1;
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
debug!(
|
||||
"face_watch: hash compute failed for {} ({:?})",
|
||||
abs.display(),
|
||||
e
|
||||
);
|
||||
errors += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if backfilled > 0 || errors > 0 || more_than_cap {
|
||||
info!(
|
||||
"face_watch: backfill pass for library '{}': hashed {} ({} error(s), {} skipped to other libraries; {} cap, more_remain={})",
|
||||
library.name, backfilled, errors, skipped_other_lib, cap, more_than_cap
|
||||
);
|
||||
}
|
||||
backfilled
|
||||
}
|
||||
|
||||
/// Drain image_exif rows whose `date_taken` was never resolved or was
|
||||
/// resolved by the weakest fallback (`fs_time`). Runs the canonical-date
|
||||
/// waterfall — exiftool batch (one subprocess for the whole tick's
|
||||
/// rows) → filename regex → earliest_fs_time — and persists each
|
||||
/// resolution with its source tag. Capped per tick by
|
||||
/// `DATE_BACKFILL_MAX_PER_TICK` (default 500) so a 14k-row library
|
||||
/// drains over a few quick-scan ticks without blocking the watcher.
|
||||
///
|
||||
/// kamadak-exif is intentionally skipped here: the row already has a
|
||||
/// NULL date_taken because the ingest path's kamadak-exif call returned
|
||||
/// nothing, and re-running it would just produce the same answer.
|
||||
/// exiftool is the meaningful new attempt — it handles videos and
|
||||
/// MakerNote-hosted dates kamadak can't reach.
|
||||
pub fn backfill_missing_date_taken(
|
||||
context: &opentelemetry::Context,
|
||||
library: &libraries::Library,
|
||||
exif_dao: &Arc<Mutex<Box<dyn ExifDao>>>,
|
||||
) -> usize {
|
||||
let cap: i64 = dotenv::var("DATE_BACKFILL_MAX_PER_TICK")
|
||||
.ok()
|
||||
.and_then(|s| s.parse().ok())
|
||||
.filter(|n: &i64| *n > 0)
|
||||
.unwrap_or(500);
|
||||
|
||||
let rows: Vec<(i32, String)> = {
|
||||
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
|
||||
dao.get_rows_needing_date_backfill(context, library.id, cap + 1)
|
||||
.unwrap_or_default()
|
||||
};
|
||||
if rows.is_empty() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
let more_than_cap = rows.len() as i64 > cap;
|
||||
let base_path = std::path::Path::new(&library.root_path);
|
||||
|
||||
// Build absolute paths and drop rows whose files no longer exist —
|
||||
// the missing-file scan in library_maintenance retires deleted rows
|
||||
// separately. Without this filter, NULL-date rows for missing files
|
||||
// would loop through the drain forever (no source can resolve them).
|
||||
let mut existing: Vec<(String, PathBuf)> = Vec::with_capacity(rows.len());
|
||||
for (_, rel_path) in rows.iter().take(cap as usize) {
|
||||
let abs = base_path.join(rel_path);
|
||||
if abs.exists() {
|
||||
existing.push((rel_path.clone(), abs));
|
||||
}
|
||||
}
|
||||
if existing.is_empty() {
|
||||
return 0;
|
||||
}
|
||||
|
||||
// One exiftool subprocess for the whole batch; the resolver falls
|
||||
// through to filename / fs_time per file when exiftool can't supply
|
||||
// a date (or isn't installed at all).
|
||||
let paths: Vec<PathBuf> = existing.iter().map(|(_, p)| p.clone()).collect();
|
||||
let resolved = date_resolver::resolve_dates_batch(&paths, &HashMap::new());
|
||||
|
||||
let mut backfilled = 0usize;
|
||||
let mut unresolved = 0usize;
|
||||
let mut by_source: HashMap<&'static str, usize> = HashMap::new();
|
||||
{
|
||||
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
|
||||
for (rel_path, abs) in &existing {
|
||||
let Some(rd) = resolved.get(abs).copied() else {
|
||||
unresolved += 1;
|
||||
continue;
|
||||
};
|
||||
match dao.backfill_date_taken(
|
||||
context,
|
||||
library.id,
|
||||
rel_path,
|
||||
rd.timestamp,
|
||||
rd.source.as_str(),
|
||||
) {
|
||||
Ok(()) => {
|
||||
backfilled += 1;
|
||||
*by_source.entry(rd.source.as_str()).or_insert(0) += 1;
|
||||
}
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"date_backfill: update failed for lib {} {}: {:?}",
|
||||
library.id, rel_path, e
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if backfilled > 0 || unresolved > 0 || more_than_cap {
|
||||
info!(
|
||||
"date_backfill: library '{}': resolved {} ({:?}), {} unresolved, cap={}, more_remain={}",
|
||||
library.name, backfilled, by_source, unresolved, cap, more_than_cap
|
||||
);
|
||||
}
|
||||
backfilled
|
||||
}
|
||||
|
||||
/// Per-tick face-detection drain. Pulls a capped batch of hashed-but-
|
||||
/// unscanned image_exif rows directly via the FaceDao anti-join and
|
||||
/// hands them to the existing detection pass. Runs on every tick (not
|
||||
/// just full scans) so the backlog moves at quick-scan cadence.
|
||||
pub fn process_face_backlog(
|
||||
context: &opentelemetry::Context,
|
||||
library: &libraries::Library,
|
||||
face_client: &crate::ai::face_client::FaceClient,
|
||||
face_dao: &Arc<Mutex<Box<dyn faces::FaceDao>>>,
|
||||
tag_dao: &Arc<Mutex<Box<dyn tags::TagDao>>>,
|
||||
excluded_dirs: &[String],
|
||||
) {
|
||||
let cap: i64 = dotenv::var("FACE_BACKLOG_MAX_PER_TICK")
|
||||
.ok()
|
||||
.and_then(|s| s.parse().ok())
|
||||
.filter(|n: &i64| *n > 0)
|
||||
.unwrap_or(64);
|
||||
|
||||
let rows: Vec<(String, String)> = {
|
||||
let mut dao = face_dao.lock().expect("face dao");
|
||||
match dao.list_unscanned_candidates(context, library.id, cap) {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
warn!(
|
||||
"face_watch: list_unscanned_candidates failed for library '{}': {:?}",
|
||||
library.name, e
|
||||
);
|
||||
return;
|
||||
}
|
||||
}
|
||||
};
|
||||
if rows.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
info!(
|
||||
"face_watch: backlog drain — running detection on {} candidate(s) for library '{}' (cap={})",
|
||||
rows.len(),
|
||||
library.name,
|
||||
cap
|
||||
);
|
||||
|
||||
let candidates: Vec<face_watch::FaceCandidate> = rows
|
||||
.into_iter()
|
||||
.map(|(rel_path, content_hash)| face_watch::FaceCandidate {
|
||||
rel_path,
|
||||
content_hash,
|
||||
})
|
||||
.collect();
|
||||
|
||||
face_watch::run_face_detection_pass(
|
||||
library,
|
||||
excluded_dirs,
|
||||
face_client,
|
||||
Arc::clone(face_dao),
|
||||
Arc::clone(tag_dao),
|
||||
candidates,
|
||||
);
|
||||
}
|
||||
|
||||
/// Compute content_hash for any image rows the walker just touched
|
||||
/// whose stored EXIF row is still hash-less. Called from
|
||||
/// `process_new_files` so freshly-ingested files don't have to wait for
|
||||
/// the next standalone `backfill_unhashed_backlog` tick before face
|
||||
/// detection can key on their bytes.
|
||||
///
|
||||
/// Cap is on **successes only**. An earlier version counted errors too,
|
||||
/// so a pocket of chronically-unhashable files at the front of the
|
||||
/// table (vanished mid-scan, permission denied, etc.) burned the budget
|
||||
/// every tick and the rest of the backlog never advanced.
|
||||
pub fn backfill_missing_content_hashes(
|
||||
context: &opentelemetry::Context,
|
||||
files: &[(PathBuf, String)],
|
||||
library: &libraries::Library,
|
||||
exif_dao: &Arc<Mutex<Box<dyn ExifDao>>>,
|
||||
) {
|
||||
let image_paths: Vec<String> = files
|
||||
.iter()
|
||||
.filter(|(p, _)| !file_types::is_video_file(p))
|
||||
.map(|(_, rel)| rel.clone())
|
||||
.collect();
|
||||
if image_paths.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let exif_records = {
|
||||
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
|
||||
dao.get_exif_batch(context, Some(library.id), &image_paths)
|
||||
.unwrap_or_default()
|
||||
};
|
||||
// Cheap lookup back from rel_path → absolute file_path so
|
||||
// content_hash::compute can read the bytes.
|
||||
let path_by_rel: HashMap<String, &PathBuf> =
|
||||
files.iter().map(|(p, rel)| (rel.clone(), p)).collect();
|
||||
|
||||
let cap: usize = dotenv::var("FACE_HASH_BACKFILL_MAX_PER_TICK")
|
||||
.ok()
|
||||
.and_then(|s| s.parse().ok())
|
||||
.filter(|n: &usize| *n > 0)
|
||||
.unwrap_or(2000);
|
||||
|
||||
// Count the unhashed backlog up front so we can surface "still needs
|
||||
// backfill: N" in the log — without it, a face-scan that's stuck at
|
||||
// 44% looks stalled when really it's chipping through hashes.
|
||||
let unhashed_total = exif_records
|
||||
.iter()
|
||||
.filter(|r| r.content_hash.is_none())
|
||||
.count();
|
||||
|
||||
let mut backfilled = 0usize;
|
||||
let mut errors = 0usize;
|
||||
for record in &exif_records {
|
||||
if backfilled >= cap {
|
||||
break;
|
||||
}
|
||||
if record.content_hash.is_some() {
|
||||
continue;
|
||||
}
|
||||
let Some(file_path) = path_by_rel.get(&record.file_path) else {
|
||||
// Walked file went missing between the directory scan and now;
|
||||
// next tick will retry naturally.
|
||||
continue;
|
||||
};
|
||||
match content_hash::compute(file_path) {
|
||||
Ok(id) => {
|
||||
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
|
||||
if let Err(e) = dao.backfill_content_hash(
|
||||
context,
|
||||
library.id,
|
||||
&record.file_path,
|
||||
&id.content_hash,
|
||||
id.size_bytes,
|
||||
) {
|
||||
warn!(
|
||||
"face_watch: backfill_content_hash failed for {}: {:?}",
|
||||
record.file_path, e
|
||||
);
|
||||
errors += 1;
|
||||
} else {
|
||||
backfilled += 1;
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
debug!(
|
||||
"face_watch: hash compute failed for {} ({:?})",
|
||||
file_path.display(),
|
||||
e
|
||||
);
|
||||
errors += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
// Always log when there's an unhashed backlog so an operator
|
||||
// looking at "scan stuck at 44%" can see backfill is running and
|
||||
// how much remains. Quiet only when there's nothing to do.
|
||||
if unhashed_total > 0 || backfilled > 0 || errors > 0 {
|
||||
let remaining = unhashed_total.saturating_sub(backfilled);
|
||||
info!(
|
||||
"face_watch: backfilled {}/{} content_hash for library '{}' ({} error(s); {} still need backfill; cap={})",
|
||||
backfilled, unhashed_total, library.name, errors, remaining, cap
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
/// Build the face-detection candidate list for a scan tick.
|
||||
///
|
||||
/// Returns `(rel_path, content_hash)` for every image file that has a
|
||||
/// content_hash recorded in image_exif but no row in face_detections
|
||||
/// yet. Re-querying image_exif here picks up rows the EXIF write loop
|
||||
/// just inserted alongside any pre-existing rows the watcher walked
|
||||
/// over — covers both new uploads and the initial backlog scan.
|
||||
pub fn build_face_candidates(
|
||||
context: &opentelemetry::Context,
|
||||
library: &libraries::Library,
|
||||
files: &[(PathBuf, String)],
|
||||
exif_dao: &Arc<Mutex<Box<dyn ExifDao>>>,
|
||||
face_dao: &Arc<Mutex<Box<dyn faces::FaceDao>>>,
|
||||
) -> Vec<face_watch::FaceCandidate> {
|
||||
// Restrict to image files; videos aren't face-scanned in v1 (kamadak
|
||||
// doesn't even register them in image_exif).
|
||||
let image_paths: Vec<String> = files
|
||||
.iter()
|
||||
.filter(|(p, _)| !file_types::is_video_file(p))
|
||||
.map(|(_, rel)| rel.clone())
|
||||
.collect();
|
||||
if image_paths.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
|
||||
let exif_records = {
|
||||
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
|
||||
dao.get_exif_batch(context, Some(library.id), &image_paths)
|
||||
.unwrap_or_default()
|
||||
};
|
||||
// rel_path → content_hash (only rows with a hash; without one we have
|
||||
// nothing to key face data against).
|
||||
let mut hash_by_path: HashMap<String, String> = HashMap::with_capacity(exif_records.len());
|
||||
for record in exif_records {
|
||||
if let Some(h) = record.content_hash {
|
||||
hash_by_path.insert(record.file_path, h);
|
||||
}
|
||||
}
|
||||
|
||||
let mut candidates = Vec::new();
|
||||
let mut dao = face_dao.lock().expect("face dao");
|
||||
for rel_path in image_paths {
|
||||
let Some(hash) = hash_by_path.get(&rel_path) else {
|
||||
continue;
|
||||
};
|
||||
match dao.already_scanned(context, hash) {
|
||||
Ok(true) => continue,
|
||||
Ok(false) => candidates.push(face_watch::FaceCandidate {
|
||||
rel_path,
|
||||
content_hash: hash.clone(),
|
||||
}),
|
||||
Err(e) => {
|
||||
warn!("face_watch: already_scanned errored for {}: {:?}", hash, e);
|
||||
}
|
||||
}
|
||||
}
|
||||
candidates
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    use std::fs;
    use std::sync::{Arc, Mutex};

    use diesel::prelude::*;
    use tempfile::TempDir;

    use crate::database::models::{InsertImageExif, InsertLibrary};
    use crate::database::test::in_memory_db_connection;
    use crate::database::{ExifDao, SqliteExifDao, schema};
    use crate::faces::{FaceDao, SqliteFaceDao};
    use crate::libraries::Library;

    /// Sets an environment variable for as long as the guard is alive and
    /// removes it on drop — including when the owning test unwinds from a
    /// failed assertion, so the override can't leak into later tests.
    struct EnvVarGuard {
        key: &'static str,
    }

    impl EnvVarGuard {
        fn set(key: &'static str, value: &str) -> Self {
            // SAFETY: NOTE(review) — the environment is process-global and
            // tests may run on several threads; this relies on no other
            // thread mutating the environment at the same moment. Confirm
            // or serialize if more env-driven tests are added.
            unsafe {
                std::env::set_var(key, value);
            }
            Self { key }
        }
    }

    impl Drop for EnvVarGuard {
        fn drop(&mut self) {
            // SAFETY: same caveat as `EnvVarGuard::set`.
            unsafe {
                std::env::remove_var(self.key);
            }
        }
    }

    /// Fresh, empty telemetry context for DAO calls.
    fn ctx() -> opentelemetry::Context {
        opentelemetry::Context::new()
    }

    /// Build a tempdir-backed library + DAOs sharing a single in-memory
    /// SQLite connection (so cross-table joins like
    /// `list_unscanned_candidates` see consistent state).
    ///
    /// The returned `TempDir` must be kept alive for the duration of the
    /// test; dropping it removes the library root.
    fn setup() -> (
        TempDir,
        Library,
        Arc<Mutex<diesel::SqliteConnection>>,
        Arc<Mutex<Box<dyn ExifDao>>>,
        Arc<Mutex<Box<dyn FaceDao>>>,
    ) {
        let tmp = TempDir::new().expect("tempdir");
        let mut conn = in_memory_db_connection();
        // Migration seeds library id=1 with a placeholder root; rewrite it
        // to point at the tempdir so `<root>/<rel_path>` resolves to real
        // files this test creates.
        diesel::update(schema::libraries::table.filter(schema::libraries::id.eq(1)))
            .set(schema::libraries::root_path.eq(tmp.path().to_string_lossy().to_string()))
            .execute(&mut conn)
            .expect("rewrite library 1 root");
        // Add a second library so cross-library skip cases have somewhere
        // to put their rows.
        diesel::insert_into(schema::libraries::table)
            .values(InsertLibrary {
                name: "other",
                root_path: "/tmp/other-test-lib",
                created_at: 0,
                enabled: true,
                excluded_dirs: None,
            })
            .execute(&mut conn)
            .expect("seed second library");

        let library = Library {
            id: 1,
            name: "main".to_string(),
            root_path: tmp.path().to_string_lossy().to_string(),
            enabled: true,
            excluded_dirs: Vec::new(),
        };
        let shared = Arc::new(Mutex::new(conn));
        let exif_dao: Arc<Mutex<Box<dyn ExifDao>>> = Arc::new(Mutex::new(Box::new(
            SqliteExifDao::from_shared(Arc::clone(&shared)),
        )));
        let face_dao: Arc<Mutex<Box<dyn FaceDao>>> = Arc::new(Mutex::new(Box::new(
            SqliteFaceDao::from_connection(Arc::clone(&shared)),
        )));
        (tmp, library, shared, exif_dao, face_dao)
    }

    /// Insert a bare image_exif row: only library, path, and (optionally)
    /// content hash are populated; every other column is empty/zero.
    fn insert_exif(
        exif_dao: &Arc<Mutex<Box<dyn ExifDao>>>,
        lib_id: i32,
        rel: &str,
        content_hash: Option<&str>,
    ) {
        let mut dao = exif_dao.lock().unwrap();
        dao.store_exif(
            &ctx(),
            InsertImageExif {
                library_id: lib_id,
                file_path: rel.to_string(),
                camera_make: None,
                camera_model: None,
                lens_model: None,
                width: None,
                height: None,
                orientation: None,
                gps_latitude: None,
                gps_longitude: None,
                gps_altitude: None,
                focal_length: None,
                aperture: None,
                shutter_speed: None,
                iso: None,
                date_taken: None,
                created_time: 0,
                last_modified: 0,
                content_hash: content_hash.map(|s| s.to_string()),
                size_bytes: None,
                phash_64: None,
                dhash_64: None,
                date_taken_source: None,
            },
        )
        .expect("insert");
    }

    /// Write `bytes` to `<root>/<rel>`, creating parent directories first.
    fn write_image(root: &std::path::Path, rel: &str, bytes: &[u8]) {
        let abs = root.join(rel);
        if let Some(parent) = abs.parent() {
            fs::create_dir_all(parent).expect("mkdir");
        }
        fs::write(abs, bytes).expect("write file");
    }

    #[test]
    fn backfill_unhashed_backlog_hashes_missing_rows_in_this_library() {
        let (tmp, library, _conn, exif_dao, _face_dao) = setup();
        write_image(tmp.path(), "a.jpg", b"alpha-bytes");
        write_image(tmp.path(), "b.jpg", b"bravo-bytes");
        insert_exif(&exif_dao, 1, "a.jpg", None);
        insert_exif(&exif_dao, 1, "b.jpg", None);

        let backfilled = backfill_unhashed_backlog(&ctx(), &library, &exif_dao);
        assert_eq!(backfilled, 2);

        let mut dao = exif_dao.lock().unwrap();
        let rows = dao
            .get_exif_batch(&ctx(), Some(1), &["a.jpg".to_string(), "b.jpg".to_string()])
            .unwrap();
        assert_eq!(rows.len(), 2);
        for r in rows {
            assert!(
                r.content_hash.is_some(),
                "row {} should have a hash",
                r.file_path
            );
        }
    }

    #[test]
    fn backfill_unhashed_backlog_skips_other_libraries_and_missing_files() {
        let (tmp, library, _conn, exif_dao, _face_dao) = setup();
        write_image(tmp.path(), "exists.jpg", b"hello");
        // Row for this library whose file is missing on disk:
        insert_exif(&exif_dao, 1, "ghost.jpg", None);
        insert_exif(&exif_dao, 1, "exists.jpg", None);
        // Row in the other library — must be skipped (different lib_id).
        insert_exif(&exif_dao, 2, "other.jpg", None);

        let backfilled = backfill_unhashed_backlog(&ctx(), &library, &exif_dao);
        assert_eq!(backfilled, 1, "only the existing in-library file hashes");

        let mut dao = exif_dao.lock().unwrap();
        let other = dao
            .get_exif_batch(&ctx(), Some(2), &["other.jpg".to_string()])
            .unwrap();
        assert_eq!(other.len(), 1);
        assert!(
            other[0].content_hash.is_none(),
            "other-library row must remain unhashed"
        );
        let ghost = dao
            .get_exif_batch(&ctx(), Some(1), &["ghost.jpg".to_string()])
            .unwrap();
        assert_eq!(ghost.len(), 1);
        assert!(
            ghost[0].content_hash.is_none(),
            "missing-on-disk row stays unhashed (reconciliation removes it later)"
        );
    }

    #[test]
    fn backfill_unhashed_backlog_respects_per_tick_cap() {
        // Env-var-driven cap; the function re-reads it on every call, so
        // it can be overridden just for this test. The guard removes the
        // override when it drops, even if the assertion below panics.
        // Tests in the same binary share the environment; we only care
        // that the cap shape (success count <= cap) holds.
        let _cap_override = EnvVarGuard::set("FACE_HASH_BACKFILL_MAX_PER_TICK", "2");
        let (tmp, library, _conn, exif_dao, _face_dao) = setup();
        for i in 0..5 {
            let rel = format!("img_{}.jpg", i);
            write_image(tmp.path(), &rel, format!("bytes-{}", i).as_bytes());
            insert_exif(&exif_dao, 1, &rel, None);
        }

        let backfilled = backfill_unhashed_backlog(&ctx(), &library, &exif_dao);
        assert_eq!(backfilled, 2, "cap=2 must bound the per-tick successes");
    }

    #[test]
    fn backfill_missing_content_hashes_skips_videos_and_hashed_rows() {
        let (tmp, library, _conn, exif_dao, _face_dao) = setup();
        // Two image rows (one already hashed, one not), one video.
        write_image(tmp.path(), "fresh.jpg", b"fresh-pixels");
        write_image(tmp.path(), "already.jpg", b"already-pixels");
        write_image(tmp.path(), "clip.mp4", b"video-bytes");
        insert_exif(&exif_dao, 1, "fresh.jpg", None);
        insert_exif(&exif_dao, 1, "already.jpg", Some("pre-existing-hash"));
        insert_exif(&exif_dao, 1, "clip.mp4", None);

        let files: Vec<(PathBuf, String)> = vec![
            (tmp.path().join("fresh.jpg"), "fresh.jpg".to_string()),
            (tmp.path().join("already.jpg"), "already.jpg".to_string()),
            (tmp.path().join("clip.mp4"), "clip.mp4".to_string()),
        ];
        backfill_missing_content_hashes(&ctx(), &files, &library, &exif_dao);

        let mut dao = exif_dao.lock().unwrap();
        let rows = dao
            .get_exif_batch(
                &ctx(),
                Some(1),
                &[
                    "fresh.jpg".to_string(),
                    "already.jpg".to_string(),
                    "clip.mp4".to_string(),
                ],
            )
            .unwrap();
        let by_path: HashMap<String, Option<String>> = rows
            .into_iter()
            .map(|r| (r.file_path, r.content_hash))
            .collect();
        assert!(
            by_path["fresh.jpg"].is_some(),
            "fresh image must get a hash"
        );
        assert_eq!(
            by_path["already.jpg"].as_deref(),
            Some("pre-existing-hash"),
            "already-hashed image left untouched"
        );
        assert!(
            by_path["clip.mp4"].is_none(),
            "video skipped (not face-scanned, no hash needed via this path)"
        );
    }

    #[test]
    fn build_face_candidates_filters_videos_unhashed_and_already_scanned() {
        let (tmp, library, _conn, exif_dao, face_dao) = setup();

        // Seed image_exif with: hashed unscanned, hashed scanned, unhashed,
        // and a video. Files don't need to exist on disk — the function
        // doesn't read them, only the DB rows.
        insert_exif(&exif_dao, 1, "fresh.jpg", Some("hash-fresh"));
        insert_exif(&exif_dao, 1, "scanned.jpg", Some("hash-scanned"));
        insert_exif(&exif_dao, 1, "unhashed.jpg", None);
        insert_exif(&exif_dao, 1, "clip.mp4", Some("hash-video"));
        // Mark `scanned.jpg`'s hash as already detected.
        {
            let mut dao = face_dao.lock().unwrap();
            dao.mark_status(&ctx(), 1, "hash-scanned", "scanned.jpg", "no_faces", "test")
                .expect("mark scanned");
        }

        let files: Vec<(PathBuf, String)> = vec![
            (tmp.path().join("fresh.jpg"), "fresh.jpg".to_string()),
            (tmp.path().join("scanned.jpg"), "scanned.jpg".to_string()),
            (tmp.path().join("unhashed.jpg"), "unhashed.jpg".to_string()),
            (tmp.path().join("clip.mp4"), "clip.mp4".to_string()),
        ];
        let candidates = build_face_candidates(&ctx(), &library, &files, &exif_dao, &face_dao);

        assert_eq!(
            candidates.len(),
            1,
            "exactly fresh.jpg should be a candidate"
        );
        assert_eq!(candidates[0].rel_path, "fresh.jpg");
        assert_eq!(candidates[0].content_hash, "hash-fresh");
    }
}
|
||||
Reference in New Issue
Block a user