Pulls cargo fmt + clippy pass over the new files only — pre-existing files left untouched even though fmt has drift on them. clamp(1,200) swaps a manual min/max chain that clippy flagged. test AppState constructor needed ClipClient::new(None) so the lib-test target compiles. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
792 lines
29 KiB
Rust
792 lines
29 KiB
Rust
//! Per-tick drains the watcher runs alongside ingest.
|
|
//!
|
|
//! These passes were previously inlined in `main.rs`; they exist because
|
|
//! a quick scan only walks recently-modified files, so any backlog of
|
|
//! rows missing a `content_hash` / `date_taken` / face detection
|
|
//! wouldn't otherwise drain except during the once-an-hour full scan.
|
|
//! Each function is bounded per call by a `*_PER_TICK` env-var cap.
|
|
|
|
use std::collections::HashMap;
|
|
use std::path::PathBuf;
|
|
use std::sync::{Arc, Mutex};
|
|
|
|
use log::{debug, info, warn};
|
|
|
|
use crate::content_hash;
|
|
use crate::database::ExifDao;
|
|
use crate::date_resolver;
|
|
use crate::face_watch;
|
|
use crate::faces;
|
|
use crate::file_types;
|
|
use crate::libraries;
|
|
use crate::tags;
|
|
|
|
/// Compute and persist content_hash for image_exif rows where it's NULL.
|
|
///
|
|
/// Bounded per call by `FACE_HASH_BACKFILL_MAX_PER_TICK` (default 2000)
|
|
/// so a watcher tick on a large legacy library doesn't block for hours
|
|
/// blake3-ing every photo at once. Subsequent scans pick up the rest.
|
|
/// For 50k+ libraries the dedicated `cargo run --bin backfill_hashes`
|
|
/// is still faster (it doesn't fight a watcher loop for the DAO mutex).
|
|
///
|
|
/// Drains unhashed image_exif rows by querying them directly, independent
|
|
/// of the filesystem walk. Quick scans only walk recently-modified files,
|
|
/// so a backlog of pre-existing unhashed rows never enters
|
|
/// `process_new_files`'s candidate set — left alone, it would only drain
|
|
/// on full scans (default once an hour). Calling this every tick keeps
|
|
/// the face-detection backlog moving regardless.
|
|
///
|
|
/// Returns the number of rows successfully backfilled this pass.
|
|
pub fn backfill_unhashed_backlog(
|
|
context: &opentelemetry::Context,
|
|
library: &libraries::Library,
|
|
exif_dao: &Arc<Mutex<Box<dyn ExifDao>>>,
|
|
) -> usize {
|
|
let cap: i64 = dotenv::var("FACE_HASH_BACKFILL_MAX_PER_TICK")
|
|
.ok()
|
|
.and_then(|s| s.parse().ok())
|
|
.filter(|n: &i64| *n > 0)
|
|
.unwrap_or(2000);
|
|
|
|
// Fetch up to cap+1 rows so we can tell "more remain" without a
|
|
// separate count query. Across libraries — there's no per-library
|
|
// filter on get_rows_missing_hash today — but we only ever update
|
|
// rows whose library_id matches the caller's library, so other
|
|
// libraries' rows just get skipped here and picked up on the next
|
|
// library's tick. Negligible cost given the cap.
|
|
let rows: Vec<(i32, String)> = {
|
|
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
|
|
dao.get_rows_missing_hash(context, cap + 1)
|
|
.unwrap_or_default()
|
|
};
|
|
if rows.is_empty() {
|
|
return 0;
|
|
}
|
|
|
|
let more_than_cap = rows.len() as i64 > cap;
|
|
let base_path = std::path::Path::new(&library.root_path);
|
|
|
|
let mut backfilled = 0usize;
|
|
let mut errors = 0usize;
|
|
let mut skipped_other_lib = 0usize;
|
|
for (lib_id, rel_path) in rows.iter().take(cap as usize) {
|
|
if *lib_id != library.id {
|
|
skipped_other_lib += 1;
|
|
continue;
|
|
}
|
|
let abs = base_path.join(rel_path);
|
|
if !abs.exists() {
|
|
// File walked away — the watcher's reconciliation pass will
|
|
// remove the orphan exif row eventually.
|
|
continue;
|
|
}
|
|
match content_hash::compute(&abs) {
|
|
Ok(id) => {
|
|
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
|
|
if let Err(e) = dao.backfill_content_hash(
|
|
context,
|
|
library.id,
|
|
rel_path,
|
|
&id.content_hash,
|
|
id.size_bytes,
|
|
) {
|
|
warn!(
|
|
"face_watch: backfill_content_hash failed for {}: {:?}",
|
|
rel_path, e
|
|
);
|
|
errors += 1;
|
|
} else {
|
|
backfilled += 1;
|
|
}
|
|
}
|
|
Err(e) => {
|
|
debug!(
|
|
"face_watch: hash compute failed for {} ({:?})",
|
|
abs.display(),
|
|
e
|
|
);
|
|
errors += 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
if backfilled > 0 || errors > 0 || more_than_cap {
|
|
info!(
|
|
"face_watch: backfill pass for library '{}': hashed {} ({} error(s), {} skipped to other libraries; {} cap, more_remain={})",
|
|
library.name, backfilled, errors, skipped_other_lib, cap, more_than_cap
|
|
);
|
|
}
|
|
backfilled
|
|
}
|
|
|
|
/// Drain image_exif rows whose `date_taken` was never resolved or was
|
|
/// resolved by the weakest fallback (`fs_time`). Runs the canonical-date
|
|
/// waterfall — exiftool batch (one subprocess for the whole tick's
|
|
/// rows) → filename regex → earliest_fs_time — and persists each
|
|
/// resolution with its source tag. Capped per tick by
|
|
/// `DATE_BACKFILL_MAX_PER_TICK` (default 500) so a 14k-row library
|
|
/// drains over a few quick-scan ticks without blocking the watcher.
|
|
///
|
|
/// kamadak-exif is intentionally skipped here: the row already has a
|
|
/// NULL date_taken because the ingest path's kamadak-exif call returned
|
|
/// nothing, and re-running it would just produce the same answer.
|
|
/// exiftool is the meaningful new attempt — it handles videos and
|
|
/// MakerNote-hosted dates kamadak can't reach.
|
|
pub fn backfill_missing_date_taken(
|
|
context: &opentelemetry::Context,
|
|
library: &libraries::Library,
|
|
exif_dao: &Arc<Mutex<Box<dyn ExifDao>>>,
|
|
) -> usize {
|
|
let cap: i64 = dotenv::var("DATE_BACKFILL_MAX_PER_TICK")
|
|
.ok()
|
|
.and_then(|s| s.parse().ok())
|
|
.filter(|n: &i64| *n > 0)
|
|
.unwrap_or(500);
|
|
|
|
let rows: Vec<(i32, String)> = {
|
|
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
|
|
dao.get_rows_needing_date_backfill(context, library.id, cap + 1)
|
|
.unwrap_or_default()
|
|
};
|
|
if rows.is_empty() {
|
|
return 0;
|
|
}
|
|
|
|
let more_than_cap = rows.len() as i64 > cap;
|
|
let base_path = std::path::Path::new(&library.root_path);
|
|
|
|
// Build absolute paths and drop rows whose files no longer exist —
|
|
// the missing-file scan in library_maintenance retires deleted rows
|
|
// separately. Without this filter, NULL-date rows for missing files
|
|
// would loop through the drain forever (no source can resolve them).
|
|
let mut existing: Vec<(String, PathBuf)> = Vec::with_capacity(rows.len());
|
|
for (_, rel_path) in rows.iter().take(cap as usize) {
|
|
let abs = base_path.join(rel_path);
|
|
if abs.exists() {
|
|
existing.push((rel_path.clone(), abs));
|
|
}
|
|
}
|
|
if existing.is_empty() {
|
|
return 0;
|
|
}
|
|
|
|
// One exiftool subprocess for the whole batch; the resolver falls
|
|
// through to filename / fs_time per file when exiftool can't supply
|
|
// a date (or isn't installed at all).
|
|
let paths: Vec<PathBuf> = existing.iter().map(|(_, p)| p.clone()).collect();
|
|
let resolved = date_resolver::resolve_dates_batch(&paths, &HashMap::new());
|
|
|
|
let mut backfilled = 0usize;
|
|
let mut unresolved = 0usize;
|
|
let mut by_source: HashMap<&'static str, usize> = HashMap::new();
|
|
{
|
|
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
|
|
for (rel_path, abs) in &existing {
|
|
let Some(rd) = resolved.get(abs).copied() else {
|
|
unresolved += 1;
|
|
continue;
|
|
};
|
|
match dao.backfill_date_taken(
|
|
context,
|
|
library.id,
|
|
rel_path,
|
|
rd.timestamp,
|
|
rd.source.as_str(),
|
|
) {
|
|
Ok(()) => {
|
|
backfilled += 1;
|
|
*by_source.entry(rd.source.as_str()).or_insert(0) += 1;
|
|
}
|
|
Err(e) => {
|
|
warn!(
|
|
"date_backfill: update failed for lib {} {}: {:?}",
|
|
library.id, rel_path, e
|
|
);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
if backfilled > 0 || unresolved > 0 || more_than_cap {
|
|
info!(
|
|
"date_backfill: library '{}': resolved {} ({:?}), {} unresolved, cap={}, more_remain={}",
|
|
library.name, backfilled, by_source, unresolved, cap, more_than_cap
|
|
);
|
|
}
|
|
backfilled
|
|
}
|
|
|
|
/// Per-tick face-detection drain. Pulls a capped batch of hashed-but-
|
|
/// unscanned image_exif rows directly via the FaceDao anti-join and
|
|
/// hands them to the existing detection pass. Runs on every tick (not
|
|
/// just full scans) so the backlog moves at quick-scan cadence.
|
|
/// Per-tick CLIP encoding drain. Mirrors `process_face_backlog`: pull
|
|
/// up to `CLIP_BACKLOG_MAX_PER_TICK` candidates with a known
|
|
/// `content_hash` but no `clip_embedding`, hand them to
|
|
/// `clip_watch::run_clip_encoding_pass` for parallel fan-out, and let
|
|
/// that module write the result back via `backfill_clip_embedding`.
|
|
///
|
|
/// Idempotent — a row stays in the candidate set until its embedding
|
|
/// lands, so a transient failure (Apollo unreachable, CUDA OOM) just
|
|
/// defers to the next tick. Permanent failures (un-decodable bytes)
|
|
/// retry every tick at this point; future Branch may add a status
|
|
/// column like face_detections has.
|
|
pub fn process_clip_backlog(
|
|
context: &opentelemetry::Context,
|
|
library: &libraries::Library,
|
|
clip_client: &crate::ai::clip_client::ClipClient,
|
|
exif_dao: &Arc<Mutex<Box<dyn ExifDao>>>,
|
|
excluded_dirs: &[String],
|
|
) {
|
|
if !clip_client.is_enabled() {
|
|
return;
|
|
}
|
|
let cap: i64 = dotenv::var("CLIP_BACKLOG_MAX_PER_TICK")
|
|
.ok()
|
|
.and_then(|s| s.parse().ok())
|
|
.filter(|n: &i64| *n > 0)
|
|
.unwrap_or(32);
|
|
|
|
let rows: Vec<(String, String)> = {
|
|
let mut dao = exif_dao.lock().expect("exif dao");
|
|
match dao.list_clip_unencoded_candidates(context, library.id, cap) {
|
|
Ok(r) => r,
|
|
Err(e) => {
|
|
warn!(
|
|
"clip_watch: list_clip_unencoded_candidates failed for library '{}': {:?}",
|
|
library.name, e
|
|
);
|
|
return;
|
|
}
|
|
}
|
|
};
|
|
if rows.is_empty() {
|
|
return;
|
|
}
|
|
|
|
info!(
|
|
"clip_watch: backlog drain — encoding {} candidate(s) for library '{}' (cap={})",
|
|
rows.len(),
|
|
library.name,
|
|
cap
|
|
);
|
|
|
|
let candidates: Vec<crate::clip_watch::ClipCandidate> = rows
|
|
.into_iter()
|
|
.map(
|
|
|(rel_path, content_hash)| crate::clip_watch::ClipCandidate {
|
|
rel_path,
|
|
content_hash,
|
|
},
|
|
)
|
|
.collect();
|
|
|
|
crate::clip_watch::run_clip_encoding_pass(
|
|
library,
|
|
excluded_dirs,
|
|
clip_client,
|
|
Arc::clone(exif_dao),
|
|
candidates,
|
|
);
|
|
}
|
|
|
|
pub fn process_face_backlog(
|
|
context: &opentelemetry::Context,
|
|
library: &libraries::Library,
|
|
face_client: &crate::ai::face_client::FaceClient,
|
|
face_dao: &Arc<Mutex<Box<dyn faces::FaceDao>>>,
|
|
tag_dao: &Arc<Mutex<Box<dyn tags::TagDao>>>,
|
|
excluded_dirs: &[String],
|
|
) {
|
|
let cap: i64 = dotenv::var("FACE_BACKLOG_MAX_PER_TICK")
|
|
.ok()
|
|
.and_then(|s| s.parse().ok())
|
|
.filter(|n: &i64| *n > 0)
|
|
.unwrap_or(64);
|
|
|
|
let rows: Vec<(String, String)> = {
|
|
let mut dao = face_dao.lock().expect("face dao");
|
|
match dao.list_unscanned_candidates(context, library.id, cap) {
|
|
Ok(r) => r,
|
|
Err(e) => {
|
|
warn!(
|
|
"face_watch: list_unscanned_candidates failed for library '{}': {:?}",
|
|
library.name, e
|
|
);
|
|
return;
|
|
}
|
|
}
|
|
};
|
|
if rows.is_empty() {
|
|
return;
|
|
}
|
|
|
|
info!(
|
|
"face_watch: backlog drain — running detection on {} candidate(s) for library '{}' (cap={})",
|
|
rows.len(),
|
|
library.name,
|
|
cap
|
|
);
|
|
|
|
let candidates: Vec<face_watch::FaceCandidate> = rows
|
|
.into_iter()
|
|
.map(|(rel_path, content_hash)| face_watch::FaceCandidate {
|
|
rel_path,
|
|
content_hash,
|
|
})
|
|
.collect();
|
|
|
|
face_watch::run_face_detection_pass(
|
|
library,
|
|
excluded_dirs,
|
|
face_client,
|
|
Arc::clone(face_dao),
|
|
Arc::clone(tag_dao),
|
|
candidates,
|
|
);
|
|
}
|
|
|
|
/// Compute content_hash for any image rows the walker just touched
|
|
/// whose stored EXIF row is still hash-less. Called from
|
|
/// `process_new_files` so freshly-ingested files don't have to wait for
|
|
/// the next standalone `backfill_unhashed_backlog` tick before face
|
|
/// detection can key on their bytes.
|
|
///
|
|
/// Cap is on **successes only**. An earlier version counted errors too,
|
|
/// so a pocket of chronically-unhashable files at the front of the
|
|
/// table (vanished mid-scan, permission denied, etc.) burned the budget
|
|
/// every tick and the rest of the backlog never advanced.
|
|
pub fn backfill_missing_content_hashes(
|
|
context: &opentelemetry::Context,
|
|
files: &[(PathBuf, String)],
|
|
library: &libraries::Library,
|
|
exif_dao: &Arc<Mutex<Box<dyn ExifDao>>>,
|
|
) {
|
|
let image_paths: Vec<String> = files
|
|
.iter()
|
|
.filter(|(p, _)| !file_types::is_video_file(p))
|
|
.map(|(_, rel)| rel.clone())
|
|
.collect();
|
|
if image_paths.is_empty() {
|
|
return;
|
|
}
|
|
|
|
let exif_records = {
|
|
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
|
|
dao.get_exif_batch(context, Some(library.id), &image_paths)
|
|
.unwrap_or_default()
|
|
};
|
|
// Cheap lookup back from rel_path → absolute file_path so
|
|
// content_hash::compute can read the bytes.
|
|
let path_by_rel: HashMap<String, &PathBuf> =
|
|
files.iter().map(|(p, rel)| (rel.clone(), p)).collect();
|
|
|
|
let cap: usize = dotenv::var("FACE_HASH_BACKFILL_MAX_PER_TICK")
|
|
.ok()
|
|
.and_then(|s| s.parse().ok())
|
|
.filter(|n: &usize| *n > 0)
|
|
.unwrap_or(2000);
|
|
|
|
// Count the unhashed backlog up front so we can surface "still needs
|
|
// backfill: N" in the log — without it, a face-scan that's stuck at
|
|
// 44% looks stalled when really it's chipping through hashes.
|
|
let unhashed_total = exif_records
|
|
.iter()
|
|
.filter(|r| r.content_hash.is_none())
|
|
.count();
|
|
|
|
let mut backfilled = 0usize;
|
|
let mut errors = 0usize;
|
|
for record in &exif_records {
|
|
if backfilled >= cap {
|
|
break;
|
|
}
|
|
if record.content_hash.is_some() {
|
|
continue;
|
|
}
|
|
let Some(file_path) = path_by_rel.get(&record.file_path) else {
|
|
// Walked file went missing between the directory scan and now;
|
|
// next tick will retry naturally.
|
|
continue;
|
|
};
|
|
match content_hash::compute(file_path) {
|
|
Ok(id) => {
|
|
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
|
|
if let Err(e) = dao.backfill_content_hash(
|
|
context,
|
|
library.id,
|
|
&record.file_path,
|
|
&id.content_hash,
|
|
id.size_bytes,
|
|
) {
|
|
warn!(
|
|
"face_watch: backfill_content_hash failed for {}: {:?}",
|
|
record.file_path, e
|
|
);
|
|
errors += 1;
|
|
} else {
|
|
backfilled += 1;
|
|
}
|
|
}
|
|
Err(e) => {
|
|
debug!(
|
|
"face_watch: hash compute failed for {} ({:?})",
|
|
file_path.display(),
|
|
e
|
|
);
|
|
errors += 1;
|
|
}
|
|
}
|
|
}
|
|
// Always log when there's an unhashed backlog so an operator
|
|
// looking at "scan stuck at 44%" can see backfill is running and
|
|
// how much remains. Quiet only when there's nothing to do.
|
|
if unhashed_total > 0 || backfilled > 0 || errors > 0 {
|
|
let remaining = unhashed_total.saturating_sub(backfilled);
|
|
info!(
|
|
"face_watch: backfilled {}/{} content_hash for library '{}' ({} error(s); {} still need backfill; cap={})",
|
|
backfilled, unhashed_total, library.name, errors, remaining, cap
|
|
);
|
|
}
|
|
}
|
|
|
|
/// Build the face-detection candidate list for a scan tick.
|
|
///
|
|
/// Returns `(rel_path, content_hash)` for every image file that has a
|
|
/// content_hash recorded in image_exif but no row in face_detections
|
|
/// yet. Re-querying image_exif here picks up rows the EXIF write loop
|
|
/// just inserted alongside any pre-existing rows the watcher walked
|
|
/// over — covers both new uploads and the initial backlog scan.
|
|
pub fn build_face_candidates(
|
|
context: &opentelemetry::Context,
|
|
library: &libraries::Library,
|
|
files: &[(PathBuf, String)],
|
|
exif_dao: &Arc<Mutex<Box<dyn ExifDao>>>,
|
|
face_dao: &Arc<Mutex<Box<dyn faces::FaceDao>>>,
|
|
) -> Vec<face_watch::FaceCandidate> {
|
|
// Restrict to image files; videos aren't face-scanned in v1 (kamadak
|
|
// doesn't even register them in image_exif).
|
|
let image_paths: Vec<String> = files
|
|
.iter()
|
|
.filter(|(p, _)| !file_types::is_video_file(p))
|
|
.map(|(_, rel)| rel.clone())
|
|
.collect();
|
|
if image_paths.is_empty() {
|
|
return Vec::new();
|
|
}
|
|
|
|
let exif_records = {
|
|
let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
|
|
dao.get_exif_batch(context, Some(library.id), &image_paths)
|
|
.unwrap_or_default()
|
|
};
|
|
// rel_path → content_hash (only rows with a hash; without one we have
|
|
// nothing to key face data against).
|
|
let mut hash_by_path: HashMap<String, String> = HashMap::with_capacity(exif_records.len());
|
|
for record in exif_records {
|
|
if let Some(h) = record.content_hash {
|
|
hash_by_path.insert(record.file_path, h);
|
|
}
|
|
}
|
|
|
|
let mut candidates = Vec::new();
|
|
let mut dao = face_dao.lock().expect("face dao");
|
|
for rel_path in image_paths {
|
|
let Some(hash) = hash_by_path.get(&rel_path) else {
|
|
continue;
|
|
};
|
|
match dao.already_scanned(context, hash) {
|
|
Ok(true) => continue,
|
|
Ok(false) => candidates.push(face_watch::FaceCandidate {
|
|
rel_path,
|
|
content_hash: hash.clone(),
|
|
}),
|
|
Err(e) => {
|
|
warn!("face_watch: already_scanned errored for {}: {:?}", hash, e);
|
|
}
|
|
}
|
|
}
|
|
candidates
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    use std::ffi::OsString;
    use std::fs;
    use std::sync::{Arc, Mutex, MutexGuard};

    use diesel::prelude::*;
    use tempfile::TempDir;

    use crate::database::models::{InsertImageExif, InsertLibrary};
    use crate::database::test::in_memory_db_connection;
    use crate::database::{ExifDao, SqliteExifDao, schema};
    use crate::faces::{FaceDao, SqliteFaceDao};
    use crate::libraries::Library;

    /// Env var read by `backfill_unhashed_backlog` and
    /// `backfill_missing_content_hashes` on every call.
    const HASH_CAP_VAR: &str = "FACE_HASH_BACKFILL_MAX_PER_TICK";

    /// Serializes the tests in this module that depend on `HASH_CAP_VAR`.
    /// The environment is process-global and the test harness runs tests
    /// on several threads, so an override in one test would otherwise be
    /// visible to the others mid-run.
    static HASH_CAP_ENV_LOCK: Mutex<()> = Mutex::new(());

    /// Take the env lock. A poisoned lock (an earlier test panicked while
    /// holding it) is still usable: the guarded data is `()`.
    fn serialize_hash_cap_env() -> MutexGuard<'static, ()> {
        HASH_CAP_ENV_LOCK
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner())
    }

    /// Scoped override of `HASH_CAP_VAR`. Holds the env lock for its whole
    /// lifetime and puts the previous value back on drop, so a failing
    /// assertion cannot leak the override into other tests.
    struct HashCapOverride {
        previous: Option<OsString>,
        _serial: MutexGuard<'static, ()>,
    }

    impl HashCapOverride {
        fn set(value: &str) -> Self {
            let serial = serialize_hash_cap_env();
            let previous = std::env::var_os(HASH_CAP_VAR);
            // SAFETY: the env lock is held, so no other test in this module
            // reads or writes the variable concurrently.
            // NOTE(review): tests elsewhere in the same binary that touch
            // the environment are not covered by this lock.
            unsafe {
                std::env::set_var(HASH_CAP_VAR, value);
            }
            Self {
                previous,
                _serial: serial,
            }
        }
    }

    impl Drop for HashCapOverride {
        fn drop(&mut self) {
            // SAFETY: `_serial` is dropped after this body runs, so the env
            // lock is still held while the variable is restored.
            unsafe {
                match self.previous.take() {
                    Some(value) => std::env::set_var(HASH_CAP_VAR, value),
                    None => std::env::remove_var(HASH_CAP_VAR),
                }
            }
        }
    }

    fn ctx() -> opentelemetry::Context {
        opentelemetry::Context::new()
    }

    /// Build a tempdir-backed library + DAOs sharing a single in-memory
    /// SQLite connection (so cross-table joins like
    /// `list_unscanned_candidates` see consistent state).
    fn setup() -> (
        TempDir,
        Library,
        Arc<Mutex<diesel::SqliteConnection>>,
        Arc<Mutex<Box<dyn ExifDao>>>,
        Arc<Mutex<Box<dyn FaceDao>>>,
    ) {
        let tmp = TempDir::new().expect("tempdir");
        let mut conn = in_memory_db_connection();
        // Migration seeds library id=1 with a placeholder root; rewrite it
        // to point at the tempdir so `<root>/<rel_path>` resolves to real
        // files this test creates.
        diesel::update(schema::libraries::table.filter(schema::libraries::id.eq(1)))
            .set(schema::libraries::root_path.eq(tmp.path().to_string_lossy().to_string()))
            .execute(&mut conn)
            .expect("rewrite library 1 root");
        // Add a second library so cross-library skip cases have somewhere
        // to put their rows.
        diesel::insert_into(schema::libraries::table)
            .values(InsertLibrary {
                name: "other",
                root_path: "/tmp/other-test-lib",
                created_at: 0,
                enabled: true,
                excluded_dirs: None,
            })
            .execute(&mut conn)
            .expect("seed second library");

        let library = Library {
            id: 1,
            name: "main".to_string(),
            root_path: tmp.path().to_string_lossy().to_string(),
            enabled: true,
            excluded_dirs: Vec::new(),
        };
        let shared = Arc::new(Mutex::new(conn));
        let exif_dao: Arc<Mutex<Box<dyn ExifDao>>> = Arc::new(Mutex::new(Box::new(
            SqliteExifDao::from_shared(Arc::clone(&shared)),
        )));
        let face_dao: Arc<Mutex<Box<dyn FaceDao>>> = Arc::new(Mutex::new(Box::new(
            SqliteFaceDao::from_connection(Arc::clone(&shared)),
        )));
        (tmp, library, shared, exif_dao, face_dao)
    }

    /// Insert a bare `image_exif` row for `rel` in library `lib_id`, with
    /// every optional column left empty except the given `content_hash`.
    fn insert_exif(
        exif_dao: &Arc<Mutex<Box<dyn ExifDao>>>,
        lib_id: i32,
        rel: &str,
        content_hash: Option<&str>,
    ) {
        let mut dao = exif_dao.lock().unwrap();
        dao.store_exif(
            &ctx(),
            InsertImageExif {
                library_id: lib_id,
                file_path: rel.to_string(),
                camera_make: None,
                camera_model: None,
                lens_model: None,
                width: None,
                height: None,
                orientation: None,
                gps_latitude: None,
                gps_longitude: None,
                gps_altitude: None,
                focal_length: None,
                aperture: None,
                shutter_speed: None,
                iso: None,
                date_taken: None,
                created_time: 0,
                last_modified: 0,
                content_hash: content_hash.map(|s| s.to_string()),
                size_bytes: None,
                phash_64: None,
                dhash_64: None,
                date_taken_source: None,
            },
        )
        .expect("insert");
    }

    /// Write `bytes` to `<root>/<rel>`, creating parent directories first.
    fn write_image(root: &std::path::Path, rel: &str, bytes: &[u8]) {
        let abs = root.join(rel);
        if let Some(parent) = abs.parent() {
            fs::create_dir_all(parent).expect("mkdir");
        }
        fs::write(abs, bytes).expect("write file");
    }

    #[test]
    fn backfill_unhashed_backlog_hashes_missing_rows_in_this_library() {
        // Reads HASH_CAP_VAR; keep the cap test's override out of this run.
        let _env = serialize_hash_cap_env();
        let (tmp, library, _conn, exif_dao, _face_dao) = setup();
        write_image(tmp.path(), "a.jpg", b"alpha-bytes");
        write_image(tmp.path(), "b.jpg", b"bravo-bytes");
        insert_exif(&exif_dao, 1, "a.jpg", None);
        insert_exif(&exif_dao, 1, "b.jpg", None);

        let backfilled = backfill_unhashed_backlog(&ctx(), &library, &exif_dao);
        assert_eq!(backfilled, 2);

        let mut dao = exif_dao.lock().unwrap();
        let rows = dao
            .get_exif_batch(&ctx(), Some(1), &["a.jpg".to_string(), "b.jpg".to_string()])
            .unwrap();
        assert_eq!(rows.len(), 2);
        for r in rows {
            assert!(
                r.content_hash.is_some(),
                "row {} should have a hash",
                r.file_path
            );
        }
    }

    #[test]
    fn backfill_unhashed_backlog_skips_other_libraries_and_missing_files() {
        // Reads HASH_CAP_VAR; keep the cap test's override out of this run.
        let _env = serialize_hash_cap_env();
        let (tmp, library, _conn, exif_dao, _face_dao) = setup();
        write_image(tmp.path(), "exists.jpg", b"hello");
        // Row for this library whose file is missing on disk:
        insert_exif(&exif_dao, 1, "ghost.jpg", None);
        insert_exif(&exif_dao, 1, "exists.jpg", None);
        // Row in the other library — must be skipped (different lib_id).
        insert_exif(&exif_dao, 2, "other.jpg", None);

        let backfilled = backfill_unhashed_backlog(&ctx(), &library, &exif_dao);
        assert_eq!(backfilled, 1, "only the existing in-library file hashes");

        let mut dao = exif_dao.lock().unwrap();
        let other = dao
            .get_exif_batch(&ctx(), Some(2), &["other.jpg".to_string()])
            .unwrap();
        assert_eq!(other.len(), 1);
        assert!(
            other[0].content_hash.is_none(),
            "other-library row must remain unhashed"
        );
        let ghost = dao
            .get_exif_batch(&ctx(), Some(1), &["ghost.jpg".to_string()])
            .unwrap();
        assert_eq!(ghost.len(), 1);
        assert!(
            ghost[0].content_hash.is_none(),
            "missing-on-disk row stays unhashed (reconciliation removes it later)"
        );
    }

    #[test]
    fn backfill_unhashed_backlog_respects_per_tick_cap() {
        // Env-var-driven cap; the function reads it on every call, so the
        // override only needs to live for this test. The guard holds the
        // env lock and restores the previous value when it drops — also
        // when the assertion below panics.
        let _cap = HashCapOverride::set("2");
        let (tmp, library, _conn, exif_dao, _face_dao) = setup();
        for i in 0..5 {
            let rel = format!("img_{}.jpg", i);
            write_image(tmp.path(), &rel, format!("bytes-{}", i).as_bytes());
            insert_exif(&exif_dao, 1, &rel, None);
        }

        let backfilled = backfill_unhashed_backlog(&ctx(), &library, &exif_dao);
        assert_eq!(backfilled, 2, "cap=2 must bound the per-tick successes");
    }

    #[test]
    fn backfill_missing_content_hashes_skips_videos_and_hashed_rows() {
        // Reads HASH_CAP_VAR; keep the cap test's override out of this run.
        let _env = serialize_hash_cap_env();
        let (tmp, library, _conn, exif_dao, _face_dao) = setup();
        // Two image rows (one already hashed, one not), one video.
        write_image(tmp.path(), "fresh.jpg", b"fresh-pixels");
        write_image(tmp.path(), "already.jpg", b"already-pixels");
        write_image(tmp.path(), "clip.mp4", b"video-bytes");
        insert_exif(&exif_dao, 1, "fresh.jpg", None);
        insert_exif(&exif_dao, 1, "already.jpg", Some("pre-existing-hash"));
        insert_exif(&exif_dao, 1, "clip.mp4", None);

        let files: Vec<(PathBuf, String)> = vec![
            (tmp.path().join("fresh.jpg"), "fresh.jpg".to_string()),
            (tmp.path().join("already.jpg"), "already.jpg".to_string()),
            (tmp.path().join("clip.mp4"), "clip.mp4".to_string()),
        ];
        backfill_missing_content_hashes(&ctx(), &files, &library, &exif_dao);

        let mut dao = exif_dao.lock().unwrap();
        let rows = dao
            .get_exif_batch(
                &ctx(),
                Some(1),
                &[
                    "fresh.jpg".to_string(),
                    "already.jpg".to_string(),
                    "clip.mp4".to_string(),
                ],
            )
            .unwrap();
        let by_path: HashMap<String, Option<String>> = rows
            .into_iter()
            .map(|r| (r.file_path, r.content_hash))
            .collect();
        assert!(
            by_path["fresh.jpg"].is_some(),
            "fresh image must get a hash"
        );
        assert_eq!(
            by_path["already.jpg"].as_deref(),
            Some("pre-existing-hash"),
            "already-hashed image left untouched"
        );
        assert!(
            by_path["clip.mp4"].is_none(),
            "video skipped (not face-scanned, no hash needed via this path)"
        );
    }

    #[test]
    fn build_face_candidates_filters_videos_unhashed_and_already_scanned() {
        let (tmp, library, _conn, exif_dao, face_dao) = setup();

        // Seed image_exif with: hashed unscanned, hashed scanned, unhashed,
        // and a video. Files don't need to exist on disk — the function
        // doesn't read them, only the DB rows.
        insert_exif(&exif_dao, 1, "fresh.jpg", Some("hash-fresh"));
        insert_exif(&exif_dao, 1, "scanned.jpg", Some("hash-scanned"));
        insert_exif(&exif_dao, 1, "unhashed.jpg", None);
        insert_exif(&exif_dao, 1, "clip.mp4", Some("hash-video"));
        // Mark `scanned.jpg`'s hash as already detected.
        {
            let mut dao = face_dao.lock().unwrap();
            dao.mark_status(&ctx(), 1, "hash-scanned", "scanned.jpg", "no_faces", "test")
                .expect("mark scanned");
        }

        let files: Vec<(PathBuf, String)> = vec![
            (tmp.path().join("fresh.jpg"), "fresh.jpg".to_string()),
            (tmp.path().join("scanned.jpg"), "scanned.jpg".to_string()),
            (tmp.path().join("unhashed.jpg"), "unhashed.jpg".to_string()),
            (tmp.path().join("clip.mp4"), "clip.mp4".to_string()),
        ];
        let candidates = build_face_candidates(&ctx(), &library, &files, &exif_dao, &face_dao);

        assert_eq!(
            candidates.len(),
            1,
            "exactly fresh.jpg should be a candidate"
        );
        assert_eq!(candidates[0].rel_path, "fresh.jpg");
        assert_eq!(candidates[0].content_hash, "hash-fresh");
    }
}
|