duplicates: perceptual hash + soft-mark resolution + upload 409
Adds pHash + dHash columns alongside the existing blake3 content_hash so
near-duplicates (re-encoded, resized, format-converted copies) become
queryable. /duplicates/{exact,perceptual} return groups; /duplicates/
{resolve,unresolve} flip a duplicate_of_hash soft-mark on losing rows
and union perceptual-only tag sets onto the survivor. The default
/photos listing filters duplicate_of_hash IS NULL so demoted siblings
stop cluttering the grid; include_duplicates=true opts back in for
Apollo's review modal. Upload now hashes bytes pre-write and returns
409 with the canonical sibling when a file's bytes already exist.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
221
src/bin/backfill_perceptual_hash.rs
Normal file
221
src/bin/backfill_perceptual_hash.rs
Normal file
@@ -0,0 +1,221 @@
|
||||
//! Backfill `image_exif.phash_64` + `dhash_64` for image rows that
|
||||
//! were ingested before perceptual hashing was wired into the watcher.
|
||||
//!
|
||||
//! The watcher computes perceptual hashes for new images as they're
|
||||
//! ingested, so this binary is a one-shot for the historical backlog.
|
||||
//! Idempotent — only rows with a non-null content_hash and a null
|
||||
//! phash are processed, so re-runs are safe and pick up where they
|
||||
//! left off (e.g. after a crash or interrupt).
|
||||
//!
|
||||
//! Image-only by design: `get_rows_missing_perceptual_hash` filters by
|
||||
//! file extension at the DB layer so videos and other non-decodable
|
||||
//! media are skipped without round-tripping `image_hasher`. Files that
|
||||
//! can't be opened (missing on disk, permission errors) are quietly
|
||||
//! left as null and counted as "missing"; on next run, if the file is
|
||||
//! restored, the row will surface again.
|
||||
|
||||
use std::path::Path;
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::Instant;
|
||||
|
||||
use clap::Parser;
|
||||
use log::{error, warn};
|
||||
use rayon::prelude::*;
|
||||
|
||||
use image_api::bin_progress;
|
||||
use image_api::database::{ExifDao, SqliteExifDao, connect};
|
||||
use image_api::libraries::{self, Library};
|
||||
use image_api::perceptual_hash;
|
||||
|
||||
#[derive(Parser, Debug)]
|
||||
#[command(name = "backfill_perceptual_hash")]
|
||||
#[command(about = "Compute pHash + dHash for image_exif rows missing one")]
|
||||
struct Args {
|
||||
/// Max rows to hash per batch. The process loops until no rows remain.
|
||||
#[arg(long, default_value_t = 256)]
|
||||
batch_size: i64,
|
||||
|
||||
/// Rayon parallelism override. 0 uses the default thread pool size.
|
||||
#[arg(long, default_value_t = 0)]
|
||||
parallelism: usize,
|
||||
|
||||
/// Dry-run: log what would be hashed without writing to the DB.
|
||||
#[arg(long)]
|
||||
dry_run: bool,
|
||||
}
|
||||
|
||||
fn main() -> anyhow::Result<()> {
|
||||
env_logger::init();
|
||||
dotenv::dotenv().ok();
|
||||
|
||||
let args = Args::parse();
|
||||
if args.parallelism > 0 {
|
||||
rayon::ThreadPoolBuilder::new()
|
||||
.num_threads(args.parallelism)
|
||||
.build_global()
|
||||
.expect("Unable to configure rayon thread pool");
|
||||
}
|
||||
|
||||
let base_path = dotenv::var("BASE_PATH").ok();
|
||||
let mut seed_conn = connect();
|
||||
if let Some(base) = base_path.as_deref() {
|
||||
libraries::seed_or_patch_from_env(&mut seed_conn, base);
|
||||
}
|
||||
let libs = libraries::load_all(&mut seed_conn);
|
||||
drop(seed_conn);
|
||||
if libs.is_empty() {
|
||||
anyhow::bail!("No libraries configured; cannot backfill perceptual hashes");
|
||||
}
|
||||
let libs_by_id: std::collections::HashMap<i32, Library> =
|
||||
libs.into_iter().map(|lib| (lib.id, lib)).collect();
|
||||
println!(
|
||||
"Configured libraries: {}",
|
||||
libs_by_id
|
||||
.values()
|
||||
.map(|l| format!("{} -> {}", l.name, l.root_path))
|
||||
.collect::<Vec<_>>()
|
||||
.join(", ")
|
||||
);
|
||||
|
||||
let dao: Arc<Mutex<Box<dyn ExifDao>>> = Arc::new(Mutex::new(Box::new(SqliteExifDao::new())));
|
||||
let ctx = opentelemetry::Context::new();
|
||||
|
||||
let mut total_hashed = 0u64;
|
||||
let mut total_missing = 0u64;
|
||||
let mut total_decode_failures = 0u64;
|
||||
let mut total_errors = 0u64;
|
||||
let start = Instant::now();
|
||||
|
||||
let pb = bin_progress::spinner("perceptual-hashing");
|
||||
|
||||
loop {
|
||||
let rows = {
|
||||
let mut guard = dao.lock().expect("Unable to lock ExifDao");
|
||||
guard
|
||||
.get_rows_missing_perceptual_hash(&ctx, args.batch_size)
|
||||
.map_err(|e| anyhow::anyhow!("DB error: {:?}", e))?
|
||||
};
|
||||
if rows.is_empty() {
|
||||
break;
|
||||
}
|
||||
let batch_size = rows.len();
|
||||
pb.set_message(format!(
|
||||
"batch of {} (hashed={} decode_fail={} missing={} errors={})",
|
||||
batch_size, total_hashed, total_decode_failures, total_missing, total_errors
|
||||
));
|
||||
|
||||
// Compute perceptual hashes in parallel — CPU-bound, decoder
|
||||
// releases the GIL-equivalent. rayon's default thread pool
|
||||
// matches the host's logical-core count which is the right
|
||||
// ceiling for image_hasher's DCT pass.
|
||||
let results: Vec<(
|
||||
i32,
|
||||
String,
|
||||
FilePerceptualResult,
|
||||
)> = rows
|
||||
.into_par_iter()
|
||||
.map(|(library_id, rel_path)| {
|
||||
let abs = libs_by_id
|
||||
.get(&library_id)
|
||||
.map(|lib| Path::new(&lib.root_path).join(&rel_path));
|
||||
match abs {
|
||||
Some(abs_path) if abs_path.exists() => {
|
||||
match perceptual_hash::compute(&abs_path) {
|
||||
Some(id) => (library_id, rel_path, FilePerceptualResult::Ok(id)),
|
||||
None => (library_id, rel_path, FilePerceptualResult::DecodeFailed),
|
||||
}
|
||||
}
|
||||
Some(_) => (library_id, rel_path, FilePerceptualResult::MissingOnDisk),
|
||||
None => {
|
||||
warn!("Row refers to unknown library_id {}", library_id);
|
||||
(library_id, rel_path, FilePerceptualResult::MissingOnDisk)
|
||||
}
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Persist sequentially — SQLite writes serialize anyway.
|
||||
if !args.dry_run {
|
||||
let mut guard = dao.lock().expect("Unable to lock ExifDao");
|
||||
for (library_id, rel_path, result) in &results {
|
||||
match result {
|
||||
FilePerceptualResult::Ok(id) => {
|
||||
match guard.backfill_perceptual_hash(
|
||||
&ctx,
|
||||
*library_id,
|
||||
rel_path,
|
||||
Some(id.phash_64),
|
||||
Some(id.dhash_64),
|
||||
) {
|
||||
Ok(_) => {
|
||||
total_hashed += 1;
|
||||
pb.inc(1);
|
||||
}
|
||||
Err(e) => {
|
||||
pb.println(format!("persist error for {}: {:?}", rel_path, e));
|
||||
total_errors += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
FilePerceptualResult::DecodeFailed => {
|
||||
// Mark as "we tried" so the next run doesn't keep
|
||||
// hammering this file. We persist NULL/NULL —
|
||||
// unfortunately that leaves it eligible for the
|
||||
// next run. The honest fix is a separate "perceptual
|
||||
// hash attempted" timestamp; for now we accept the
|
||||
// re-attempt cost since decode-failure rate is low.
|
||||
total_decode_failures += 1;
|
||||
}
|
||||
FilePerceptualResult::MissingOnDisk => {
|
||||
total_missing += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
for (_, rel_path, result) in &results {
|
||||
match result {
|
||||
FilePerceptualResult::Ok(id) => {
|
||||
pb.println(format!(
|
||||
"[dry-run] {} -> phash={:016x} dhash={:016x}",
|
||||
rel_path, id.phash_64, id.dhash_64
|
||||
));
|
||||
total_hashed += 1;
|
||||
pb.inc(1);
|
||||
}
|
||||
FilePerceptualResult::DecodeFailed => {
|
||||
total_decode_failures += 1;
|
||||
}
|
||||
FilePerceptualResult::MissingOnDisk => {
|
||||
total_missing += 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
pb.println(format!(
|
||||
"[dry-run] processed one batch of {}. Stopping — a real run would continue \
|
||||
until no NULL phash_64 image rows remain.",
|
||||
results.len()
|
||||
));
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
pb.finish_and_clear();
|
||||
println!(
|
||||
"Done. hashed={}, decode_failed={}, skipped (missing on disk)={}, errors={}, elapsed={:.1}s",
|
||||
total_hashed,
|
||||
total_decode_failures,
|
||||
total_missing,
|
||||
total_errors,
|
||||
start.elapsed().as_secs_f64()
|
||||
);
|
||||
if total_errors > 0 {
|
||||
error!("Backfill completed with {} persist errors", total_errors);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
enum FilePerceptualResult {
|
||||
Ok(perceptual_hash::PerceptualIdentity),
|
||||
DecodeFailed,
|
||||
MissingOnDisk,
|
||||
}
|
||||
Reference in New Issue
Block a user