From 67cf0c7f73e63ab628ab4d6848b1a88b40272275 Mon Sep 17 00:00:00 2001
From: Cameron Cordes
Date: Wed, 6 May 2026 12:43:29 -0400
Subject: [PATCH] duplicates: folder-pair view of exact dups
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bucket exact-dup rows by (library_id, dirname) pair on each side, then
filter by coverage = shared / min(folder_a_total, folder_b_total) and
an absolute floor on shared count. Surfaces "this folder is mostly
contained in that folder" matches that the per-file EXACT view buries
under one row each — e.g. an old phone-backup tree shadowing the
organized library, or a topic-grouped folder duplicating a
date-grouped one within the same library.

New endpoint: GET /duplicates/folder-pairs?library=&include_resolved=
&min_coverage=&min_shared=. Cached 5 min keyed on (library,
include_resolved); the user-tunable thresholds filter the cached
unfiltered pair list so slider drags don't re-bucket. Shares the
resolve / unresolve flow with the existing tabs — the frontend fans
out N parallel /resolve calls, one per shared content_hash.

Folder names carry no signal (BMW lives under Night Photos, not
BMW_backup), so bucketing is purely on (library_id, dirname)
co-occurrence in exact-dup groups. Within-folder dups (same hash twice
in the same folder) are skipped — those belong to the EXACT tab.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/database/mod.rs |  39 ++++
 src/duplicates.rs   | 475 ++++++++++++++++++++++++++++++++++++++++++++
 src/files.rs        |   9 +
 3 files changed, 523 insertions(+)

diff --git a/src/database/mod.rs b/src/database/mod.rs
index 696ffbc..509315d 100644
--- a/src/database/mod.rs
+++ b/src/database/mod.rs
@@ -446,6 +446,18 @@ pub trait ExifDao: Sync + Send {
         include_resolved: bool,
     ) -> Result<Vec<DuplicateRow>, DbError>;
 
+    /// Lightweight `(library_id, rel_path)` listing for every hashed
+    /// image_exif row, used to compute per-folder file totals for the
+    /// folder-pair duplicate view. Filters mirror `list_duplicates_exact`
+    /// so the denominator (folder population) and numerator (shared
+    /// dups between two folders) come from the same row population.
+    fn list_image_paths(
+        &mut self,
+        context: &opentelemetry::Context,
+        library_id: Option<i32>,
+        include_resolved: bool,
+    ) -> Result<Vec<(i32, String)>, DbError>;
+
     /// Look up a single row's metadata by `(library_id, rel_path)`. Used
     /// by the resolve endpoint to map the request payload to the
     /// underlying `content_hash` before writing the soft-mark. Returns
@@ -1585,6 +1597,33 @@ impl ExifDao for SqliteExifDao {
             .map_err(|_| DbError::new(DbErrorKind::QueryError))
     }
 
+    fn list_image_paths(
+        &mut self,
+        context: &opentelemetry::Context,
+        library_id_filter: Option<i32>,
+        include_resolved: bool,
+    ) -> Result<Vec<(i32, String)>, DbError> {
+        trace_db_call(context, "query", "list_image_paths", |_span| {
+            use schema::image_exif::dsl::*;
+
+            let mut connection = self.connection.lock().expect("Unable to get ExifDao");
+
+            let mut q = image_exif
+                .filter(content_hash.is_not_null())
+                .select((library_id, rel_path))
+                .into_boxed();
+            if let Some(lib) = library_id_filter {
+                q = q.filter(library_id.eq(lib));
+            }
+            if !include_resolved {
+                q = q.filter(duplicate_of_hash.is_null());
+            }
+            q.load::<(i32, String)>(connection.deref_mut())
+                .map_err(|_| anyhow::anyhow!("Query error"))
+        })
+        .map_err(|_| DbError::new(DbErrorKind::QueryError))
+    }
+
     fn lookup_duplicate_row(
         &mut self,
         context: &opentelemetry::Context,
diff --git a/src/duplicates.rs b/src/duplicates.rs
index d7b6921..372415b 100644
--- a/src/duplicates.rs
+++ b/src/duplicates.rs
@@ -35,6 +35,7 @@ use crate::state::AppState;
 // ── Cache ────────────────────────────────────────────────────────────────
 
 const PERCEPTUAL_CACHE_TTL: Duration = Duration::from_secs(300);
+const FOLDER_PAIR_CACHE_TTL: Duration = Duration::from_secs(300);
 
 #[derive(Clone)]
 struct PerceptualCacheEntry {
@@ -48,8 +49,22 @@ struct PerceptualCacheEntry {
     groups: Vec<DuplicateGroup>,
 }
 
+#[derive(Clone)]
+struct FolderPairCacheEntry {
+    library_id: Option<i32>,
+    include_resolved: bool,
+    computed_at: Instant,
+    /// Pre-filter cache: every detected folder pair, regardless of
+    /// min_coverage / min_shared. The handler applies the user-supplied
+    /// thresholds on top so a slider drag is a memcpy + a filter, not a
+    /// re-bucket. Bucketing dominates the cost (O(N²) over members of
+    /// big exact-dup groups), so this is the right cache boundary.
+    pairs: Vec<FolderPair>,
+}
+
 lazy_static! {
     static ref PERCEPTUAL_CACHE: Mutex<Option<PerceptualCacheEntry>> = Mutex::new(None);
+    static ref FOLDER_PAIR_CACHE: Mutex<Option<FolderPairCacheEntry>> = Mutex::new(None);
 }
 
 /// Drop the perceptual-cluster cache. Called from `resolve`/`unresolve`
@@ -60,6 +75,15 @@ fn invalidate_perceptual_cache() {
     }
 }
 
+/// Drop the folder-pair cache. Same rationale as
+/// `invalidate_perceptual_cache`: a resolve mutates which rows
+/// participate, so the next folder-pair fetch must re-bucket.
+fn invalidate_folder_pair_cache() {
+    if let Ok(mut guard) = FOLDER_PAIR_CACHE.lock() {
+        *guard = None;
+    }
+}
+
 // ── Wire shapes ──────────────────────────────────────────────────────────
 
 #[derive(Serialize, Debug, Clone)]
@@ -144,6 +168,59 @@ pub struct UnresolveDuplicateReq {
     pub rel_path: String,
 }
 
+#[derive(Deserialize, Debug)]
+pub struct ListFolderPairsQuery {
+    pub library: Option<String>,
+    #[serde(default)]
+    pub include_resolved: Option<bool>,
+    /// Coverage floor: `shared / min(side_a.total, side_b.total)`.
+    /// Default 0.5 — surfaces "at least half of the smaller folder is
+    /// duplicated in the other folder", which is the threshold above
+    /// which "demote one whole side" feels safe.
+    #[serde(default)]
+    pub min_coverage: Option<f32>,
+    /// Absolute floor on shared-file count. Default 3 — anything less
+    /// is incidental noise (e.g. two folders that happen to share a
+    /// stock background image).
+    #[serde(default)]
+    pub min_shared: Option<u32>,
+}
+
+#[derive(Serialize, Debug, Clone)]
+pub struct FolderEndpoint {
+    pub library_id: i32,
+    /// Folder path relative to library root, e.g. `Cars/BMW`. Empty
+    /// string for files at the library root (no leading slash).
+    pub folder: String,
+    /// Total count of `image_exif` rows in this folder, applied with
+    /// the same `include_resolved` filter as the dup query so the
+    /// numerator and denominator come from the same population.
+    pub total_files: i64,
+}
+
+#[derive(Serialize, Debug, Clone)]
+pub struct FolderPairFile {
+    pub content_hash: String,
+    pub a_rel_path: String,
+    pub b_rel_path: String,
+    pub size_bytes: Option<i64>,
+    pub date_taken: Option<i64>,
+    pub width: Option<i32>,
+    pub height: Option<i32>,
+}
+
+#[derive(Serialize, Debug, Clone)]
+pub struct FolderPair {
+    pub side_a: FolderEndpoint,
+    pub side_b: FolderEndpoint,
+    pub shared_count: i64,
+    /// `shared_count / min(side_a.total_files, side_b.total_files)`,
+    /// in `[0.0, 1.0]`. A coverage of 1.0 means the smaller folder is
+    /// fully contained in the other.
+    pub coverage: f32,
+    pub shared_files: Vec<FolderPairFile>,
+}
+
 // ── Handlers ─────────────────────────────────────────────────────────────
 
 async fn list_exact_handler(
@@ -334,6 +411,7 @@ async fn resolve_handler(
     drop(dao);
 
     invalidate_perceptual_cache();
+    invalidate_folder_pair_cache();
 
     HttpResponse::Ok().json(ResolveResponse { resolved_count })
 }
@@ -355,10 +433,73 @@ async fn unresolve_handler(
     drop(dao);
 
     invalidate_perceptual_cache();
+    invalidate_folder_pair_cache();
 
     HttpResponse::Ok().finish()
 }
 
+async fn list_folder_pairs_handler(
+    _: Claims,
+    request: HttpRequest,
+    app_state: web::Data<AppState>,
+    query: web::Query<ListFolderPairsQuery>,
+    exif_dao: web::Data<Mutex<Box<dyn ExifDao>>>,
+) -> impl Responder {
+    let context = extract_context_from_request(&request);
+    let span = global_tracer().start_with_context("duplicates.list_folder_pairs", &context);
+    let span_context = opentelemetry::Context::current_with_span(span);
+
+    let library_id = libraries::resolve_library_param(&app_state, query.library.as_deref())
+        .ok()
+        .flatten()
+        .map(|l| l.id);
+    let include_resolved = query.include_resolved.unwrap_or(false);
+    let min_coverage = query.min_coverage.unwrap_or(0.5).clamp(0.0, 1.0);
+    let min_shared = query.min_shared.unwrap_or(3).max(1);
+
+    // Cache hit on the (library, include_resolved) tuple — coverage /
+    // min_shared are user-tunable filters applied AFTER bucketing, so
+    // the cache stores the unfiltered pair list.
+    if let Ok(guard) = FOLDER_PAIR_CACHE.lock()
+        && let Some(entry) = guard.as_ref()
+        && entry.library_id == library_id
+        && entry.include_resolved == include_resolved
+        && entry.computed_at.elapsed() < FOLDER_PAIR_CACHE_TTL
+    {
+        let filtered = filter_folder_pairs(entry.pairs.clone(), min_coverage, min_shared);
+        return HttpResponse::Ok().json(FolderPairsResponse { pairs: filtered });
+    }
+
+    let (dup_rows, all_paths) = {
+        let mut dao = exif_dao.lock().expect("exif dao lock");
+        let dup_rows = match dao.list_duplicates_exact(&span_context, library_id, include_resolved)
+        {
+            Ok(rows) => rows,
+            Err(e) => return HttpResponse::InternalServerError().body(format!("{:?}", e)),
+        };
+        let all_paths = match dao.list_image_paths(&span_context, library_id, include_resolved) {
+            Ok(rows) => rows,
+            Err(e) => return HttpResponse::InternalServerError().body(format!("{:?}", e)),
+        };
+        (dup_rows, all_paths)
+    };
+
+    let totals = folder_totals(&all_paths);
+    let pairs = bucket_folder_pairs(dup_rows, &totals);
+
+    if let Ok(mut guard) = FOLDER_PAIR_CACHE.lock() {
+        *guard = Some(FolderPairCacheEntry {
+            library_id,
+            include_resolved,
+            computed_at: Instant::now(),
+            pairs: pairs.clone(),
+        });
+    }
+
+    let filtered = filter_folder_pairs(pairs, min_coverage, min_shared);
+    HttpResponse::Ok().json(FolderPairsResponse { pairs: filtered })
+}
+
 // ── Grouping / clustering ────────────────────────────────────────────────
 
 #[derive(Serialize, Debug)]
@@ -366,6 +507,188 @@ struct GroupsResponse {
     groups: Vec<DuplicateGroup>,
 }
 
+#[derive(Serialize, Debug)]
+struct FolderPairsResponse {
+    pairs: Vec<FolderPair>,
+}
+
+/// Folder portion of `rel_path`: everything up to (and excluding) the
+/// last `/`. Returns an empty string for top-level files. Library-root
+/// agnostic — the rel_path is already relative to the library root.
+fn folder_dir(rel_path: &str) -> &str {
+    match rel_path.rfind('/') {
+        Some(i) => &rel_path[..i],
+        None => "",
+    }
+}
+
+/// Per-folder file totals from a flat `(library_id, rel_path)` listing.
+/// Used as the denominator for the coverage metric. We count every
+/// hashed image_exif row that matches the dup query's filter, so the
+/// numerator (shared dups) and denominator (folder population) come
+/// from the same population.
+fn folder_totals(rows: &[(i32, String)]) -> HashMap<(i32, String), i64> {
+    let mut out: HashMap<(i32, String), i64> = HashMap::new();
+    for (lib, rel_path) in rows {
+        let dir = folder_dir(rel_path).to_string();
+        *out.entry((*lib, dir)).or_insert(0) += 1;
+    }
+    out
+}
+
+/// Canonical ordering for a folder pair: lexicographic on
+/// `(library_id, folder)`. Ensures we bucket `(F1, F2)` and `(F2, F1)`
+/// onto the same key regardless of which member of an exact-dup group
+/// we encounter first.
+fn canonical_pair<'a>(
+    a: &'a (i32, String),
+    b: &'a (i32, String),
+) -> (&'a (i32, String), &'a (i32, String)) {
+    if a <= b { (a, b) } else { (b, a) }
+}
+
+/// Bucket exact-dup rows into folder-pair edges. For each exact-dup
+/// group, pick one representative member per (library, folder) tuple
+/// (lex-smallest rel_path) so within-folder duplicates collapse to one
+/// edge endpoint each — those are an EXACT-tab concern, not a
+/// folder-pair one. Then for every distinct ordered pair of folders
+/// the hash touches, record one shared-file entry.
+///
+/// Output is unfiltered; the caller applies `min_coverage` /
+/// `min_shared` thresholds on top so the slider UX doesn't have to
+/// re-bucket on every drag.
+fn bucket_folder_pairs(
+    rows: Vec<DuplicateRow>,
+    totals: &HashMap<(i32, String), i64>,
+) -> Vec<FolderPair> {
+    // Group dup rows by content_hash (the rows are already ordered by
+    // hash in the SQL query, but don't rely on that for correctness).
+    let mut by_hash: HashMap<String, Vec<DuplicateRow>> = HashMap::new();
+    for row in rows {
+        by_hash
+            .entry(row.content_hash.clone())
+            .or_default()
+            .push(row);
+    }
+
+    // Edge accumulator: pair-key → list of shared files. The pair key
+    // owns its strings so we can serialize folder names back into the
+    // wire shape without juggling lifetimes through the response.
+    type PairKey = ((i32, String), (i32, String));
+    let mut edges: HashMap<PairKey, Vec<FolderPairFile>> = HashMap::new();
+
+    for (_, members) in by_hash {
+        if members.len() < 2 {
+            continue;
+        }
+
+        // One representative per (library_id, folder). Lex-smallest
+        // rel_path wins — deterministic, and matches the SQL ORDER BY
+        // so the visible thumbnail is stable across requests.
+        let mut by_folder: HashMap<(i32, String), &DuplicateRow> = HashMap::new();
+        for m in &members {
+            let key = (m.library_id, folder_dir(&m.rel_path).to_string());
+            by_folder
+                .entry(key)
+                .and_modify(|existing| {
+                    if m.rel_path < existing.rel_path {
+                        *existing = m;
+                    }
+                })
+                .or_insert(m);
+        }
+
+        // Skip degenerate hashes that all live in one folder — those
+        // are within-folder dups, surfaced by the EXACT tab.
+        if by_folder.len() < 2 {
+            continue;
+        }
+
+        // For every unordered pair of folders this hash touches, record
+        // one shared-file entry (one rep per side).
+        let folder_keys: Vec<&(i32, String)> = by_folder.keys().collect();
+        for i in 0..folder_keys.len() {
+            for j in (i + 1)..folder_keys.len() {
+                let (a_key, b_key) = canonical_pair(folder_keys[i], folder_keys[j]);
+                let a_row = by_folder[a_key];
+                let b_row = by_folder[b_key];
+                edges
+                    .entry((a_key.clone(), b_key.clone()))
+                    .or_default()
+                    .push(FolderPairFile {
+                        content_hash: a_row.content_hash.clone(),
+                        a_rel_path: a_row.rel_path.clone(),
+                        b_rel_path: b_row.rel_path.clone(),
+                        // Size / dims should agree across exact dups,
+                        // but if a NULL slipped in we just take side A's.
+                        size_bytes: a_row.size_bytes.or(b_row.size_bytes),
+                        date_taken: a_row.date_taken.or(b_row.date_taken),
+                        width: a_row.width.or(b_row.width),
+                        height: a_row.height.or(b_row.height),
+                    });
+            }
+        }
+    }
+
+    let mut pairs: Vec<FolderPair> = edges
+        .into_iter()
+        .map(|((a, b), mut shared_files)| {
+            // Stable rel_path order inside the response so the
+            // frontend's thumbnail strip doesn't reshuffle on refetch.
+            shared_files.sort_by(|x, y| x.a_rel_path.cmp(&y.a_rel_path));
+            let total_a = totals.get(&a).copied().unwrap_or(0);
+            let total_b = totals.get(&b).copied().unwrap_or(0);
+            let shared_count = shared_files.len() as i64;
+            let denom = total_a.min(total_b).max(1) as f32;
+            let coverage = (shared_count as f32 / denom).clamp(0.0, 1.0);
+            FolderPair {
+                side_a: FolderEndpoint {
+                    library_id: a.0,
+                    folder: a.1,
+                    total_files: total_a,
+                },
+                side_b: FolderEndpoint {
+                    library_id: b.0,
+                    folder: b.1,
+                    total_files: total_b,
+                },
+                shared_count,
+                coverage,
+                shared_files,
+            }
+        })
+        .collect();
+
+    // Largest shared_count first (most reward per click), tie-break on
+    // higher coverage (subset-into-superset matches above scattered
+    // fragments of equal size), then deterministic by folder names.
+    pairs.sort_by(|a, b| {
+        b.shared_count
+            .cmp(&a.shared_count)
+            .then_with(|| {
+                b.coverage
+                    .partial_cmp(&a.coverage)
+                    .unwrap_or(std::cmp::Ordering::Equal)
+            })
+            .then_with(|| a.side_a.library_id.cmp(&b.side_a.library_id))
+            .then_with(|| a.side_a.folder.cmp(&b.side_a.folder))
+            .then_with(|| a.side_b.library_id.cmp(&b.side_b.library_id))
+            .then_with(|| a.side_b.folder.cmp(&b.side_b.folder))
+    });
+    pairs
+}
+
+fn filter_folder_pairs(
+    pairs: Vec<FolderPair>,
+    min_coverage: f32,
+    min_shared: u32,
+) -> Vec<FolderPair> {
+    pairs
+        .into_iter()
+        .filter(|p| p.coverage >= min_coverage && p.shared_count >= min_shared as i64)
+        .collect()
+}
+
 fn group_exact(rows: Vec<DuplicateRow>) -> Vec<DuplicateGroup> {
     let mut by_hash: HashMap<String, Vec<DuplicateRow>> = HashMap::new();
     for row in rows {
@@ -666,6 +989,10 @@ where
         .service(
             web::resource("/duplicates/perceptual").route(web::get().to(list_perceptual_handler)),
         )
+        .service(
+            web::resource("/duplicates/folder-pairs")
+                .route(web::get().to(list_folder_pairs_handler)),
+        )
         .service(web::resource("/duplicates/resolve").route(web::post().to(resolve_handler)))
         .service(web::resource("/duplicates/unresolve").route(web::post().to(unresolve_handler)))
 }
@@ -872,6 +1199,154 @@ mod tests {
         );
     }
 
+    fn dup_row_at(library_id: i32, rel: &str, hash: &str) -> DuplicateRow {
+        DuplicateRow {
+            library_id,
+            rel_path: rel.into(),
+            content_hash: hash.into(),
+            size_bytes: Some(1000),
+            date_taken: None,
+            width: None,
+            height: None,
+            phash_64: None,
+            dhash_64: None,
+            duplicate_of_hash: None,
+            duplicate_decided_at: None,
+        }
+    }
+
+    #[test]
+    fn folder_dir_strips_basename() {
+        assert_eq!(folder_dir("Cars/BMW/DSC_5530.NEF"), "Cars/BMW");
+        assert_eq!(folder_dir("IMG_1.jpg"), "");
+        assert_eq!(folder_dir("a/b/c/d.jpg"), "a/b/c");
+    }
+
+    #[test]
+    fn folder_totals_groups_by_dir() {
+        let rows = vec![
+            (1, "a/x.jpg".to_string()),
+            (1, "a/y.jpg".to_string()),
+            (1, "b/z.jpg".to_string()),
+            (2, "a/x.jpg".to_string()),
+        ];
+        let t = folder_totals(&rows);
+        assert_eq!(t.get(&(1, "a".into())).copied(), Some(2));
+        assert_eq!(t.get(&(1, "b".into())).copied(), Some(1));
+        assert_eq!(t.get(&(2, "a".into())).copied(), Some(1));
+    }
+
+    #[test]
+    fn bucket_folder_pairs_collapses_within_folder_dups() {
+        // A hash that exists at TWO paths in the same folder isn't a
+        // folder-pair edge — it's a within-folder dup. Should produce
+        // zero pairs.
+        let rows = vec![
+            dup_row_at(1, "f1/a.jpg", "h1"),
+            dup_row_at(1, "f1/a_copy.jpg", "h1"),
+        ];
+        let totals: HashMap<(i32, String), i64> =
+            [((1, "f1".to_string()), 2)].into_iter().collect();
+        let pairs = bucket_folder_pairs(rows, &totals);
+        assert!(pairs.is_empty());
+    }
+
+    #[test]
+    fn bucket_folder_pairs_canonicalizes_pair_order() {
+        // Two hashes both span (lib1, "f1") and (lib2, "f2") — should
+        // bucket onto the SAME pair, regardless of which side the dup
+        // query encounters first.
+        let rows = vec![
+            dup_row_at(1, "f1/a.jpg", "h1"),
+            dup_row_at(2, "f2/a.jpg", "h1"),
+            dup_row_at(2, "f2/b.jpg", "h2"),
+            dup_row_at(1, "f1/b.jpg", "h2"),
+        ];
+        let totals: HashMap<(i32, String), i64> =
+            [((1, "f1".to_string()), 2), ((2, "f2".to_string()), 2)]
+                .into_iter()
+                .collect();
+        let pairs = bucket_folder_pairs(rows, &totals);
+        assert_eq!(pairs.len(), 1);
+        assert_eq!(pairs[0].shared_count, 2);
+        assert!((pairs[0].coverage - 1.0).abs() < 1e-6);
+    }
+
+    #[test]
+    fn bucket_folder_pairs_subset_coverage() {
+        // BMW (50 files) is a strict subset of Night Photos (200 files):
+        // 3 hashes shared between them, BMW has 5 files, Night has 20.
+        // Coverage should be 3/5 = 0.6 (smaller side fully informs the
+        // metric — that's the "fully contained" signal).
+        let rows = vec![
+            dup_row_at(1, "Cars/BMW/a.NEF", "h1"),
+            dup_row_at(1, "Cars/Night Photos/2015/July/a.NEF", "h1"),
+            dup_row_at(1, "Cars/BMW/b.NEF", "h2"),
+            dup_row_at(1, "Cars/Night Photos/2015/July/b.NEF", "h2"),
+            dup_row_at(1, "Cars/BMW/c.NEF", "h3"),
+            dup_row_at(1, "Cars/Night Photos/2015/July/c.NEF", "h3"),
+        ];
+        let totals: HashMap<(i32, String), i64> = [
+            ((1, "Cars/BMW".to_string()), 5),
+            ((1, "Cars/Night Photos/2015/July".to_string()), 20),
+        ]
+        .into_iter()
+        .collect();
+        let pairs = bucket_folder_pairs(rows, &totals);
+        assert_eq!(pairs.len(), 1);
+        assert_eq!(pairs[0].shared_count, 3);
+        // Smaller side (BMW, 5 files) hits coverage 3/5 = 0.6.
+        assert!((pairs[0].coverage - 0.6).abs() < 1e-6);
+    }
+
+    #[test]
+    fn bucket_folder_pairs_picks_lex_smallest_rep() {
+        // Two copies in folder A, one in folder B. The shared_files
+        // entry should reference A's lex-smallest rel_path.
+        let rows = vec![
+            dup_row_at(1, "A/z.jpg", "h1"),
+            dup_row_at(1, "A/a.jpg", "h1"),
+            dup_row_at(1, "B/q.jpg", "h1"),
+        ];
+        let totals: HashMap<(i32, String), i64> =
+            [((1, "A".to_string()), 2), ((1, "B".to_string()), 1)]
+                .into_iter()
+                .collect();
+        let pairs = bucket_folder_pairs(rows, &totals);
+        assert_eq!(pairs.len(), 1);
+        let f = &pairs[0].shared_files[0];
+        // Pair is canonicalized so A < B; A's rep wins lex.
+        assert_eq!(f.a_rel_path, "A/a.jpg");
+        assert_eq!(f.b_rel_path, "B/q.jpg");
+    }
+
+    #[test]
+    fn filter_folder_pairs_applies_thresholds() {
+        let totals: HashMap<(i32, String), i64> = [
+            ((1, "tiny".to_string()), 1),
+            ((1, "tinier".to_string()), 1),
+            ((1, "big-a".to_string()), 100),
+            ((1, "big-b".to_string()), 100),
+        ]
+        .into_iter()
+        .collect();
+        // tiny↔tinier: 1 shared, coverage 1.0 — should be filtered out
+        // by min_shared=3 (incidental, not a folder-level signal).
+        // big-a↔big-b: 50 shared — should pass.
+        let mut rows: Vec<DuplicateRow> = Vec::new();
+        rows.push(dup_row_at(1, "tiny/x.jpg", "ht"));
+        rows.push(dup_row_at(1, "tinier/x.jpg", "ht"));
+        for i in 0..50 {
+            let h = format!("h{}", i);
+            rows.push(dup_row_at(1, &format!("big-a/{}.jpg", i), &h));
+            rows.push(dup_row_at(1, &format!("big-b/{}.jpg", i), &h));
+        }
+        let pairs = bucket_folder_pairs(rows, &totals);
+        let kept = filter_folder_pairs(pairs, 0.5, 3);
+        assert_eq!(kept.len(), 1);
+        assert_eq!(kept[0].shared_count, 50);
+    }
+
     /// Sanity-check the BK-tree's metric, which is what the duplicates
     /// path actually clusters on.
     #[test]
diff --git a/src/files.rs b/src/files.rs
index 0ed46b5..9f01624 100644
--- a/src/files.rs
+++ b/src/files.rs
@@ -1767,6 +1767,15 @@ mod tests {
             Ok(Vec::new())
         }
 
+        fn list_image_paths(
+            &mut self,
+            _context: &opentelemetry::Context,
+            _library_id: Option<i32>,
+            _include_resolved: bool,
+        ) -> Result<Vec<(i32, String)>, DbError> {
+            Ok(Vec::new())
+        }
+
         fn lookup_duplicate_row(
             &mut self,
             _context: &opentelemetry::Context,