From 67cf0c7f73e63ab628ab4d6848b1a88b40272275 Mon Sep 17 00:00:00 2001
From: Cameron Cordes
Date: Wed, 6 May 2026 12:43:29 -0400
Subject: [PATCH] duplicates: folder-pair view of exact dups
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bucket exact-dup rows by (library_id, dirname) pair on each side, then
filter by coverage = shared / min(folder_a_total, folder_b_total) and
an absolute floor on shared count. Surfaces "this folder is mostly
contained in that folder" matches that the per-file EXACT view buries
under one row each — e.g. an old phone-backup tree shadowing the
organized library, or a topic-grouped folder duplicating a
date-grouped one within the same library.

New endpoint: GET /duplicates/folder-pairs?library=&include_resolved=
&min_coverage=&min_shared=. Cached 5 min keyed on (library,
include_resolved); the user-tunable thresholds filter the cached
unfiltered pair list so slider drags don't re-bucket. Shares the
resolve / unresolve flow with the existing tabs — the frontend fans
out N parallel /resolve calls, one per shared content_hash.

Folder names carry no signal (BMW lives under Night Photos, not
BMW_backup), so bucketing is purely on (library_id, dirname)
co-occurrence in exact-dup groups. Within-folder dups (same hash twice
in the same folder) are skipped — those belong to the EXACT tab.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 src/database/mod.rs |  39 ++++
 src/duplicates.rs   | 475 ++++++++++++++++++++++++++++++++++++++++++++
 src/files.rs        |   9 +
 3 files changed, 523 insertions(+)

diff --git a/src/database/mod.rs b/src/database/mod.rs
index 696ffbc..509315d 100644
--- a/src/database/mod.rs
+++ b/src/database/mod.rs
@@ -446,6 +446,18 @@ pub trait ExifDao: Sync + Send {
         include_resolved: bool,
     ) -> Result<Vec<DuplicateRow>, DbError>;
 
+    /// Lightweight `(library_id, rel_path)` listing for every hashed
+    /// image_exif row, used to compute per-folder file totals for the
+    /// folder-pair duplicate view. Filters mirror `list_duplicates_exact`
+    /// so the denominator (folder population) and numerator (shared
+    /// dups between two folders) come from the same row population.
+    fn list_image_paths(
+        &mut self,
+        context: &opentelemetry::Context,
+        library_id: Option<i32>,
+        include_resolved: bool,
+    ) -> Result<Vec<(i32, String)>, DbError>;
+
     /// Look up a single row's metadata by `(library_id, rel_path)`. Used
     /// by the resolve endpoint to map the request payload to the
     /// underlying `content_hash` before writing the soft-mark. Returns
@@ -1585,6 +1597,33 @@ impl ExifDao for SqliteExifDao {
             .map_err(|_| DbError::new(DbErrorKind::QueryError))
     }
 
+    fn list_image_paths(
+        &mut self,
+        context: &opentelemetry::Context,
+        library_id_filter: Option<i32>,
+        include_resolved: bool,
+    ) -> Result<Vec<(i32, String)>, DbError> {
+        trace_db_call(context, "query", "list_image_paths", |_span| {
+            use schema::image_exif::dsl::*;
+
+            let mut connection = self.connection.lock().expect("Unable to get ExifDao");
+
+            let mut q = image_exif
+                .filter(content_hash.is_not_null())
+                .select((library_id, rel_path))
+                .into_boxed();
+            if let Some(lib) = library_id_filter {
+                q = q.filter(library_id.eq(lib));
+            }
+            if !include_resolved {
+                q = q.filter(duplicate_of_hash.is_null());
+            }
+            q.load::<(i32, String)>(connection.deref_mut())
+                .map_err(|_| anyhow::anyhow!("Query error"))
+        })
+        .map_err(|_| DbError::new(DbErrorKind::QueryError))
+    }
+
     fn lookup_duplicate_row(
         &mut self,
         context: &opentelemetry::Context,
diff --git a/src/duplicates.rs b/src/duplicates.rs
index d7b6921..372415b 100644
--- a/src/duplicates.rs
+++ b/src/duplicates.rs
@@ -35,6 +35,7 @@ use crate::state::AppState;
 // ── Cache ────────────────────────────────────────────────────────────────
 
 const PERCEPTUAL_CACHE_TTL: Duration = Duration::from_secs(300);
+const FOLDER_PAIR_CACHE_TTL: Duration = Duration::from_secs(300);
 
 #[derive(Clone)]
 struct PerceptualCacheEntry {
@@ -48,8 +49,22 @@ struct PerceptualCacheEntry {
     groups: Vec<DuplicateGroup>,
 }
 
+#[derive(Clone)]
+struct FolderPairCacheEntry {
+    library_id: Option<i32>,
+    include_resolved: bool,
+    computed_at: Instant,
+    /// Pre-filter cache: every detected folder pair, regardless of
+    /// min_coverage / min_shared. The handler applies the user-supplied
+    /// thresholds on top so a slider drag is a memcpy + a filter, not a
+    /// re-bucket. Bucketing dominates the cost (O(N²) over members of
+    /// big exact-dup groups), so this is the right cache boundary.
+    pairs: Vec<FolderPair>,
+}
+
 lazy_static! {
     static ref PERCEPTUAL_CACHE: Mutex<Option<PerceptualCacheEntry>> = Mutex::new(None);
+    static ref FOLDER_PAIR_CACHE: Mutex<Option<FolderPairCacheEntry>> = Mutex::new(None);
 }
 
 /// Drop the perceptual-cluster cache. Called from `resolve`/`unresolve`
@@ -60,6 +75,15 @@ fn invalidate_perceptual_cache() {
     }
 }
 
+/// Drop the folder-pair cache. Same rationale as
+/// `invalidate_perceptual_cache`: a resolve mutates which rows
+/// participate, so the next folder-pair fetch must re-bucket.
+fn invalidate_folder_pair_cache() {
+    if let Ok(mut guard) = FOLDER_PAIR_CACHE.lock() {
+        *guard = None;
+    }
+}
+
 // ── Wire shapes ──────────────────────────────────────────────────────────
 
 #[derive(Serialize, Debug, Clone)]
@@ -144,6 +168,59 @@ pub struct UnresolveDuplicateReq {
     pub rel_path: String,
 }
 
+#[derive(Deserialize, Debug)]
+pub struct ListFolderPairsQuery {
+    pub library: Option<String>,
+    #[serde(default)]
+    pub include_resolved: Option<bool>,
+    /// Coverage floor: `shared / min(side_a.total, side_b.total)`.
+    /// Default 0.5 — surfaces "at least half of the smaller folder is
+    /// duplicated in the other folder", which is the threshold above
+    /// which "demote one whole side" feels safe.
+    #[serde(default)]
+    pub min_coverage: Option<f32>,
+    /// Absolute floor on shared-file count. Default 3 — anything less
+    /// is incidental noise (e.g. two folders that happen to share a
+    /// stock background image).
+    #[serde(default)]
+    pub min_shared: Option<u32>,
+}
+
+#[derive(Serialize, Debug, Clone)]
+pub struct FolderEndpoint {
+    pub library_id: i32,
+    /// Folder path relative to library root, e.g. `Cars/BMW`. Empty
+    /// string for files at the library root (no leading slash).
+    pub folder: String,
+    /// Total count of `image_exif` rows in this folder, applied with
+    /// the same `include_resolved` filter as the dup query so the
+    /// numerator and denominator come from the same population.
+    pub total_files: i64,
+}
+
+#[derive(Serialize, Debug, Clone)]
+pub struct FolderPairFile {
+    pub content_hash: String,
+    pub a_rel_path: String,
+    pub b_rel_path: String,
+    pub size_bytes: Option<i64>,
+    pub date_taken: Option<i64>,
+    pub width: Option<i32>,
+    pub height: Option<i32>,
+}
+
+#[derive(Serialize, Debug, Clone)]
+pub struct FolderPair {
+    pub side_a: FolderEndpoint,
+    pub side_b: FolderEndpoint,
+    pub shared_count: i64,
+    /// `shared_count / min(side_a.total_files, side_b.total_files)`,
+    /// in `[0.0, 1.0]`. A coverage of 1.0 means the smaller folder is
+    /// fully contained in the other.
+    pub coverage: f32,
+    pub shared_files: Vec<FolderPairFile>,
+}
+
 // ── Handlers ─────────────────────────────────────────────────────────────
 
 async fn list_exact_handler(
@@ -334,6 +411,7 @@ async fn resolve_handler(
     drop(dao);
 
     invalidate_perceptual_cache();
+    invalidate_folder_pair_cache();
 
     HttpResponse::Ok().json(ResolveResponse { resolved_count })
 }
@@ -355,10 +433,73 @@ async fn unresolve_handler(
     drop(dao);
 
     invalidate_perceptual_cache();
+    invalidate_folder_pair_cache();
 
     HttpResponse::Ok().finish()
 }
 
+async fn list_folder_pairs_handler(
+    _: Claims,
+    request: HttpRequest,
+    app_state: web::Data<AppState>,
+    query: web::Query<ListFolderPairsQuery>,
+    exif_dao: web::Data<Mutex<Box<dyn ExifDao>>>,
+) -> impl Responder {
+    let context = extract_context_from_request(&request);
+    let span = global_tracer().start_with_context("duplicates.list_folder_pairs", &context);
+    let span_context = opentelemetry::Context::current_with_span(span);
+
+    let library_id = libraries::resolve_library_param(&app_state, query.library.as_deref())
+        .ok()
+        .flatten()
+        .map(|l| l.id);
+    let include_resolved = query.include_resolved.unwrap_or(false);
+    let min_coverage = query.min_coverage.unwrap_or(0.5).clamp(0.0, 1.0);
+    let min_shared = query.min_shared.unwrap_or(3).max(1);
+
+    // Cache hit on the (library, include_resolved) tuple — coverage /
+    // min_shared are user-tunable filters applied AFTER bucketing, so
+    // the cache stores the unfiltered pair list.
+    if let Ok(guard) = FOLDER_PAIR_CACHE.lock()
+        && let Some(entry) = guard.as_ref()
+        && entry.library_id == library_id
+        && entry.include_resolved == include_resolved
+        && entry.computed_at.elapsed() < FOLDER_PAIR_CACHE_TTL
+    {
+        let filtered = filter_folder_pairs(entry.pairs.clone(), min_coverage, min_shared);
+        return HttpResponse::Ok().json(FolderPairsResponse { pairs: filtered });
+    }
+
+    let (dup_rows, all_paths) = {
+        let mut dao = exif_dao.lock().expect("exif dao lock");
+        let dup_rows = match dao.list_duplicates_exact(&span_context, library_id, include_resolved)
+        {
+            Ok(rows) => rows,
+            Err(e) => return HttpResponse::InternalServerError().body(format!("{:?}", e)),
+        };
+        let all_paths = match dao.list_image_paths(&span_context, library_id, include_resolved) {
+            Ok(rows) => rows,
+            Err(e) => return HttpResponse::InternalServerError().body(format!("{:?}", e)),
+        };
+        (dup_rows, all_paths)
+    };
+
+    let totals = folder_totals(&all_paths);
+    let pairs = bucket_folder_pairs(dup_rows, &totals);
+
+    if let Ok(mut guard) = FOLDER_PAIR_CACHE.lock() {
+        *guard = Some(FolderPairCacheEntry {
+            library_id,
+            include_resolved,
+            computed_at: Instant::now(),
+            pairs: pairs.clone(),
+        });
+    }
+
+    let filtered = filter_folder_pairs(pairs, min_coverage, min_shared);
+    HttpResponse::Ok().json(FolderPairsResponse { pairs: filtered })
+}
+
 // ── Grouping / clustering ────────────────────────────────────────────────
 
 #[derive(Serialize, Debug)]
@@ -366,6 +507,188 @@ struct GroupsResponse {
     groups: Vec<DuplicateGroup>,
 }
 
+#[derive(Serialize, Debug)]
+struct FolderPairsResponse {
+    pairs: Vec<FolderPair>,
+}
+
+/// Folder portion of `rel_path`: everything up to (and excluding) the
+/// last `/`. Returns an empty string for top-level files. Library-root
+/// agnostic — the rel_path is already relative to the library root.
+fn folder_dir(rel_path: &str) -> &str {
+    match rel_path.rfind('/') {
+        Some(i) => &rel_path[..i],
+        None => "",
+    }
+}
+
+/// Per-folder file totals from a flat `(library_id, rel_path)` listing.
+/// Used as the denominator for the coverage metric. We count every
+/// hashed image_exif row that matches the dup query's filter, so the
+/// numerator (shared dups) and denominator (folder population) come
+/// from the same population.
+fn folder_totals(rows: &[(i32, String)]) -> HashMap<(i32, String), i64> {
+    let mut out: HashMap<(i32, String), i64> = HashMap::new();
+    for (lib, rel_path) in rows {
+        let dir = folder_dir(rel_path).to_string();
+        *out.entry((*lib, dir)).or_insert(0) += 1;
+    }
+    out
+}
+
+/// Canonical ordering for a folder pair: lexicographic on
+/// `(library_id, folder)`. Ensures we bucket `(F1, F2)` and `(F2, F1)`
+/// onto the same key regardless of which member of an exact-dup group
+/// we encounter first.
+fn canonical_pair<'a>(
+    a: &'a (i32, String),
+    b: &'a (i32, String),
+) -> (&'a (i32, String), &'a (i32, String)) {
+    if a <= b { (a, b) } else { (b, a) }
+}
+
+/// Bucket exact-dup rows into folder-pair edges. For each exact-dup
+/// group, pick one representative member per (library, folder) tuple
+/// (lex-smallest rel_path) so within-folder duplicates collapse to one
+/// edge endpoint each — those are an EXACT-tab concern, not a
+/// folder-pair one. Then for every distinct ordered pair of folders
+/// the hash touches, record one shared-file entry.
+///
+/// Output is unfiltered; the caller applies `min_coverage` /
+/// `min_shared` thresholds on top so the slider UX doesn't have to
+/// re-bucket on every drag.
+fn bucket_folder_pairs(
+    rows: Vec<DuplicateRow>,
+    totals: &HashMap<(i32, String), i64>,
+) -> Vec<FolderPair> {
+    // Group dup rows by content_hash (the rows are already ordered by
+    // hash in the SQL query, but don't rely on that for correctness).
+    let mut by_hash: HashMap<String, Vec<DuplicateRow>> = HashMap::new();
+    for row in rows {
+        by_hash
+            .entry(row.content_hash.clone())
+            .or_default()
+            .push(row);
+    }
+
+    // Edge accumulator: pair-key → list of shared files. The pair key
+    // owns its strings so we can serialize folder names back into the
+    // wire shape without juggling lifetimes through the response.
+    type PairKey = ((i32, String), (i32, String));
+    let mut edges: HashMap<PairKey, Vec<FolderPairFile>> = HashMap::new();
+
+    for (_, members) in by_hash {
+        if members.len() < 2 {
+            continue;
+        }
+
+        // One representative per (library_id, folder). Lex-smallest
+        // rel_path wins — deterministic, and matches the SQL ORDER BY
+        // so the visible thumbnail is stable across requests.
+        let mut by_folder: HashMap<(i32, String), &DuplicateRow> = HashMap::new();
+        for m in &members {
+            let key = (m.library_id, folder_dir(&m.rel_path).to_string());
+            by_folder
+                .entry(key)
+                .and_modify(|existing| {
+                    if m.rel_path < existing.rel_path {
+                        *existing = m;
+                    }
+                })
+                .or_insert(m);
+        }
+
+        // Skip degenerate hashes that all live in one folder — those
+        // are within-folder dups, surfaced by the EXACT tab.
+        if by_folder.len() < 2 {
+            continue;
+        }
+
+        // For every unordered pair of folders this hash touches, record
+        // one shared-file entry (one rep per side).
+        let folder_keys: Vec<&(i32, String)> = by_folder.keys().collect();
+        for i in 0..folder_keys.len() {
+            for j in (i + 1)..folder_keys.len() {
+                let (a_key, b_key) = canonical_pair(folder_keys[i], folder_keys[j]);
+                let a_row = by_folder[a_key];
+                let b_row = by_folder[b_key];
+                edges
+                    .entry((a_key.clone(), b_key.clone()))
+                    .or_default()
+                    .push(FolderPairFile {
+                        content_hash: a_row.content_hash.clone(),
+                        a_rel_path: a_row.rel_path.clone(),
+                        b_rel_path: b_row.rel_path.clone(),
+                        // Size / dims should agree across exact dups,
+                        // but if a NULL slipped in we just take side A's.
+                        size_bytes: a_row.size_bytes.or(b_row.size_bytes),
+                        date_taken: a_row.date_taken.or(b_row.date_taken),
+                        width: a_row.width.or(b_row.width),
+                        height: a_row.height.or(b_row.height),
+                    });
+            }
+        }
+    }
+
+    let mut pairs: Vec<FolderPair> = edges
+        .into_iter()
+        .map(|((a, b), mut shared_files)| {
+            // Stable rel_path order inside the response so the
+            // frontend's thumbnail strip doesn't reshuffle on refetch.
+            shared_files.sort_by(|x, y| x.a_rel_path.cmp(&y.a_rel_path));
+            let total_a = totals.get(&a).copied().unwrap_or(0);
+            let total_b = totals.get(&b).copied().unwrap_or(0);
+            let shared_count = shared_files.len() as i64;
+            let denom = total_a.min(total_b).max(1) as f32;
+            let coverage = (shared_count as f32 / denom).clamp(0.0, 1.0);
+            FolderPair {
+                side_a: FolderEndpoint {
+                    library_id: a.0,
+                    folder: a.1,
+                    total_files: total_a,
+                },
+                side_b: FolderEndpoint {
+                    library_id: b.0,
+                    folder: b.1,
+                    total_files: total_b,
+                },
+                shared_count,
+                coverage,
+                shared_files,
+            }
+        })
+        .collect();
+
+    // Largest shared_count first (most reward per click), tie-break on
+    // higher coverage (subset-into-superset matches above scattered
+    // fragments of equal size), then deterministic by folder names.
+    pairs.sort_by(|a, b| {
+        b.shared_count
+            .cmp(&a.shared_count)
+            .then_with(|| {
+                b.coverage
+                    .partial_cmp(&a.coverage)
+                    .unwrap_or(std::cmp::Ordering::Equal)
+            })
+            .then_with(|| a.side_a.library_id.cmp(&b.side_a.library_id))
+            .then_with(|| a.side_a.folder.cmp(&b.side_a.folder))
+            .then_with(|| a.side_b.library_id.cmp(&b.side_b.library_id))
+            .then_with(|| a.side_b.folder.cmp(&b.side_b.folder))
+    });
+    pairs
+}
+
+fn filter_folder_pairs(
+    pairs: Vec<FolderPair>,
+    min_coverage: f32,
+    min_shared: u32,
+) -> Vec<FolderPair> {
+    pairs
+        .into_iter()
+        .filter(|p| p.coverage >= min_coverage && p.shared_count >= min_shared as i64)
+        .collect()
+}
+
 fn group_exact(rows: Vec<DuplicateRow>) -> Vec<DuplicateGroup> {
     let mut by_hash: HashMap<String, Vec<DuplicateRow>> = HashMap::new();
     for row in rows {
@@ -666,6 +989,10 @@ where
         .service(
             web::resource("/duplicates/perceptual").route(web::get().to(list_perceptual_handler)),
         )
+        .service(
+            web::resource("/duplicates/folder-pairs")
+                .route(web::get().to(list_folder_pairs_handler)),
+        )
         .service(web::resource("/duplicates/resolve").route(web::post().to(resolve_handler)))
         .service(web::resource("/duplicates/unresolve").route(web::post().to(unresolve_handler)))
 }
@@ -872,6 +1199,154 @@ mod tests {
         );
     }
 
+    fn dup_row_at(library_id: i32, rel: &str, hash: &str) -> DuplicateRow {
+        DuplicateRow {
+            library_id,
+            rel_path: rel.into(),
+            content_hash: hash.into(),
+            size_bytes: Some(1000),
+            date_taken: None,
+            width: None,
+            height: None,
+            phash_64: None,
+            dhash_64: None,
+            duplicate_of_hash: None,
+            duplicate_decided_at: None,
+        }
+    }
+
+    #[test]
+    fn folder_dir_strips_basename() {
+        assert_eq!(folder_dir("Cars/BMW/DSC_5530.NEF"), "Cars/BMW");
+        assert_eq!(folder_dir("IMG_1.jpg"), "");
+        assert_eq!(folder_dir("a/b/c/d.jpg"), "a/b/c");
+    }
+
+    #[test]
+    fn folder_totals_groups_by_dir() {
+        let rows = vec![
+            (1, "a/x.jpg".to_string()),
+            (1, "a/y.jpg".to_string()),
+            (1, "b/z.jpg".to_string()),
+            (2, "a/x.jpg".to_string()),
+        ];
+        let t = folder_totals(&rows);
+        assert_eq!(t.get(&(1, "a".into())).copied(), Some(2));
+        assert_eq!(t.get(&(1, "b".into())).copied(), Some(1));
+        assert_eq!(t.get(&(2, "a".into())).copied(), Some(1));
+    }
+
+    #[test]
+    fn bucket_folder_pairs_collapses_within_folder_dups() {
+        // A hash that exists at TWO paths in the same folder isn't a
+        // folder-pair edge — it's a within-folder dup. Should produce
+        // zero pairs.
+        let rows = vec![
+            dup_row_at(1, "f1/a.jpg", "h1"),
+            dup_row_at(1, "f1/a_copy.jpg", "h1"),
+        ];
+        let totals: HashMap<(i32, String), i64> =
+            [((1, "f1".to_string()), 2)].into_iter().collect();
+        let pairs = bucket_folder_pairs(rows, &totals);
+        assert!(pairs.is_empty());
+    }
+
+    #[test]
+    fn bucket_folder_pairs_canonicalizes_pair_order() {
+        // Two hashes both span (lib1, "f1") and (lib2, "f2") — should
+        // bucket onto the SAME pair, regardless of which side the dup
+        // query encounters first.
+        let rows = vec![
+            dup_row_at(1, "f1/a.jpg", "h1"),
+            dup_row_at(2, "f2/a.jpg", "h1"),
+            dup_row_at(2, "f2/b.jpg", "h2"),
+            dup_row_at(1, "f1/b.jpg", "h2"),
+        ];
+        let totals: HashMap<(i32, String), i64> =
+            [((1, "f1".to_string()), 2), ((2, "f2".to_string()), 2)]
+                .into_iter()
+                .collect();
+        let pairs = bucket_folder_pairs(rows, &totals);
+        assert_eq!(pairs.len(), 1);
+        assert_eq!(pairs[0].shared_count, 2);
+        assert!((pairs[0].coverage - 1.0).abs() < 1e-6);
+    }
+
+    #[test]
+    fn bucket_folder_pairs_subset_coverage() {
+        // BMW (50 files) is a strict subset of Night Photos (200 files):
+        // 3 hashes shared between them, BMW has 5 files, Night has 20.
+        // Coverage should be 3/5 = 0.6 (smaller side fully informs the
+        // metric — that's the "fully contained" signal).
+        let rows = vec![
+            dup_row_at(1, "Cars/BMW/a.NEF", "h1"),
+            dup_row_at(1, "Cars/Night Photos/2015/July/a.NEF", "h1"),
+            dup_row_at(1, "Cars/BMW/b.NEF", "h2"),
+            dup_row_at(1, "Cars/Night Photos/2015/July/b.NEF", "h2"),
+            dup_row_at(1, "Cars/BMW/c.NEF", "h3"),
+            dup_row_at(1, "Cars/Night Photos/2015/July/c.NEF", "h3"),
+        ];
+        let totals: HashMap<(i32, String), i64> = [
+            ((1, "Cars/BMW".to_string()), 5),
+            ((1, "Cars/Night Photos/2015/July".to_string()), 20),
+        ]
+        .into_iter()
+        .collect();
+        let pairs = bucket_folder_pairs(rows, &totals);
+        assert_eq!(pairs.len(), 1);
+        assert_eq!(pairs[0].shared_count, 3);
+        // Smaller side (BMW, 5 files) hits coverage 3/5 = 0.6.
+        assert!((pairs[0].coverage - 0.6).abs() < 1e-6);
+    }
+
+    #[test]
+    fn bucket_folder_pairs_picks_lex_smallest_rep() {
+        // Two copies in folder A, one in folder B. The shared_files
+        // entry should reference A's lex-smallest rel_path.
+        let rows = vec![
+            dup_row_at(1, "A/z.jpg", "h1"),
+            dup_row_at(1, "A/a.jpg", "h1"),
+            dup_row_at(1, "B/q.jpg", "h1"),
+        ];
+        let totals: HashMap<(i32, String), i64> =
+            [((1, "A".to_string()), 2), ((1, "B".to_string()), 1)]
+                .into_iter()
+                .collect();
+        let pairs = bucket_folder_pairs(rows, &totals);
+        assert_eq!(pairs.len(), 1);
+        let f = &pairs[0].shared_files[0];
+        // Pair is canonicalized so A < B; A's rep wins lex.
+        assert_eq!(f.a_rel_path, "A/a.jpg");
+        assert_eq!(f.b_rel_path, "B/q.jpg");
+    }
+
+    #[test]
+    fn filter_folder_pairs_applies_thresholds() {
+        let totals: HashMap<(i32, String), i64> = [
+            ((1, "tiny".to_string()), 1),
+            ((1, "tinier".to_string()), 1),
+            ((1, "big-a".to_string()), 100),
+            ((1, "big-b".to_string()), 100),
+        ]
+        .into_iter()
+        .collect();
+        // tiny↔tinier: 1 shared, coverage 1.0 — should be filtered out
+        // by min_shared=3 (incidental, not a folder-level signal).
+        // big-a↔big-b: 50 shared — should pass.
+        let mut rows: Vec<DuplicateRow> = Vec::new();
+        rows.push(dup_row_at(1, "tiny/x.jpg", "ht"));
+        rows.push(dup_row_at(1, "tinier/x.jpg", "ht"));
+        for i in 0..50 {
+            let h = format!("h{}", i);
+            rows.push(dup_row_at(1, &format!("big-a/{}.jpg", i), &h));
+            rows.push(dup_row_at(1, &format!("big-b/{}.jpg", i), &h));
+        }
+        let pairs = bucket_folder_pairs(rows, &totals);
+        let kept = filter_folder_pairs(pairs, 0.5, 3);
+        assert_eq!(kept.len(), 1);
+        assert_eq!(kept[0].shared_count, 50);
+    }
+
     /// Sanity-check the BK-tree's metric, which is what the duplicates
     /// path actually clusters on.
     #[test]
diff --git a/src/files.rs b/src/files.rs
index 0ed46b5..9f01624 100644
--- a/src/files.rs
+++ b/src/files.rs
@@ -1767,6 +1767,15 @@ mod tests {
             Ok(Vec::new())
         }
 
+        fn list_image_paths(
+            &mut self,
+            _context: &opentelemetry::Context,
+            _library_id: Option<i32>,
+            _include_resolved: bool,
+        ) -> Result<Vec<(i32, String)>, DbError> {
+            Ok(Vec::new())
+        }
+
         fn lookup_duplicate_row(
             &mut self,
             _context: &opentelemetry::Context,