Merge pull request 'duplicates: folder-pair view of exact dups' (#75) from feature/folder-pair-duplicates into master
Reviewed-on: #75
This commit was merged in pull request #75.
This commit is contained in:
@@ -446,6 +446,18 @@ pub trait ExifDao: Sync + Send {
|
||||
include_resolved: bool,
|
||||
) -> Result<Vec<DuplicateRow>, DbError>;
|
||||
|
||||
/// Lightweight `(library_id, rel_path)` listing for every hashed
/// image_exif row, used to compute per-folder file totals for the
/// folder-pair duplicate view. Filters mirror `list_duplicates_exact`
/// so the denominator (folder population) and numerator (shared
/// dups between two folders) come from the same row population.
///
/// # Errors
/// Returns `DbError` when the underlying query fails.
fn list_image_paths(
    &mut self,
    context: &opentelemetry::Context,
    library_id: Option<i32>,
    include_resolved: bool,
) -> Result<Vec<(i32, String)>, DbError>;
|
||||
|
||||
/// Look up a single row's metadata by `(library_id, rel_path)`. Used
|
||||
/// by the resolve endpoint to map the request payload to the
|
||||
/// underlying `content_hash` before writing the soft-mark. Returns
|
||||
@@ -1585,6 +1597,33 @@ impl ExifDao for SqliteExifDao {
|
||||
.map_err(|_| DbError::new(DbErrorKind::QueryError))
|
||||
}
|
||||
|
||||
/// SQLite implementation of `ExifDao::list_image_paths`: every hashed
/// row's `(library_id, rel_path)`, optionally restricted to one
/// library and/or to rows not yet soft-resolved.
fn list_image_paths(
    &mut self,
    context: &opentelemetry::Context,
    library_id_filter: Option<i32>,
    include_resolved: bool,
) -> Result<Vec<(i32, String)>, DbError> {
    trace_db_call(context, "query", "list_image_paths", |_span| {
        use schema::image_exif::dsl::*;

        let mut connection = self.connection.lock().expect("Unable to get ExifDao");

        // `content_hash IS NOT NULL` mirrors the exact-dup query: only
        // hashed rows participate in the duplicate population, so the
        // folder totals (denominator) match the dup rows (numerator).
        let mut q = image_exif
            .filter(content_hash.is_not_null())
            .select((library_id, rel_path))
            .into_boxed();
        if let Some(lib) = library_id_filter {
            q = q.filter(library_id.eq(lib));
        }
        // Soft-resolved rows (duplicate_of_hash set) are excluded by
        // default, same as list_duplicates_exact.
        if !include_resolved {
            q = q.filter(duplicate_of_hash.is_null());
        }
        q.load::<(i32, String)>(connection.deref_mut())
            .map_err(|_| anyhow::anyhow!("Query error"))
    })
    .map_err(|_| DbError::new(DbErrorKind::QueryError))
}
|
||||
|
||||
fn lookup_duplicate_row(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
|
||||
@@ -35,6 +35,7 @@ use crate::state::AppState;
|
||||
// ── Cache ────────────────────────────────────────────────────────────────

/// How long a cached perceptual-cluster result stays valid.
const PERCEPTUAL_CACHE_TTL: Duration = Duration::from_secs(300);
/// How long a cached folder-pair result stays valid.
const FOLDER_PAIR_CACHE_TTL: Duration = Duration::from_secs(300);
|
||||
|
||||
#[derive(Clone)]
|
||||
struct PerceptualCacheEntry {
|
||||
@@ -48,8 +49,22 @@ struct PerceptualCacheEntry {
|
||||
groups: Vec<DuplicateGroup>,
|
||||
}
|
||||
|
||||
/// Cached result of the folder-pair bucketing pass, keyed by the
/// `(library_id, include_resolved)` tuple it was computed under.
#[derive(Clone)]
struct FolderPairCacheEntry {
    // Cache key: which library filter the pairs were computed for.
    library_id: Option<i32>,
    // Cache key: whether soft-resolved rows were included.
    include_resolved: bool,
    // Compared against FOLDER_PAIR_CACHE_TTL to expire the entry.
    computed_at: Instant,
    /// Pre-filter cache: every detected folder pair, regardless of
    /// min_coverage / min_shared. The handler applies the user-supplied
    /// thresholds on top so a slider drag is a memcpy + a filter, not a
    /// re-bucket. Bucketing dominates the cost (O(N²) over members of
    /// big exact-dup groups), so this is the right cache boundary.
    pairs: Vec<FolderPair>,
}
|
||||
|
||||
lazy_static! {
    // Single-entry caches: only the most recent (library, include_resolved)
    // combination is retained. NOTE(review): std::sync::LazyLock is the
    // modern stdlib replacement for lazy_static! — consider migrating.
    static ref PERCEPTUAL_CACHE: Mutex<Option<PerceptualCacheEntry>> = Mutex::new(None);
    static ref FOLDER_PAIR_CACHE: Mutex<Option<FolderPairCacheEntry>> = Mutex::new(None);
}
|
||||
|
||||
/// Drop the perceptual-cluster cache. Called from `resolve`/`unresolve`
|
||||
@@ -60,6 +75,15 @@ fn invalidate_perceptual_cache() {
|
||||
}
|
||||
}
|
||||
|
||||
/// Drop the folder-pair cache. Same rationale as
|
||||
/// `invalidate_perceptual_cache`: a resolve mutates which rows
|
||||
/// participate, so the next folder-pair fetch must re-bucket.
|
||||
fn invalidate_folder_pair_cache() {
|
||||
if let Ok(mut guard) = FOLDER_PAIR_CACHE.lock() {
|
||||
*guard = None;
|
||||
}
|
||||
}
|
||||
|
||||
// ── Wire shapes ──────────────────────────────────────────────────────────
|
||||
|
||||
#[derive(Serialize, Debug, Clone)]
|
||||
@@ -144,6 +168,59 @@ pub struct UnresolveDuplicateReq {
|
||||
pub rel_path: String,
|
||||
}
|
||||
|
||||
/// Query parameters for `GET /duplicates/folder-pairs`.
#[derive(Deserialize, Debug)]
pub struct ListFolderPairsQuery {
    /// Library selector forwarded to `resolve_library_param`; `None`
    /// means all libraries. (Exact selector semantics live in the
    /// libraries module — presumably name-or-id; confirm there.)
    pub library: Option<String>,
    /// Include soft-resolved rows in both numerator and denominator.
    /// Defaults to `false`.
    #[serde(default)]
    pub include_resolved: Option<bool>,
    /// Coverage floor: `shared / min(side_a.total, side_b.total)`.
    /// Default 0.5 — surfaces "at least half of the smaller folder is
    /// duplicated in the other folder", which is the threshold above
    /// which "demote one whole side" feels safe.
    #[serde(default)]
    pub min_coverage: Option<f32>,
    /// Absolute floor on shared-file count. Default 3 — anything less
    /// is incidental noise (e.g. two folders that happen to share a
    /// stock background image).
    #[serde(default)]
    pub min_shared: Option<u32>,
}
|
||||
|
||||
/// One side of a folder-pair edge in the wire response.
#[derive(Serialize, Debug, Clone)]
pub struct FolderEndpoint {
    pub library_id: i32,
    /// Folder path relative to library root, e.g. `Cars/BMW`. Empty
    /// string for files at the library root (no leading slash).
    pub folder: String,
    /// Total count of `image_exif` rows in this folder, applied with
    /// the same `include_resolved` filter as the dup query so the
    /// numerator and denominator come from the same population.
    pub total_files: i64,
}
|
||||
|
||||
/// One shared file on a folder-pair edge: a single content hash with
/// one representative path on each side of the pair.
#[derive(Serialize, Debug, Clone)]
pub struct FolderPairFile {
    pub content_hash: String,
    /// Representative rel_path on side A (lex-smallest in its folder).
    pub a_rel_path: String,
    /// Representative rel_path on side B.
    pub b_rel_path: String,
    // Metadata is taken from side A's row, falling back to side B's
    // when side A's value is NULL (see bucket_folder_pairs).
    pub size_bytes: Option<i64>,
    pub date_taken: Option<i64>,
    pub width: Option<i32>,
    pub height: Option<i32>,
}
|
||||
|
||||
/// A detected folder-pair edge: two folders that share exact-duplicate
/// content, plus the evidence (shared files) and the coverage metric.
#[derive(Serialize, Debug, Clone)]
pub struct FolderPair {
    pub side_a: FolderEndpoint,
    pub side_b: FolderEndpoint,
    pub shared_count: i64,
    /// `shared_count / min(side_a.total_files, side_b.total_files)`,
    /// in `[0.0, 1.0]`. A coverage of 1.0 means the smaller folder is
    /// fully contained in the other.
    pub coverage: f32,
    pub shared_files: Vec<FolderPairFile>,
}
|
||||
|
||||
// ── Handlers ─────────────────────────────────────────────────────────────
|
||||
|
||||
async fn list_exact_handler(
|
||||
@@ -334,6 +411,7 @@ async fn resolve_handler(
|
||||
|
||||
drop(dao);
|
||||
invalidate_perceptual_cache();
|
||||
invalidate_folder_pair_cache();
|
||||
|
||||
HttpResponse::Ok().json(ResolveResponse { resolved_count })
|
||||
}
|
||||
@@ -355,10 +433,73 @@ async fn unresolve_handler(
|
||||
|
||||
drop(dao);
|
||||
invalidate_perceptual_cache();
|
||||
invalidate_folder_pair_cache();
|
||||
|
||||
HttpResponse::Ok().finish()
|
||||
}
|
||||
|
||||
/// `GET /duplicates/folder-pairs` — folder-pair view of exact dups.
///
/// Resolves the optional library filter, normalizes the user-tunable
/// thresholds, serves from FOLDER_PAIR_CACHE when the cache key
/// (library, include_resolved) matches and the TTL hasn't expired,
/// otherwise re-queries the DAO, re-buckets, refreshes the cache, and
/// applies the thresholds on top.
async fn list_folder_pairs_handler(
    _: Claims,
    request: HttpRequest,
    app_state: web::Data<AppState>,
    query: web::Query<ListFolderPairsQuery>,
    exif_dao: web::Data<Mutex<Box<dyn ExifDao>>>,
) -> impl Responder {
    let context = extract_context_from_request(&request);
    let span = global_tracer().start_with_context("duplicates.list_folder_pairs", &context);
    let span_context = opentelemetry::Context::current_with_span(span);

    // NOTE(review): `.ok().flatten()` silently maps a *failed* library
    // lookup to "all libraries" instead of a 4xx — confirm that's the
    // intended behavior for an invalid `library` query param.
    let library_id = libraries::resolve_library_param(&app_state, query.library.as_deref())
        .ok()
        .flatten()
        .map(|l| l.id);
    let include_resolved = query.include_resolved.unwrap_or(false);
    // Thresholds clamped to sane ranges: coverage in [0,1], shared >= 1.
    let min_coverage = query.min_coverage.unwrap_or(0.5).clamp(0.0, 1.0);
    let min_shared = query.min_shared.unwrap_or(3).max(1);

    // Cache hit on the (library, include_resolved) tuple — coverage /
    // min_shared are user-tunable filters applied AFTER bucketing, so
    // the cache stores the unfiltered pair list.
    if let Ok(guard) = FOLDER_PAIR_CACHE.lock()
        && let Some(entry) = guard.as_ref()
        && entry.library_id == library_id
        && entry.include_resolved == include_resolved
        && entry.computed_at.elapsed() < FOLDER_PAIR_CACHE_TTL
    {
        let filtered = filter_folder_pairs(entry.pairs.clone(), min_coverage, min_shared);
        return HttpResponse::Ok().json(FolderPairsResponse { pairs: filtered });
    }

    // Both queries run under one DAO lock so numerator and denominator
    // see the same DB state; the block scope releases the lock before
    // the (potentially expensive) bucketing pass.
    let (dup_rows, all_paths) = {
        let mut dao = exif_dao.lock().expect("exif dao lock");
        let dup_rows = match dao.list_duplicates_exact(&span_context, library_id, include_resolved)
        {
            Ok(rows) => rows,
            Err(e) => return HttpResponse::InternalServerError().body(format!("{:?}", e)),
        };
        let all_paths = match dao.list_image_paths(&span_context, library_id, include_resolved) {
            Ok(rows) => rows,
            Err(e) => return HttpResponse::InternalServerError().body(format!("{:?}", e)),
        };
        (dup_rows, all_paths)
    };

    let totals = folder_totals(&all_paths);
    let pairs = bucket_folder_pairs(dup_rows, &totals);

    // Refresh the cache with the unfiltered pair list; a poisoned lock
    // is ignored (cache is best-effort).
    if let Ok(mut guard) = FOLDER_PAIR_CACHE.lock() {
        *guard = Some(FolderPairCacheEntry {
            library_id,
            include_resolved,
            computed_at: Instant::now(),
            pairs: pairs.clone(),
        });
    }

    let filtered = filter_folder_pairs(pairs, min_coverage, min_shared);
    HttpResponse::Ok().json(FolderPairsResponse { pairs: filtered })
}
|
||||
|
||||
// ── Grouping / clustering ────────────────────────────────────────────────
|
||||
|
||||
#[derive(Serialize, Debug)]
|
||||
@@ -366,6 +507,188 @@ struct GroupsResponse {
|
||||
groups: Vec<DuplicateGroup>,
|
||||
}
|
||||
|
||||
/// Wire envelope for `GET /duplicates/folder-pairs`.
#[derive(Serialize, Debug)]
struct FolderPairsResponse {
    pairs: Vec<FolderPair>,
}
|
||||
|
||||
/// Folder portion of `rel_path`: everything up to (and excluding) the
/// last `/`. Returns an empty string for top-level files. Library-root
/// agnostic — the rel_path is already relative to the library root.
fn folder_dir(rel_path: &str) -> &str {
    // rsplit_once splits at the LAST separator; no '/' means the file
    // sits at the library root, which maps to the empty folder.
    rel_path
        .rsplit_once('/')
        .map_or("", |(parent, _basename)| parent)
}
|
||||
|
||||
/// Per-folder file totals from a flat `(library_id, rel_path)` listing.
/// Used as the denominator for the coverage metric. We count every
/// hashed image_exif row that matches the dup query's filter, so the
/// numerator (shared dups) and denominator (folder population) come
/// from the same population.
fn folder_totals(rows: &[(i32, String)]) -> HashMap<(i32, String), i64> {
    let mut totals: HashMap<(i32, String), i64> = HashMap::new();
    for (lib, rel_path) in rows {
        // Folder = everything before the last '/'; "" for root files
        // (same convention as folder_dir).
        let dir = rel_path.rsplit_once('/').map_or("", |(parent, _)| parent);
        *totals.entry((*lib, dir.to_string())).or_default() += 1;
    }
    totals
}
|
||||
|
||||
/// Canonical ordering for a folder pair: lexicographic on
/// `(library_id, folder)`. Ensures we bucket `(F1, F2)` and `(F2, F1)`
/// onto the same key regardless of which member of an exact-dup group
/// we encounter first.
fn canonical_pair<'a>(
    a: &'a (i32, String),
    b: &'a (i32, String),
) -> (&'a (i32, String), &'a (i32, String)) {
    // Equal keys keep the (a, b) order, matching `a <= b`.
    match a.cmp(b) {
        std::cmp::Ordering::Greater => (b, a),
        _ => (a, b),
    }
}
|
||||
|
||||
/// Bucket exact-dup rows into folder-pair edges. For each exact-dup
/// group, pick one representative member per (library, folder) tuple
/// (lex-smallest rel_path) so within-folder duplicates collapse to one
/// edge endpoint each — those are an EXACT-tab concern, not a
/// folder-pair one. Then for every distinct ordered pair of folders
/// the hash touches, record one shared-file entry.
///
/// Output is unfiltered; the caller applies `min_coverage` /
/// `min_shared` thresholds on top so the slider UX doesn't have to
/// re-bucket on every drag.
///
/// Cost: O(F²) folder pairs per hash, where F is the number of
/// distinct folders a hash spans — this is the quadratic core the
/// cache boundary is designed around.
fn bucket_folder_pairs(
    rows: Vec<DuplicateRow>,
    totals: &HashMap<(i32, String), i64>,
) -> Vec<FolderPair> {
    // Group dup rows by content_hash (the rows are already ordered by
    // hash in the SQL query, but don't rely on that for correctness).
    let mut by_hash: HashMap<String, Vec<DuplicateRow>> = HashMap::new();
    for row in rows {
        by_hash
            .entry(row.content_hash.clone())
            .or_default()
            .push(row);
    }

    // Edge accumulator: pair-key → list of shared files. The pair key
    // owns its strings so we can serialize folder names back into the
    // wire shape without juggling lifetimes through the response.
    type PairKey = ((i32, String), (i32, String));
    let mut edges: HashMap<PairKey, Vec<FolderPairFile>> = HashMap::new();

    for (_, members) in by_hash {
        // A singleton "group" carries no duplicate signal at all.
        if members.len() < 2 {
            continue;
        }

        // One representative per (library_id, folder). Lex-smallest
        // rel_path wins — deterministic, and matches the SQL ORDER BY
        // so the visible thumbnail is stable across requests.
        let mut by_folder: HashMap<(i32, String), &DuplicateRow> = HashMap::new();
        for m in &members {
            let key = (m.library_id, folder_dir(&m.rel_path).to_string());
            by_folder
                .entry(key)
                .and_modify(|existing| {
                    if m.rel_path < existing.rel_path {
                        *existing = m;
                    }
                })
                .or_insert(m);
        }

        // Skip degenerate hashes that all live in one folder — those
        // are within-folder dups, surfaced by the EXACT tab.
        if by_folder.len() < 2 {
            continue;
        }

        // For every unordered pair of folders this hash touches, record
        // one shared-file entry (one rep per side).
        let folder_keys: Vec<&(i32, String)> = by_folder.keys().collect();
        for i in 0..folder_keys.len() {
            for j in (i + 1)..folder_keys.len() {
                let (a_key, b_key) = canonical_pair(folder_keys[i], folder_keys[j]);
                let a_row = by_folder[a_key];
                let b_row = by_folder[b_key];
                edges
                    .entry((a_key.clone(), b_key.clone()))
                    .or_default()
                    .push(FolderPairFile {
                        content_hash: a_row.content_hash.clone(),
                        a_rel_path: a_row.rel_path.clone(),
                        b_rel_path: b_row.rel_path.clone(),
                        // Size / dims should agree across exact dups,
                        // but if a NULL slipped in we just take side A's.
                        size_bytes: a_row.size_bytes.or(b_row.size_bytes),
                        date_taken: a_row.date_taken.or(b_row.date_taken),
                        width: a_row.width.or(b_row.width),
                        height: a_row.height.or(b_row.height),
                    });
            }
        }
    }

    let mut pairs: Vec<FolderPair> = edges
        .into_iter()
        .map(|((a, b), mut shared_files)| {
            // Stable rel_path order inside the response so the
            // frontend's thumbnail strip doesn't reshuffle on refetch.
            shared_files.sort_by(|x, y| x.a_rel_path.cmp(&y.a_rel_path));
            let total_a = totals.get(&a).copied().unwrap_or(0);
            let total_b = totals.get(&b).copied().unwrap_or(0);
            let shared_count = shared_files.len() as i64;
            // max(1) guards division by zero when a folder is somehow
            // missing from totals; clamp keeps coverage in [0, 1].
            let denom = total_a.min(total_b).max(1) as f32;
            let coverage = (shared_count as f32 / denom).clamp(0.0, 1.0);
            FolderPair {
                side_a: FolderEndpoint {
                    library_id: a.0,
                    folder: a.1,
                    total_files: total_a,
                },
                side_b: FolderEndpoint {
                    library_id: b.0,
                    folder: b.1,
                    total_files: total_b,
                },
                shared_count,
                coverage,
                shared_files,
            }
        })
        .collect();

    // Largest shared_count first (most reward per click), tie-break on
    // higher coverage (subset-into-superset matches above scattered
    // fragments of equal size), then deterministic by folder names.
    pairs.sort_by(|a, b| {
        b.shared_count
            .cmp(&a.shared_count)
            .then_with(|| {
                b.coverage
                    .partial_cmp(&a.coverage)
                    .unwrap_or(std::cmp::Ordering::Equal)
            })
            .then_with(|| a.side_a.library_id.cmp(&b.side_a.library_id))
            .then_with(|| a.side_a.folder.cmp(&b.side_a.folder))
            .then_with(|| a.side_b.library_id.cmp(&b.side_b.library_id))
            .then_with(|| a.side_b.folder.cmp(&b.side_b.folder))
    });
    pairs
}
|
||||
|
||||
/// Apply the user-facing thresholds to a pre-bucketed pair list,
/// preserving the incoming (already sorted) order.
fn filter_folder_pairs(
    pairs: Vec<FolderPair>,
    min_coverage: f32,
    min_shared: u32,
) -> Vec<FolderPair> {
    let shared_floor = i64::from(min_shared);
    let mut kept = pairs;
    // retain is in-place and order-preserving — same result as the
    // filter/collect form, without reallocating.
    kept.retain(|p| p.coverage >= min_coverage && p.shared_count >= shared_floor);
    kept
}
|
||||
|
||||
fn group_exact(rows: Vec<DuplicateRow>) -> Vec<DuplicateGroup> {
|
||||
let mut by_hash: HashMap<String, Vec<DuplicateRow>> = HashMap::new();
|
||||
for row in rows {
|
||||
@@ -666,6 +989,10 @@ where
|
||||
.service(
|
||||
web::resource("/duplicates/perceptual").route(web::get().to(list_perceptual_handler)),
|
||||
)
|
||||
.service(
|
||||
web::resource("/duplicates/folder-pairs")
|
||||
.route(web::get().to(list_folder_pairs_handler)),
|
||||
)
|
||||
.service(web::resource("/duplicates/resolve").route(web::post().to(resolve_handler)))
|
||||
.service(web::resource("/duplicates/unresolve").route(web::post().to(unresolve_handler)))
|
||||
}
|
||||
@@ -872,6 +1199,154 @@ mod tests {
|
||||
);
|
||||
}
|
||||
|
||||
/// Test fixture: a `DuplicateRow` at the given location/hash with a
/// fixed 1000-byte size and no optional metadata.
fn dup_row_at(library_id: i32, rel: &str, hash: &str) -> DuplicateRow {
    DuplicateRow {
        library_id,
        rel_path: rel.into(),
        content_hash: hash.into(),
        size_bytes: Some(1000),
        date_taken: None,
        width: None,
        height: None,
        phash_64: None,
        dhash_64: None,
        duplicate_of_hash: None,
        duplicate_decided_at: None,
    }
}
|
||||
|
||||
#[test]
fn folder_dir_strips_basename() {
    // Nested path, root-level file (empty folder), and deep nesting.
    assert_eq!(folder_dir("Cars/BMW/DSC_5530.NEF"), "Cars/BMW");
    assert_eq!(folder_dir("IMG_1.jpg"), "");
    assert_eq!(folder_dir("a/b/c/d.jpg"), "a/b/c");
}
|
||||
|
||||
#[test]
fn folder_totals_groups_by_dir() {
    // Same folder name in two libraries must produce two distinct keys.
    let rows = vec![
        (1, "a/x.jpg".to_string()),
        (1, "a/y.jpg".to_string()),
        (1, "b/z.jpg".to_string()),
        (2, "a/x.jpg".to_string()),
    ];
    let t = folder_totals(&rows);
    assert_eq!(t.get(&(1, "a".into())).copied(), Some(2));
    assert_eq!(t.get(&(1, "b".into())).copied(), Some(1));
    assert_eq!(t.get(&(2, "a".into())).copied(), Some(1));
}
|
||||
|
||||
#[test]
fn bucket_folder_pairs_collapses_within_folder_dups() {
    // A hash that exists at TWO paths in the same folder isn't a
    // folder-pair edge — it's a within-folder dup. Should produce
    // zero pairs.
    let rows = vec![
        dup_row_at(1, "f1/a.jpg", "h1"),
        dup_row_at(1, "f1/a_copy.jpg", "h1"),
    ];
    let totals: HashMap<(i32, String), i64> =
        [((1, "f1".to_string()), 2)].into_iter().collect();
    let pairs = bucket_folder_pairs(rows, &totals);
    assert!(pairs.is_empty());
}
|
||||
|
||||
#[test]
fn bucket_folder_pairs_canonicalizes_pair_order() {
    // Two hashes both span (lib1, "f1") and (lib2, "f2") — should
    // bucket onto the SAME pair, regardless of which side the dup
    // query encounters first (h1 arrives f1-first, h2 arrives f2-first).
    let rows = vec![
        dup_row_at(1, "f1/a.jpg", "h1"),
        dup_row_at(2, "f2/a.jpg", "h1"),
        dup_row_at(2, "f2/b.jpg", "h2"),
        dup_row_at(1, "f1/b.jpg", "h2"),
    ];
    let totals: HashMap<(i32, String), i64> =
        [((1, "f1".to_string()), 2), ((2, "f2".to_string()), 2)]
            .into_iter()
            .collect();
    let pairs = bucket_folder_pairs(rows, &totals);
    assert_eq!(pairs.len(), 1);
    assert_eq!(pairs[0].shared_count, 2);
    // 2 shared / min(2, 2) total = full coverage.
    assert!((pairs[0].coverage - 1.0).abs() < 1e-6);
}
|
||||
|
||||
#[test]
fn bucket_folder_pairs_subset_coverage() {
    // BMW is the small side, Night Photos the large one: 3 hashes are
    // shared between them, BMW holds 5 hashed files total, Night
    // Photos holds 20. Coverage should be 3/5 = 0.6 (the smaller side
    // fully informs the metric — that's the "fully contained" signal).
    let rows = vec![
        dup_row_at(1, "Cars/BMW/a.NEF", "h1"),
        dup_row_at(1, "Cars/Night Photos/2015/July/a.NEF", "h1"),
        dup_row_at(1, "Cars/BMW/b.NEF", "h2"),
        dup_row_at(1, "Cars/Night Photos/2015/July/b.NEF", "h2"),
        dup_row_at(1, "Cars/BMW/c.NEF", "h3"),
        dup_row_at(1, "Cars/Night Photos/2015/July/c.NEF", "h3"),
    ];
    let totals: HashMap<(i32, String), i64> = [
        ((1, "Cars/BMW".to_string()), 5),
        ((1, "Cars/Night Photos/2015/July".to_string()), 20),
    ]
    .into_iter()
    .collect();
    let pairs = bucket_folder_pairs(rows, &totals);
    assert_eq!(pairs.len(), 1);
    assert_eq!(pairs[0].shared_count, 3);
    // Smaller side (BMW, 5 files) hits coverage 3/5 = 0.6.
    assert!((pairs[0].coverage - 0.6).abs() < 1e-6);
}
|
||||
|
||||
#[test]
fn bucket_folder_pairs_picks_lex_smallest_rep() {
    // Two copies in folder A, one in folder B. The shared_files
    // entry should reference A's lex-smallest rel_path ("A/a.jpg",
    // not "A/z.jpg", even though z is encountered first).
    let rows = vec![
        dup_row_at(1, "A/z.jpg", "h1"),
        dup_row_at(1, "A/a.jpg", "h1"),
        dup_row_at(1, "B/q.jpg", "h1"),
    ];
    let totals: HashMap<(i32, String), i64> =
        [((1, "A".to_string()), 2), ((1, "B".to_string()), 1)]
            .into_iter()
            .collect();
    let pairs = bucket_folder_pairs(rows, &totals);
    assert_eq!(pairs.len(), 1);
    let f = &pairs[0].shared_files[0];
    // Pair is canonicalized so A < B; A's rep wins lex.
    assert_eq!(f.a_rel_path, "A/a.jpg");
    assert_eq!(f.b_rel_path, "B/q.jpg");
}
|
||||
|
||||
#[test]
fn filter_folder_pairs_applies_thresholds() {
    let totals: HashMap<(i32, String), i64> = [
        ((1, "tiny".to_string()), 1),
        ((1, "tinier".to_string()), 1),
        ((1, "big-a".to_string()), 100),
        ((1, "big-b".to_string()), 100),
    ]
    .into_iter()
    .collect();
    // tiny↔tinier: 1 shared, coverage 1.0 — should be filtered out
    // by min_shared=3 (incidental, not a folder-level signal).
    // big-a↔big-b: 50 shared — should pass.
    let mut rows: Vec<DuplicateRow> = Vec::new();
    rows.push(dup_row_at(1, "tiny/x.jpg", "ht"));
    rows.push(dup_row_at(1, "tinier/x.jpg", "ht"));
    for i in 0..50 {
        let h = format!("h{}", i);
        rows.push(dup_row_at(1, &format!("big-a/{}.jpg", i), &h));
        rows.push(dup_row_at(1, &format!("big-b/{}.jpg", i), &h));
    }
    let pairs = bucket_folder_pairs(rows, &totals);
    let kept = filter_folder_pairs(pairs, 0.5, 3);
    assert_eq!(kept.len(), 1);
    assert_eq!(kept[0].shared_count, 50);
}
|
||||
|
||||
/// Sanity-check the BK-tree's metric, which is what the duplicates
|
||||
/// path actually clusters on.
|
||||
#[test]
|
||||
|
||||
@@ -1767,6 +1767,15 @@ mod tests {
|
||||
Ok(Vec::new())
|
||||
}
|
||||
|
||||
/// Test double: satisfies the trait by returning no rows, so handler
/// tests exercising other DAO methods compile against the new API.
fn list_image_paths(
    &mut self,
    _context: &opentelemetry::Context,
    _library_id: Option<i32>,
    _include_resolved: bool,
) -> Result<Vec<(i32, String)>, DbError> {
    Ok(Vec::new())
}
|
||||
|
||||
fn lookup_duplicate_row(
|
||||
&mut self,
|
||||
_context: &opentelemetry::Context,
|
||||
|
||||
Reference in New Issue
Block a user