From 6a6a4a6a46d5bcb97dacc43ab910b2183da587cd Mon Sep 17 00:00:00 2001
From: Cameron Cordes
Date: Thu, 30 Apr 2026 00:36:44 +0000
Subject: [PATCH] tags: batch lookup expands content-hash siblings
 cross-library
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The first cut matched by rel_path only — fine for single-library
deploys but wrong for multi-library setups where the same content
lives under different rel_paths (e.g. a backup mount holding copies of
the primary library). A tag applied under library A would silently not
appear in the library-B grid badge even though the carousel's per-path
/image/tags would resolve it correctly via siblings.

The batch handler now does the expansion server-side in three queries
regardless of input size:

1. image_exif batch lookup → query path → content_hash
2. image_exif JOIN by content_hash → all sibling rel_paths sharing
   each hash (paths are deduped across libraries)
3. tagged_photo + tags JOIN over the union of (query + sibling)
   rel_paths

Tags are then aggregated back to query paths via a sibling→originals
reverse map, deduped by tag id. Files without a content_hash (just
indexed, hash compute pending, etc.) skip step 2 and only get tags
from their own rel_path — same fallback the per-path handler uses.

Adds ExifDao::get_rel_paths_for_hashes (batch counterpart of
get_rel_paths_by_hash) chunked at 500 to stay under SQLite's
SQLITE_LIMIT_VARIABLE_NUMBER. Five queries for a 4k-photo grid is
still ~800x cheaper than per-path HTTP fan-out.
---
 src/database/mod.rs |  44 +++++++++++
 src/files.rs        |   8 ++
 src/tags.rs         | 176 ++++++++++++++++++++++++++++++++++++++------
 3 files changed, 207 insertions(+), 21 deletions(-)

diff --git a/src/database/mod.rs b/src/database/mod.rs
index 07406d6..a4e348a 100644
--- a/src/database/mod.rs
+++ b/src/database/mod.rs
@@ -386,6 +386,16 @@ pub trait ExifDao: Sync + Send {
         hash: &str,
     ) -> Result<Vec<String>, DbError>;
 
+    /// Batch version of [`get_rel_paths_by_hash`]. Returns a
+    /// `hash → Vec<String>` map for every hash that has at least one
+    /// rel_path. Used by the batch tag lookup endpoint to expand
+    /// content-hash siblings without firing a query per hash.
+    fn get_rel_paths_for_hashes(
+        &mut self,
+        context: &opentelemetry::Context,
+        hashes: &[String],
+    ) -> Result<std::collections::HashMap<String, Vec<String>>, DbError>;
+
     /// List `(library_id, rel_path)` pairs for the given libraries, optionally
     /// restricted to rows whose rel_path starts with `path_prefix`. When
     /// `library_ids` is empty, rows from every library are returned. Used by
@@ -956,6 +966,40 @@ impl ExifDao for SqliteExifDao {
         .map_err(|_| DbError::new(DbErrorKind::QueryError))
     }
 
+    fn get_rel_paths_for_hashes(
+        &mut self,
+        context: &opentelemetry::Context,
+        hashes: &[String],
+    ) -> Result<std::collections::HashMap<String, Vec<String>>, DbError> {
+        use std::collections::HashMap;
+        let mut out: HashMap<String, Vec<String>> = HashMap::new();
+        if hashes.is_empty() {
+            return Ok(out);
+        }
+        trace_db_call(context, "query", "get_rel_paths_for_hashes", |_span| {
+            use schema::image_exif::dsl::*;
+
+            let mut connection = self.connection.lock().expect("Unable to get ExifDao");
+
+            // Chunk the IN clause to stay safely under SQLite's
+            // SQLITE_LIMIT_VARIABLE_NUMBER (32766 modern, 999 legacy).
+            const CHUNK: usize = 500;
+            for chunk in hashes.chunks(CHUNK) {
+                let rows: Vec<(String, String)> = image_exif
+                    .filter(content_hash.eq_any(chunk))
+                    .select((content_hash.assume_not_null(), rel_path))
+                    .distinct()
+                    .load::<(String, String)>(connection.deref_mut())
+                    .map_err(|_| anyhow::anyhow!("Query error"))?;
+                for (hash, path) in rows {
+                    out.entry(hash).or_default().push(path);
+                }
+            }
+            Ok(out)
+        })
+        .map_err(|_| DbError::new(DbErrorKind::QueryError))
+    }
+
     fn list_rel_paths_for_libraries(
         &mut self,
         context: &opentelemetry::Context,
diff --git a/src/files.rs b/src/files.rs
index b4458c9..9ab1468 100644
--- a/src/files.rs
+++ b/src/files.rs
@@ -1659,6 +1659,14 @@ mod tests {
             Ok(vec![])
         }
 
+        fn get_rel_paths_for_hashes(
+            &mut self,
+            _context: &opentelemetry::Context,
+            _hashes: &[String],
+        ) -> Result<std::collections::HashMap<String, Vec<String>>, DbError> {
+            Ok(std::collections::HashMap::new())
+        }
+
         fn list_rel_paths_for_libraries(
             &mut self,
             _context: &opentelemetry::Context,
diff --git a/src/tags.rs b/src/tags.rs
index e40e13c..bdd5abd 100644
--- a/src/tags.rs
+++ b/src/tags.rs
@@ -244,44 +244,142 @@ pub struct LookupTagsBatchRequest {
     pub paths: Vec<String>,
 }
 
-/// Bulk per-path tag lookup. Apollo's photo-match flow used to fan out
-/// one ``GET /image/tags?path=`` per record (~4k for a wide window) —
+/// Bulk per-path tag lookup with cross-library content-hash sibling
+/// expansion. Apollo's photo-match flow used to fan out one
+/// ``GET /image/tags?path=`` per record (~4k for a wide window) —
 /// each call locked the dao briefly and the round-trip cost dwarfed
-/// the actual SQL. This collapses the whole fan-out into one POST and
-/// one (chunked) JOIN. Body: ``{paths: [...]}``; response:
-/// ``{path: [{id, name, ...}]}`` with **only paths that have at least
-/// one tag** in the map (the caller treats absence as empty list).
+/// the actual SQL. This collapses the whole fan-out into:
 ///
-/// Trade-off: this matches by ``rel_path`` directly and does NOT do
-/// the cross-library content-hash sibling expansion that the per-path
-/// ``GET /image/tags`` does. For Apollo's grid view the simpler match
-/// is fine — it's the common case for single-library deploys; the
-/// carousel still uses the per-path endpoint and resolves siblings on
-/// demand. If multi-library content-hash sharing becomes load-bearing
-/// for the grid, extend this to JOIN ``image_exif`` on content_hash.
+/// 1. one ``image_exif`` batch lookup → query path → content_hash
+/// 2. one ``image_exif`` JOIN by content_hash → all sibling rel_paths
+///    (so a tag applied under library A surfaces under library B
+///    when the content hashes match — important once a backup mount
+///    holds copies of files from the primary library)
+/// 3. one ``tagged_photo`` JOIN over the union of (query + sibling)
+///    rel_paths
+///
+/// Body: ``{paths: [...]}``; response: ``{path: [{id, name, ...}]}``
+/// with only paths that have at least one tag (caller treats absence
+/// as empty). Each chunk is capped to stay under SQLite's variable
+/// limit; five queries per 4k photos is still ~800x cheaper than
+/// per-path HTTP fan-out.
 async fn lookup_tags_batch(
     _: Claims,
     http_request: HttpRequest,
     body: web::Json<LookupTagsBatchRequest>,
     tag_dao: web::Data<Mutex<Box<dyn TagDao>>>,
+    exif_dao: web::Data<Mutex<Box<dyn ExifDao>>>,
 ) -> impl Responder {
+    use std::collections::{HashMap, HashSet};
     let context = extract_context_from_request(&http_request);
     let span = global_tracer().start_with_context("lookup_tags_batch", &context);
     let span_context = opentelemetry::Context::current_with_span(span);
 
     if body.paths.is_empty() {
-        return HttpResponse::Ok().json(std::collections::HashMap::<String, Vec<Tag>>::new());
+        return HttpResponse::Ok().json(HashMap::<String, Vec<Tag>>::new());
     }
 
-    let normalized: Vec<String> = body.paths.iter().map(|p| normalize_path(p)).collect();
-    let mut dao = tag_dao.lock().expect("Unable to get TagDao");
-    match dao.get_tags_grouped_by_paths(&span_context, &normalized) {
-        Ok(grouped) => {
-            span_context.span().set_status(Status::Ok);
-            HttpResponse::Ok().json(grouped)
+    let query_paths: Vec<String> = body.paths.iter().map(|p| normalize_path(p)).collect();
+
+    // Stage 1: query → content_hash mapping. Files without a hash yet
+    // (just-indexed, hash compute failed, etc.) skip the sibling
+    // expansion and only get tags from their own rel_path.
+    let exif_records = {
+        let mut dao = exif_dao.lock().expect("Unable to get ExifDao");
+        match dao.get_exif_batch(&span_context, &query_paths) {
+            Ok(rows) => rows,
+            Err(e) => {
+                return HttpResponse::InternalServerError()
+                    .body(format!("exif batch lookup failed: {:?}", e));
+            }
+        }
+    };
+    let mut hash_by_path: HashMap<String, String> = HashMap::with_capacity(exif_records.len());
+    for record in exif_records {
+        if let Some(h) = record.content_hash {
+            hash_by_path.insert(record.file_path, h);
         }
-        Err(e) => HttpResponse::InternalServerError().body(format!("{}", e)),
     }
+    let unique_hashes: Vec<String> = hash_by_path
+        .values()
+        .cloned()
+        .collect::<HashSet<_>>()
+        .into_iter()
+        .collect();
+
+    // Stage 2: hash → all sibling rel_paths.
+    let paths_by_hash = if unique_hashes.is_empty() {
+        HashMap::new()
+    } else {
+        let mut dao = exif_dao.lock().expect("Unable to get ExifDao");
+        match dao.get_rel_paths_for_hashes(&span_context, &unique_hashes) {
+            Ok(map) => map,
+            Err(e) => {
+                return HttpResponse::InternalServerError()
+                    .body(format!("hash sibling lookup failed: {:?}", e));
+            }
+        }
+    };
+
+    // Stage 3: build expanded path set and the reverse map
+    // sibling → [original query paths whose tag bucket should include
+    // the sibling's tags]. A query path always attributes to itself
+    // (covers the no-content-hash case).
+    let mut originals_by_sibling: HashMap<String, Vec<String>> = HashMap::new();
+    let mut all_paths: HashSet<String> = HashSet::new();
+    for query_path in &query_paths {
+        all_paths.insert(query_path.clone());
+        originals_by_sibling
+            .entry(query_path.clone())
+            .or_default()
+            .push(query_path.clone());
+        if let Some(hash) = hash_by_path.get(query_path)
+            && let Some(siblings) = paths_by_hash.get(hash)
+        {
+            for sibling in siblings {
+                if sibling == query_path {
+                    continue;
+                }
+                all_paths.insert(sibling.clone());
+                originals_by_sibling
+                    .entry(sibling.clone())
+                    .or_default()
+                    .push(query_path.clone());
+            }
+        }
+    }
+
+    // Stage 4: tags grouped by rel_path for the union.
+    let all_paths_vec: Vec<String> = all_paths.into_iter().collect();
+    let tags_by_sibling = {
+        let mut dao = tag_dao.lock().expect("Unable to get TagDao");
+        match dao.get_tags_grouped_by_paths(&span_context, &all_paths_vec) {
+            Ok(map) => map,
+            Err(e) => {
+                return HttpResponse::InternalServerError().body(format!("{}", e));
+            }
+        }
+    };
+
+    // Stage 5: aggregate sibling tags back to original query paths,
+    // de-duped by tag id. Empty buckets stay out of the response so
+    // the caller's "missing key = []" contract holds.
+    let mut result: HashMap<String, Vec<Tag>> = HashMap::new();
+    for (sibling_path, originals) in originals_by_sibling {
+        if let Some(tags) = tags_by_sibling.get(&sibling_path) {
+            for orig in originals {
+                let entry = result.entry(orig).or_default();
+                for t in tags {
+                    if !entry.iter().any(|e| e.id == t.id) {
+                        entry.push(t.clone());
+                    }
+                }
+            }
+        }
+    }
+
+    span_context.span().set_status(Status::Ok);
+    HttpResponse::Ok().json(result)
 }
 
 #[derive(Serialize, Queryable, Clone, Debug, PartialEq)]
@@ -1142,6 +1142,42 @@ mod tests {
         }
     }
 
+    #[actix_rt::test]
+    async fn get_tags_grouped_by_paths_returns_per_path_buckets() {
+        // Backstop for the batch tag-lookup endpoint: confirms the
+        // grouped variant returns one bucket per path with at least
+        // one tag, and omits paths with no tags entirely (the caller
+        // treats absence as []). The handler stacks sibling expansion
+        // on top via image_exif content_hash; the DAO method itself
+        // just needs to honour rel_path → tags directly.
+        let mut dao = TestTagDao::new();
+        let ctx = opentelemetry::Context::current();
+        // Seed: two paths tagged, one path untagged.
+        dao.tagged_photos.borrow_mut().insert(
+            "a.jpg".into(),
+            vec![Tag { id: 1, name: "alpha".into(), created_time: 0 }],
+        );
+        dao.tagged_photos.borrow_mut().insert(
+            "b.jpg".into(),
+            vec![
+                Tag { id: 2, name: "beta".into(), created_time: 0 },
+                Tag { id: 3, name: "gamma".into(), created_time: 0 },
+            ],
+        );
+        let grouped = dao
+            .get_tags_grouped_by_paths(
+                &ctx,
+                &["a.jpg".into(), "b.jpg".into(), "c.jpg".into()],
+            )
+            .unwrap();
+        assert_eq!(grouped.get("a.jpg").map(|v| v.len()), Some(1));
+        assert_eq!(grouped.get("b.jpg").map(|v| v.len()), Some(2));
+        assert!(
+            !grouped.contains_key("c.jpg"),
+            "untagged paths must be absent so caller's missing-key=[] contract holds"
+        );
+    }
+
     #[actix_rt::test]
     async fn add_new_tag_test() {
         let tag_dao = TestTagDao::new();