From 48cac8c285d333679dd6af2ad9c23b1c1e26e922 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Fri, 1 May 2026 14:52:16 +0000 Subject: [PATCH] multi-library: hash-keyed tagged_photo + photo_insights with reconciliation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Branch B of the multi-library data-model rollout. tagged_photo and photo_insights now follow the bytes (content_hash), not the path, matching the policy pinned in CLAUDE.md "Multi-library data model". Branch A's availability probe and EXIF scoping land first; this branch builds on top. Migration (2026-05-01-000000_hash_keyed_derived_data) Adds nullable content_hash columns to tagged_photo and photo_insights, with partial indexes on the non-null subset to keep the index small during the transitional window. The migration backfills from image_exif: * tagged_photo joins on rel_path alone (no library_id available); * photo_insights joins on (library_id, rel_path), unambiguous. Rows whose image_exif hash isn't known yet stay null and the runtime reconciliation pass populates them as the hash backlog drains. Insert-time population TagDao::tag_file looks up image_exif.content_hash by rel_path before inserting; the hash is written into the new column. InsightDao::store_insight does the same scoped to (library_id, rel_path). Caller-supplied hash on InsertPhotoInsight wins; otherwise the DAO does the lookup. Both paths fall back to None if the hash isn't known yet — reconciliation backfills. Reconciliation (database/reconcile.rs) Three idempotent passes the watcher runs once per tick after the per-library backfill loop: 1. tagged_photo NULL hashes → populate from image_exif by rel_path. 2. photo_insights NULL hashes → populate by (library_id, rel_path). 3. photo_insights scalar merge — when multiple is_current rows share a content_hash, keep the earliest generated_at as current; demote the rest. 
Demoted rows keep their data so /insights/history is unaffected; only the "current" pointer narrows to one per hash. No filesystem dependency, so reconcile doesn't need the availability gate; runs every tick. Logs once when something changed, debug otherwise. Tags are set-valued under the policy (union on read, already DISTINCT in queries), so there is no analogous tag-collapse pass — duplicate (tag_id, content_hash) rows across libraries are harmless. Read paths are unchanged in this branch — lookup_tags_batch's existing rel_path-via-hash-sibling expansion still produces the correct merge. A follow-up can simplify reads to use the new column directly for performance. Tests: 217 pass (212 pre-existing + 5 new in reconcile covering NULL-fill, hash-not-yet-known no-op, library scoping on insights, earliest-wins collapse, idempotency). Co-Authored-By: Claude Opus 4.7 (1M context) --- .../down.sql | 5 + .../up.sql | 64 +++ src/ai/insight_chat.rs | 2 + src/ai/insight_generator.rs | 6 +- src/database/insights_dao.rs | 19 +- src/database/mod.rs | 1 + src/database/models.rs | 8 + src/database/reconcile.rs | 385 ++++++++++++++++++ src/database/schema.rs | 2 + src/main.rs | 11 + src/tags.rs | 28 ++ 11 files changed, 529 insertions(+), 2 deletions(-) create mode 100644 migrations/2026-05-01-000000_hash_keyed_derived_data/down.sql create mode 100644 migrations/2026-05-01-000000_hash_keyed_derived_data/up.sql create mode 100644 src/database/reconcile.rs diff --git a/migrations/2026-05-01-000000_hash_keyed_derived_data/down.sql b/migrations/2026-05-01-000000_hash_keyed_derived_data/down.sql new file mode 100644 index 0000000..ed528bc --- /dev/null +++ b/migrations/2026-05-01-000000_hash_keyed_derived_data/down.sql @@ -0,0 +1,5 @@ +DROP INDEX IF EXISTS idx_photo_insights_content_hash; +ALTER TABLE photo_insights DROP COLUMN content_hash; + +DROP INDEX IF EXISTS idx_tagged_photo_content_hash; +ALTER TABLE tagged_photo DROP COLUMN content_hash; diff --git 
a/migrations/2026-05-01-000000_hash_keyed_derived_data/up.sql b/migrations/2026-05-01-000000_hash_keyed_derived_data/up.sql new file mode 100644 index 0000000..11bf718 --- /dev/null +++ b/migrations/2026-05-01-000000_hash_keyed_derived_data/up.sql @@ -0,0 +1,64 @@ +-- Phase B of the multi-library data-model rollout: add a nullable +-- `content_hash` column to derived/user-intent tables that should follow +-- the bytes rather than the path. Reads will prefer hash-key joins and +-- fall back to rel_path while the column is null. A separate +-- reconciliation pass collapses duplicates as the column populates. +-- +-- See CLAUDE.md → "Multi-library data model" for the policy. The +-- reference implementation is `face_detections`, which has been +-- hash-keyed since it was introduced. +-- +-- Tables in this migration: +-- * tagged_photo — user-intent (tags follow the bytes) +-- * photo_insights — intrinsic to bytes (LLM-generated description) +-- +-- favorites is the natural third candidate but its DAO is barely used in +-- v1 and the row count is tiny; deferring lets this migration stay +-- focused on the high-volume tables that drive cross-library overhead. + +-- --------------------------------------------------------------------------- +-- tagged_photo +-- --------------------------------------------------------------------------- +ALTER TABLE tagged_photo ADD COLUMN content_hash TEXT; + +-- Backfill: for each tagged_photo row, find the content_hash for its +-- rel_path. tagged_photo doesn't carry a library_id, so a rel_path that +-- exists under multiple libraries with different content is genuinely +-- ambiguous — we take the first matching image_exif row. The +-- reconciliation pass at runtime cleans up any rows that resolve +-- differently once a hash is known per library. 
+UPDATE tagged_photo +SET content_hash = ( + SELECT content_hash FROM image_exif + WHERE image_exif.rel_path = tagged_photo.rel_path + AND image_exif.content_hash IS NOT NULL + LIMIT 1 +) +WHERE content_hash IS NULL; + +-- Hash-key index. Partial (only non-null rows) to keep the index small +-- during the transitional window where most rows are still null. +CREATE INDEX idx_tagged_photo_content_hash + ON tagged_photo (content_hash) + WHERE content_hash IS NOT NULL; + +-- --------------------------------------------------------------------------- +-- photo_insights +-- --------------------------------------------------------------------------- +ALTER TABLE photo_insights ADD COLUMN content_hash TEXT; + +-- Backfill keyed on (library_id, rel_path) — photo_insights already +-- carries library_id, so the resolution is unambiguous. +UPDATE photo_insights +SET content_hash = ( + SELECT content_hash FROM image_exif + WHERE image_exif.library_id = photo_insights.library_id + AND image_exif.rel_path = photo_insights.rel_path + AND image_exif.content_hash IS NOT NULL + LIMIT 1 +) +WHERE content_hash IS NULL; + +CREATE INDEX idx_photo_insights_content_hash + ON photo_insights (content_hash) + WHERE content_hash IS NOT NULL; diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index 05de11c..792f124 100644 --- a/src/ai/insight_chat.rs +++ b/src/ai/insight_chat.rs @@ -521,6 +521,7 @@ impl InsightChatService { training_messages: Some(json), backend: effective_backend.clone(), fewshot_source_ids: None, + content_hash: None, }; let cx = opentelemetry::Context::new(); let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao"); @@ -983,6 +984,7 @@ impl InsightChatService { training_messages: Some(json), backend: effective_backend.clone(), fewshot_source_ids: None, + content_hash: None, }; let cx = opentelemetry::Context::new(); let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao"); diff --git a/src/ai/insight_generator.rs 
b/src/ai/insight_generator.rs index f31bfac..e03b1af 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -1255,7 +1255,9 @@ impl InsightGenerator { .span() .set_attribute(KeyValue::new("summary_length", summary.len() as i64)); - // 11. Store in database + // 11. Store in database. content_hash is None here — store_insight + // looks it up from image_exif before persisting; reconciliation + // backfills if the hash isn't known yet. let insight = InsertPhotoInsight { library_id: crate::libraries::PRIMARY_LIBRARY_ID, file_path: file_path.to_string(), @@ -1267,6 +1269,7 @@ impl InsightGenerator { training_messages: None, backend: "local".to_string(), fewshot_source_ids: None, + content_hash: None, }; let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao"); @@ -3530,6 +3533,7 @@ Return ONLY the summary, nothing else."#, training_messages, backend: backend_label.clone(), fewshot_source_ids: fewshot_source_ids_json, + content_hash: None, }; let stored = { diff --git a/src/database/insights_dao.rs b/src/database/insights_dao.rs index 2821b5b..28de5f4 100644 --- a/src/database/insights_dao.rs +++ b/src/database/insights_dao.rs @@ -111,13 +111,30 @@ impl InsightDao for SqliteInsightDao { fn store_insight( &mut self, context: &opentelemetry::Context, - insight: InsertPhotoInsight, + mut insight: InsertPhotoInsight, ) -> Result { trace_db_call(context, "insert", "store_insight", |_span| { use schema::photo_insights::dsl::*; let mut connection = self.connection.lock().expect("Unable to get InsightDao"); + // Eagerly populate content_hash so this insight follows the + // bytes (CLAUDE.md "Multi-library data model"). Caller- + // supplied hash wins; otherwise look it up from image_exif + // for the (library_id, rel_path) tuple. None is acceptable — + // reconciliation backfills it once the hash lands. 
+ if insight.content_hash.is_none() { + use schema::image_exif as ie; + insight.content_hash = ie::table + .filter(ie::library_id.eq(insight.library_id)) + .filter(ie::rel_path.eq(&insight.file_path)) + .filter(ie::content_hash.is_not_null()) + .select(ie::content_hash) + .first::<Option<String>>(connection.deref_mut()) + .ok() + .flatten(); + } + // Mark all existing insights for this file as no longer current diesel::update( photo_insights diff --git a/src/database/mod.rs b/src/database/mod.rs index 60b3b3d..444664f 100644 --- a/src/database/mod.rs +++ b/src/database/mod.rs @@ -16,6 +16,7 @@ pub mod knowledge_dao; pub mod location_dao; pub mod models; pub mod preview_dao; +pub mod reconcile; pub mod schema; pub mod search_dao; diff --git a/src/database/models.rs b/src/database/models.rs index 96d9c53..78edf00 100644 --- a/src/database/models.rs +++ b/src/database/models.rs @@ -108,6 +108,13 @@ pub struct InsertPhotoInsight { /// generation). Used downstream to filter out contaminated rows when /// assembling an unbiased training / evaluation set. pub fewshot_source_ids: Option<String>, + /// Bytes-keyed identity. When present, this insight is considered + /// to belong to the content rather than the path — see CLAUDE.md + /// "Multi-library data model". The DAO populates this from + /// `image_exif.content_hash` at insert time when known; rows + /// inserted before the hash is available stay null and the + /// reconciliation pass backfills them. + pub content_hash: Option<String>, } #[derive(Serialize, Queryable, Clone, Debug)] @@ -126,6 +133,7 @@ pub struct PhotoInsight { /// `"local"` (Ollama with images) | `"hybrid"` (local vision + OpenRouter chat). pub backend: String, pub fewshot_source_ids: Option<String>, + pub content_hash: Option<String>, } // --- Libraries --- diff --git a/src/database/reconcile.rs b/src/database/reconcile.rs new file mode 100644 index 0000000..baf31c3 --- /dev/null +++ b/src/database/reconcile.rs @@ -0,0 +1,385 @@ +//! Reconciliation pass for hash-keyed derived data. +//! +//!
As `backfill_unhashed_backlog` populates `image_exif.content_hash` +//! for legacy rows, we want the matching `tagged_photo` and +//! `photo_insights` rows — which were inserted before the hash was +//! known — to inherit the hash too. Otherwise reads keep falling back +//! to the rel_path path even when a hash is now available. +//! +//! Two passes: +//! 1. **Hash backfill** — for every `tagged_photo` / `photo_insights` +//! row with NULL `content_hash`, look up the matching +//! `image_exif.content_hash` and write it. SQL-only; idempotent; +//! a no-op once everything is hashed. +//! 2. **Insight scalar merge** — when multiple `photo_insights` rows +//! share a `content_hash` with `is_current = true`, only the +//! earliest `generated_at` keeps `is_current = true` (per the +//! "earliest wins" rule in CLAUDE.md → "Multi-library data +//! model"). Others are demoted, not deleted, so they remain +//! visible in history endpoints. +//! +//! Tags are set-valued under the policy (union on read), so there's no +//! analogous "collapse" pass — duplicate `(tag_id, content_hash)` rows +//! across libraries are harmless and correctly de-duped at read time +//! by the existing `DISTINCT` queries. +//! +//! The pass operates on the database alone — no filesystem access — +//! so it doesn't need the library availability gate. + +// The lib doesn't call into this module directly — the watcher (in the +// bin) does. Dead-code analysis at the lib level can't see that, so +// suppress at the module level. Tests still exercise every function. +#![allow(dead_code)] + +use diesel::prelude::*; +use diesel::sql_query; +use diesel::sqlite::SqliteConnection; +use log::{debug, info, warn}; + +/// Outcome of a reconciliation tick. Tracked so the watcher can log +/// progress when something changed and stay quiet when nothing did. 
+#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)] +pub struct ReconcileStats { + pub tagged_photo_hashes_filled: usize, + pub photo_insights_hashes_filled: usize, + pub photo_insights_demoted: usize, +} + +impl ReconcileStats { + pub fn changed(&self) -> bool { + self.tagged_photo_hashes_filled > 0 + || self.photo_insights_hashes_filled > 0 + || self.photo_insights_demoted > 0 + } +} + +/// Run the reconciliation pass. Idempotent — safe to call on every +/// watcher tick. Errors are logged but never propagated; reconciliation +/// is best-effort and a transient DB hiccup must not stall the watcher. +pub fn run(conn: &mut SqliteConnection) -> ReconcileStats { + let mut stats = ReconcileStats::default(); + + stats.tagged_photo_hashes_filled = match backfill_tagged_photo_hashes(conn) { + Ok(n) => n, + Err(e) => { + warn!("reconcile: tagged_photo hash backfill failed: {:?}", e); + 0 + } + }; + + stats.photo_insights_hashes_filled = match backfill_photo_insights_hashes(conn) { + Ok(n) => n, + Err(e) => { + warn!("reconcile: photo_insights hash backfill failed: {:?}", e); + 0 + } + }; + + stats.photo_insights_demoted = match collapse_insight_currents(conn) { + Ok(n) => n, + Err(e) => { + warn!("reconcile: photo_insights scalar merge failed: {:?}", e); + 0 + } + }; + + if stats.changed() { + info!( + "reconcile: filled {} tagged_photo hash(es), {} photo_insights hash(es); demoted {} non-current insight row(s)", + stats.tagged_photo_hashes_filled, + stats.photo_insights_hashes_filled, + stats.photo_insights_demoted, + ); + } else { + debug!("reconcile: no changes this tick"); + } + + stats +} + +/// Populate `tagged_photo.content_hash` for any row that still has +/// NULL by joining on `rel_path` against `image_exif`. tagged_photo +/// doesn't carry `library_id`, so a path that exists under multiple +/// libraries with different content is genuinely ambiguous; we pick +/// any non-null hash for that path. 
Same trade-off as the migration +/// backfill — see `migrations/2026-05-01-000000_hash_keyed_derived_data`. +fn backfill_tagged_photo_hashes(conn: &mut SqliteConnection) -> QueryResult<usize> { + sql_query( + "UPDATE tagged_photo \ + SET content_hash = ( \ + SELECT content_hash FROM image_exif \ + WHERE image_exif.rel_path = tagged_photo.rel_path \ + AND image_exif.content_hash IS NOT NULL \ + LIMIT 1 \ + ) \ + WHERE content_hash IS NULL \ + AND EXISTS ( \ + SELECT 1 FROM image_exif \ + WHERE image_exif.rel_path = tagged_photo.rel_path \ + AND image_exif.content_hash IS NOT NULL \ + )", + ) + .execute(conn) +} + +/// Populate `photo_insights.content_hash` from `image_exif`, keyed on +/// `(library_id, rel_path)`. Unambiguous because photo_insights carries +/// library_id. +fn backfill_photo_insights_hashes(conn: &mut SqliteConnection) -> QueryResult<usize> { + sql_query( + "UPDATE photo_insights \ + SET content_hash = ( \ + SELECT content_hash FROM image_exif \ + WHERE image_exif.library_id = photo_insights.library_id \ + AND image_exif.rel_path = photo_insights.rel_path \ + AND image_exif.content_hash IS NOT NULL \ + LIMIT 1 \ + ) \ + WHERE content_hash IS NULL \ + AND EXISTS ( \ + SELECT 1 FROM image_exif \ + WHERE image_exif.library_id = photo_insights.library_id \ + AND image_exif.rel_path = photo_insights.rel_path \ + AND image_exif.content_hash IS NOT NULL \ + )", + ) + .execute(conn) +} + +/// Scalar-merge step: when multiple rows share a `content_hash` and +/// claim `is_current = true`, demote all but the earliest by +/// `generated_at` (ties broken by lowest id, deterministic). +/// +/// Demoted rows keep their data — only `is_current` flips. Clients that +/// hit `/insights/history` still see the full sequence; only the +/// "current" pointer is unique per hash.
+fn collapse_insight_currents(conn: &mut SqliteConnection) -> QueryResult<usize> { + sql_query( + "UPDATE photo_insights \ + SET is_current = 0 \ + WHERE is_current = 1 \ + AND content_hash IS NOT NULL \ + AND id NOT IN ( \ + SELECT MIN(p2.id) FROM photo_insights p2 \ + WHERE p2.is_current = 1 \ + AND p2.content_hash = photo_insights.content_hash \ + AND p2.generated_at = ( \ + SELECT MIN(p3.generated_at) FROM photo_insights p3 \ + WHERE p3.is_current = 1 \ + AND p3.content_hash = p2.content_hash \ + ) \ + )", + ) + .execute(conn) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::database::test::in_memory_db_connection; + + fn ensure_library(conn: &mut SqliteConnection, library_id: i32) { + // Migration seeds library id=1; tests that reference id>1 must + // create those rows themselves, otherwise FK enforcement (added + // in the tags-edit migration) rejects image_exif inserts. + diesel::sql_query( + "INSERT OR IGNORE INTO libraries (id, name, root_path, created_at) \ + VALUES (?, 'test-' || ?, '/tmp/test-' || ?, 0)", + ) + .bind::<diesel::sql_types::Integer, _>(library_id) + .bind::<diesel::sql_types::Integer, _>(library_id) + .bind::<diesel::sql_types::Integer, _>(library_id) + .execute(conn) + .unwrap(); + } + + fn insert_image_exif( + conn: &mut SqliteConnection, + library_id: i32, + rel_path: &str, + content_hash: Option<&str>, + ) { + use crate::database::schema::image_exif; + ensure_library(conn, library_id); + diesel::sql_query( + "INSERT INTO image_exif (library_id, rel_path, created_time, last_modified, content_hash) \ + VALUES (?, ?, 0, 0, ?)", + ) + .bind::<diesel::sql_types::Integer, _>(library_id) + .bind::<diesel::sql_types::Text, _>(rel_path) + .bind::<diesel::sql_types::Nullable<diesel::sql_types::Text>, _>(content_hash) + .execute(conn) + .unwrap(); + // Keep clippy happy that the import is used.
+ let _ = image_exif::table; + } + + fn insert_tagged_photo(conn: &mut SqliteConnection, rel_path: &str, tag_id: i32) { + diesel::sql_query( + "INSERT INTO tagged_photo (rel_path, tag_id, created_time) VALUES (?, ?, 0)", + ) + .bind::<diesel::sql_types::Text, _>(rel_path) + .bind::<diesel::sql_types::Integer, _>(tag_id) + .execute(conn) + .unwrap(); + } + + fn insert_tag(conn: &mut SqliteConnection, id: i32, name: &str) { + diesel::sql_query("INSERT INTO tags (id, name, created_time) VALUES (?, ?, 0)") + .bind::<diesel::sql_types::Integer, _>(id) + .bind::<diesel::sql_types::Text, _>(name) + .execute(conn) + .unwrap(); + } + + fn insert_insight( + conn: &mut SqliteConnection, + library_id: i32, + rel_path: &str, + generated_at: i64, + is_current: bool, + ) -> i32 { + ensure_library(conn, library_id); + diesel::sql_query( + "INSERT INTO photo_insights (library_id, rel_path, title, summary, generated_at, model_version, is_current, backend) \ + VALUES (?, ?, 't', 's', ?, 'v', ?, 'local')", + ) + .bind::<diesel::sql_types::Integer, _>(library_id) + .bind::<diesel::sql_types::Text, _>(rel_path) + .bind::<diesel::sql_types::BigInt, _>(generated_at) + .bind::<diesel::sql_types::Bool, _>(is_current) + .execute(conn) + .unwrap(); + diesel::sql_query("SELECT last_insert_rowid() AS id") + .get_result::<TestId>(conn) + .map(|r| r.id) + .unwrap() + } + + #[derive(QueryableByName)] + struct TestId { + #[diesel(sql_type = diesel::sql_types::Integer)] + id: i32, + } + + #[derive(QueryableByName, Debug)] + struct HashOnly { + #[diesel(sql_type = diesel::sql_types::Nullable<diesel::sql_types::Text>)] + content_hash: Option<String>, + } + + #[derive(QueryableByName, Debug)] + struct CurrentRow { + #[diesel(sql_type = diesel::sql_types::Integer)] + id: i32, + #[diesel(sql_type = diesel::sql_types::Bool)] + is_current: bool, + } + + #[test] + fn backfill_fills_tagged_photo_hash_when_image_exif_has_one() { + let mut conn = in_memory_db_connection(); + insert_tag(&mut conn, 1, "vacation"); + insert_tagged_photo(&mut conn, "trip/IMG.jpg", 1); + // No image_exif row yet — backfill no-op. + let stats = run(&mut conn); + assert_eq!(stats.tagged_photo_hashes_filled, 0); + + // image_exif row appears with a hash; next reconcile fills it.
+ insert_image_exif(&mut conn, 1, "trip/IMG.jpg", Some("hashabc")); + let stats = run(&mut conn); + assert_eq!(stats.tagged_photo_hashes_filled, 1); + + let row = diesel::sql_query( + "SELECT content_hash FROM tagged_photo WHERE rel_path = 'trip/IMG.jpg'", + ) + .get_result::<HashOnly>(&mut conn) + .unwrap(); + assert_eq!(row.content_hash.as_deref(), Some("hashabc")); + + // Idempotent: a second run is a no-op. + let stats = run(&mut conn); + assert_eq!(stats.tagged_photo_hashes_filled, 0); + } + + #[test] + fn backfill_skips_tagged_photo_when_image_exif_has_no_hash() { + let mut conn = in_memory_db_connection(); + insert_tag(&mut conn, 1, "vacation"); + insert_tagged_photo(&mut conn, "trip/IMG.jpg", 1); + // image_exif exists but its hash is null. + insert_image_exif(&mut conn, 1, "trip/IMG.jpg", None); + + let stats = run(&mut conn); + assert_eq!(stats.tagged_photo_hashes_filled, 0); + } + + #[test] + fn backfill_fills_photo_insights_hash_scoped_by_library() { + let mut conn = in_memory_db_connection(); + // Row in library 1 only — must not be filled by a hash from + // library 2's same-rel_path entry. + insert_image_exif(&mut conn, 1, "shared.jpg", Some("hash-lib1")); + let id1 = insert_insight(&mut conn, 1, "shared.jpg", 100, true); + + let stats = run(&mut conn); + assert_eq!(stats.photo_insights_hashes_filled, 1); + + let row = diesel::sql_query( + "SELECT content_hash FROM photo_insights WHERE id = ?", + ) + .bind::<diesel::sql_types::Integer, _>(id1) + .get_result::<HashOnly>(&mut conn) + .unwrap(); + assert_eq!(row.content_hash.as_deref(), Some("hash-lib1")); + } + + #[test] + fn collapse_keeps_earliest_is_current_per_hash() { + let mut conn = in_memory_db_connection(); + // Two libraries, same content_hash via image_exif. Insights + // were generated independently in each library, both currently + // is_current = true. The earlier one wins.
+ insert_image_exif(&mut conn, 1, "a.jpg", Some("h1")); + insert_image_exif(&mut conn, 2, "a.jpg", Some("h1")); + let earlier = insert_insight(&mut conn, 1, "a.jpg", 100, true); + let later = insert_insight(&mut conn, 2, "a.jpg", 200, true); + + // First pass fills the content_hash; second collapses. + let stats = run(&mut conn); + assert_eq!(stats.photo_insights_hashes_filled, 2); + assert_eq!(stats.photo_insights_demoted, 1); + + let rows = diesel::sql_query( + "SELECT id, is_current FROM photo_insights ORDER BY id", + ) + .get_results::<CurrentRow>(&mut conn) + .unwrap(); + let earlier_row = rows.iter().find(|r| r.id == earlier).unwrap(); + let later_row = rows.iter().find(|r| r.id == later).unwrap(); + assert!(earlier_row.is_current, "earlier insight should remain current"); + assert!(!later_row.is_current, "later insight should be demoted"); + + // Idempotent. + let stats = run(&mut conn); + assert_eq!(stats.photo_insights_demoted, 0); + } + + #[test] + fn collapse_does_not_demote_a_solo_current_row() { + let mut conn = in_memory_db_connection(); + insert_image_exif(&mut conn, 1, "a.jpg", Some("h1")); + let solo = insert_insight(&mut conn, 1, "a.jpg", 100, true); + + let stats = run(&mut conn); + assert_eq!(stats.photo_insights_demoted, 0); + + let row = diesel::sql_query( + "SELECT id, is_current FROM photo_insights WHERE id = ?", + ) + .bind::<diesel::sql_types::Integer, _>(solo) + .get_result::<CurrentRow>(&mut conn) + .unwrap(); + assert!(row.is_current); + } +} diff --git a/src/database/schema.rs b/src/database/schema.rs index 55ad9e5..5e933ca 100644 --- a/src/database/schema.rs +++ b/src/database/schema.rs @@ -178,6 +178,7 @@ diesel::table! { approved -> Nullable, backend -> Text, fewshot_source_ids -> Nullable<Text>, + content_hash -> Nullable<Text>, } } @@ -199,6 +200,7 @@ diesel::table!
{ rel_path -> Text, tag_id -> Integer, created_time -> BigInt, + content_hash -> Nullable<Text>, } } diff --git a/src/main.rs b/src/main.rs index 93df054..f9f9dbe 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2063,6 +2063,17 @@ fn watch_files( update_media_counts(Path::new(&lib.root_path), &excluded_dirs); } + // Reconciliation: cross-library, so it runs once per tick + // outside the per-library loop. Idempotent — fast no-op when + // there's nothing to do. Operates on the database alone, no + // filesystem dependency, so it doesn't need a health gate. + // See database::reconcile and CLAUDE.md "Multi-library data + // model" for the rules. + { + let mut conn = image_api::database::connect(); + let _ = image_api::database::reconcile::run(&mut conn); + } + if is_full_scan { last_full_scan = now; } diff --git a/src/tags.rs b/src/tags.rs index 4064b4c..f3e0135 100644 --- a/src/tags.rs +++ b/src/tags.rs @@ -507,6 +507,11 @@ pub struct InsertTaggedPhoto { #[diesel(column_name = rel_path)] pub photo_name: String, pub created_time: i64, + /// Hash-keyed identity. The DAO populates this from + /// `image_exif.content_hash` at insert time when known; the + /// reconciliation pass backfills rows inserted before the hash + /// landed. See CLAUDE.md "Multi-library data model". + pub content_hash: Option<String>, } #[derive(Queryable, Clone, Debug)] @@ -520,6 +525,8 @@ pub struct TaggedPhoto { pub tag_id: i32, #[allow(dead_code)] // Part of API contract pub created_time: i64, + #[allow(dead_code)] + pub content_hash: Option<String>, } #[derive(Debug, Deserialize)] @@ -958,6 +965,31 @@ impl TagDao for SqliteTagDao { KeyValue::new("tag_id", tag_id.to_string()), ]); + + // Eagerly populate content_hash so this tag follows the bytes, + // not the path (see CLAUDE.md "Multi-library data model"). + // None is fine — the reconciliation pass will backfill once + // image_exif has a hash for this file.
We deliberately don't + // require library_id here: the tag handler is library- + // agnostic by design, and any matching image_exif row's hash + // is acceptable. If the path resolves to different bytes in + // different libraries, reconciliation per-library refines. + let content_hash: Option<String> = { + use crate::database::schema::image_exif as ie; + ie::table + .filter(ie::rel_path.eq(path)) + .filter(ie::content_hash.is_not_null()) + .select(ie::content_hash) + .first::<Option<String>>(conn.deref_mut()) + .ok() + .flatten() + }; + diesel::insert_into(tagged_photo::table) .values(InsertTaggedPhoto { tag_id, photo_name: path.to_string(), created_time: Utc::now().timestamp(), + content_hash, }) .execute(conn.deref_mut()) .with_context(|| format!("Unable to tag file {:?} in sqlite", path)) @@ -1367,6 +1394,7 @@ mod tests { tag_id: tag.id, created_time: Utc::now().timestamp(), photo_name: path.to_string(), + content_hash: None, }; if self.tagged_photos.borrow().contains_key(path) {