multi-library: hash-keyed tagged_photo + photo_insights with reconciliation
Branch B of the multi-library data-model rollout. tagged_photo and
photo_insights now follow the bytes (content_hash), not the path,
matching the policy pinned in CLAUDE.md "Multi-library data model".
Branch A's availability probe and EXIF scoping land first; this
branch builds on top.
Migration (2026-05-01-000000_hash_keyed_derived_data)
Adds nullable content_hash columns to tagged_photo and photo_insights,
with partial indexes on the non-null subset to keep the index small
during the transitional window. The migration backfills from
image_exif:
* tagged_photo joins on rel_path alone (no library_id available);
* photo_insights joins on (library_id, rel_path), unambiguous.
Rows whose image_exif hash isn't known yet stay null and the runtime
reconciliation pass populates them as the hash backlog drains.
Insert-time population
TagDao::tag_file looks up image_exif.content_hash by rel_path before
inserting; the hash is written into the new column.
InsightDao::store_insight does the same scoped to (library_id,
rel_path). Caller-supplied hash on InsertPhotoInsight wins; otherwise
the DAO does the lookup. Both paths fall back to None if the hash
isn't known yet — reconciliation backfills.
Reconciliation (database/reconcile.rs)
Three idempotent passes the watcher runs once per tick after the
per-library backfill loop:
1. tagged_photo NULL hashes → populate from image_exif by rel_path.
2. photo_insights NULL hashes → populate by (library_id, rel_path).
3. photo_insights scalar merge — when multiple is_current rows
share a content_hash, keep the earliest generated_at as
current; demote the rest. Demoted rows keep their data so
/insights/history is unaffected; only the "current" pointer
narrows to one per hash.
No filesystem dependency, so reconcile doesn't need the availability
gate; runs every tick. Logs once when something changed, debug
otherwise.
Tags are set-valued under the policy (union on read, already
DISTINCT in queries), so there is no analogous tag-collapse pass —
duplicate (tag_id, content_hash) rows across libraries are
harmless.
Read paths are unchanged in this branch — lookup_tags_batch's
existing rel_path-via-hash-sibling expansion still produces the
correct merge. A follow-up can simplify reads to use the new column
directly for performance.
Tests: 217 pass (212 pre-existing + 5 new in reconcile covering
NULL-fill, hash-not-yet-known no-op, library scoping on insights,
earliest-wins collapse, idempotency).
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,5 @@
|
|||||||
|
-- Revert the hash-keyed derived-data migration. Drop each partial index
-- before the column it covers, mirroring up.sql in reverse order.
-- NOTE(review): ALTER TABLE ... DROP COLUMN requires SQLite 3.35+ —
-- presumably the project already depends on a recent SQLite; confirm
-- before relying on this down migration against older deployments.
DROP INDEX IF EXISTS idx_photo_insights_content_hash;

ALTER TABLE photo_insights DROP COLUMN content_hash;

DROP INDEX IF EXISTS idx_tagged_photo_content_hash;

ALTER TABLE tagged_photo DROP COLUMN content_hash;
|
||||||
64
migrations/2026-05-01-000000_hash_keyed_derived_data/up.sql
Normal file
64
migrations/2026-05-01-000000_hash_keyed_derived_data/up.sql
Normal file
@@ -0,0 +1,64 @@
|
|||||||
|
-- Phase B of the multi-library data-model rollout: add a nullable
-- `content_hash` column to derived/user-intent tables that should follow
-- the bytes rather than the path. Reads will prefer hash-key joins and
-- fall back to rel_path while the column is null. A separate
-- reconciliation pass collapses duplicates as the column populates.
--
-- See CLAUDE.md → "Multi-library data model" for the policy. The
-- reference implementation is `face_detections`, which has been
-- hash-keyed since it was introduced.
--
-- Tables in this migration:
--   * tagged_photo   — user-intent (tags follow the bytes)
--   * photo_insights — intrinsic to bytes (LLM-generated description)
--
-- favorites is the natural third candidate but its DAO is barely used in
-- v1 and the row count is tiny; deferring lets this migration stay
-- focused on the high-volume tables that drive cross-library overhead.

-- ---------------------------------------------------------------------------
-- tagged_photo
-- ---------------------------------------------------------------------------
ALTER TABLE tagged_photo ADD COLUMN content_hash TEXT;

-- Backfill: for each tagged_photo row, find the content_hash for its
-- rel_path. tagged_photo doesn't carry a library_id, so a rel_path that
-- exists under multiple libraries with different content is genuinely
-- ambiguous — we take one matching image_exif row. The ORDER BY pins
-- which one: without it, a scalar subquery with LIMIT 1 returns an
-- unspecified row, so repeated migrations on identical data could
-- diverge. The runtime reconciliation pass cleans up any rows that
-- resolve differently once a hash is known per library.
UPDATE tagged_photo
SET content_hash = (
    SELECT content_hash FROM image_exif
    WHERE image_exif.rel_path = tagged_photo.rel_path
      AND image_exif.content_hash IS NOT NULL
    ORDER BY image_exif.library_id, image_exif.content_hash
    LIMIT 1
)
WHERE content_hash IS NULL;

-- Hash-key index. Partial (only non-null rows) to keep the index small
-- during the transitional window where most rows are still null.
CREATE INDEX idx_tagged_photo_content_hash
ON tagged_photo (content_hash)
WHERE content_hash IS NOT NULL;

-- ---------------------------------------------------------------------------
-- photo_insights
-- ---------------------------------------------------------------------------
ALTER TABLE photo_insights ADD COLUMN content_hash TEXT;

-- Backfill keyed on (library_id, rel_path) — photo_insights already
-- carries library_id, so the resolution is unambiguous. The ORDER BY is
-- belt-and-braces determinism in case duplicate (library_id, rel_path)
-- image_exif rows ever exist.
UPDATE photo_insights
SET content_hash = (
    SELECT content_hash FROM image_exif
    WHERE image_exif.library_id = photo_insights.library_id
      AND image_exif.rel_path = photo_insights.rel_path
      AND image_exif.content_hash IS NOT NULL
    ORDER BY image_exif.content_hash
    LIMIT 1
)
WHERE content_hash IS NULL;

-- Partial hash-key index, matching tagged_photo's above.
CREATE INDEX idx_photo_insights_content_hash
ON photo_insights (content_hash)
WHERE content_hash IS NOT NULL;
|
||||||
@@ -521,6 +521,7 @@ impl InsightChatService {
|
|||||||
training_messages: Some(json),
|
training_messages: Some(json),
|
||||||
backend: effective_backend.clone(),
|
backend: effective_backend.clone(),
|
||||||
fewshot_source_ids: None,
|
fewshot_source_ids: None,
|
||||||
|
content_hash: None,
|
||||||
};
|
};
|
||||||
let cx = opentelemetry::Context::new();
|
let cx = opentelemetry::Context::new();
|
||||||
let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao");
|
let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao");
|
||||||
@@ -983,6 +984,7 @@ impl InsightChatService {
|
|||||||
training_messages: Some(json),
|
training_messages: Some(json),
|
||||||
backend: effective_backend.clone(),
|
backend: effective_backend.clone(),
|
||||||
fewshot_source_ids: None,
|
fewshot_source_ids: None,
|
||||||
|
content_hash: None,
|
||||||
};
|
};
|
||||||
let cx = opentelemetry::Context::new();
|
let cx = opentelemetry::Context::new();
|
||||||
let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao");
|
let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao");
|
||||||
|
|||||||
@@ -1255,7 +1255,9 @@ impl InsightGenerator {
|
|||||||
.span()
|
.span()
|
||||||
.set_attribute(KeyValue::new("summary_length", summary.len() as i64));
|
.set_attribute(KeyValue::new("summary_length", summary.len() as i64));
|
||||||
|
|
||||||
// 11. Store in database
|
// 11. Store in database. content_hash is None here — store_insight
|
||||||
|
// looks it up from image_exif before persisting; reconciliation
|
||||||
|
// backfills if the hash isn't known yet.
|
||||||
let insight = InsertPhotoInsight {
|
let insight = InsertPhotoInsight {
|
||||||
library_id: crate::libraries::PRIMARY_LIBRARY_ID,
|
library_id: crate::libraries::PRIMARY_LIBRARY_ID,
|
||||||
file_path: file_path.to_string(),
|
file_path: file_path.to_string(),
|
||||||
@@ -1267,6 +1269,7 @@ impl InsightGenerator {
|
|||||||
training_messages: None,
|
training_messages: None,
|
||||||
backend: "local".to_string(),
|
backend: "local".to_string(),
|
||||||
fewshot_source_ids: None,
|
fewshot_source_ids: None,
|
||||||
|
content_hash: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao");
|
let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao");
|
||||||
@@ -3530,6 +3533,7 @@ Return ONLY the summary, nothing else."#,
|
|||||||
training_messages,
|
training_messages,
|
||||||
backend: backend_label.clone(),
|
backend: backend_label.clone(),
|
||||||
fewshot_source_ids: fewshot_source_ids_json,
|
fewshot_source_ids: fewshot_source_ids_json,
|
||||||
|
content_hash: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let stored = {
|
let stored = {
|
||||||
|
|||||||
@@ -111,13 +111,30 @@ impl InsightDao for SqliteInsightDao {
|
|||||||
fn store_insight(
|
fn store_insight(
|
||||||
&mut self,
|
&mut self,
|
||||||
context: &opentelemetry::Context,
|
context: &opentelemetry::Context,
|
||||||
insight: InsertPhotoInsight,
|
mut insight: InsertPhotoInsight,
|
||||||
) -> Result<PhotoInsight, DbError> {
|
) -> Result<PhotoInsight, DbError> {
|
||||||
trace_db_call(context, "insert", "store_insight", |_span| {
|
trace_db_call(context, "insert", "store_insight", |_span| {
|
||||||
use schema::photo_insights::dsl::*;
|
use schema::photo_insights::dsl::*;
|
||||||
|
|
||||||
let mut connection = self.connection.lock().expect("Unable to get InsightDao");
|
let mut connection = self.connection.lock().expect("Unable to get InsightDao");
|
||||||
|
|
||||||
|
// Eagerly populate content_hash so this insight follows the
|
||||||
|
// bytes (CLAUDE.md "Multi-library data model"). Caller-
|
||||||
|
// supplied hash wins; otherwise look it up from image_exif
|
||||||
|
// for the (library_id, rel_path) tuple. None is acceptable —
|
||||||
|
// reconciliation backfills it once the hash lands.
|
||||||
|
if insight.content_hash.is_none() {
|
||||||
|
use schema::image_exif as ie;
|
||||||
|
insight.content_hash = ie::table
|
||||||
|
.filter(ie::library_id.eq(insight.library_id))
|
||||||
|
.filter(ie::rel_path.eq(&insight.file_path))
|
||||||
|
.filter(ie::content_hash.is_not_null())
|
||||||
|
.select(ie::content_hash)
|
||||||
|
.first::<Option<String>>(connection.deref_mut())
|
||||||
|
.ok()
|
||||||
|
.flatten();
|
||||||
|
}
|
||||||
|
|
||||||
// Mark all existing insights for this file as no longer current
|
// Mark all existing insights for this file as no longer current
|
||||||
diesel::update(
|
diesel::update(
|
||||||
photo_insights
|
photo_insights
|
||||||
|
|||||||
@@ -16,6 +16,7 @@ pub mod knowledge_dao;
|
|||||||
pub mod location_dao;
|
pub mod location_dao;
|
||||||
pub mod models;
|
pub mod models;
|
||||||
pub mod preview_dao;
|
pub mod preview_dao;
|
||||||
|
pub mod reconcile;
|
||||||
pub mod schema;
|
pub mod schema;
|
||||||
pub mod search_dao;
|
pub mod search_dao;
|
||||||
|
|
||||||
|
|||||||
@@ -108,6 +108,13 @@ pub struct InsertPhotoInsight {
|
|||||||
/// generation). Used downstream to filter out contaminated rows when
|
/// generation). Used downstream to filter out contaminated rows when
|
||||||
/// assembling an unbiased training / evaluation set.
|
/// assembling an unbiased training / evaluation set.
|
||||||
pub fewshot_source_ids: Option<String>,
|
pub fewshot_source_ids: Option<String>,
|
||||||
|
/// Bytes-keyed identity. When present, this insight is considered
|
||||||
|
/// to belong to the content rather than the path — see CLAUDE.md
|
||||||
|
/// "Multi-library data model". The DAO populates this from
|
||||||
|
/// `image_exif.content_hash` at insert time when known; rows
|
||||||
|
/// inserted before the hash is available stay null and the
|
||||||
|
/// reconciliation pass backfills them.
|
||||||
|
pub content_hash: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize, Queryable, Clone, Debug)]
|
#[derive(Serialize, Queryable, Clone, Debug)]
|
||||||
@@ -126,6 +133,7 @@ pub struct PhotoInsight {
|
|||||||
/// `"local"` (Ollama with images) | `"hybrid"` (local vision + OpenRouter chat).
|
/// `"local"` (Ollama with images) | `"hybrid"` (local vision + OpenRouter chat).
|
||||||
pub backend: String,
|
pub backend: String,
|
||||||
pub fewshot_source_ids: Option<String>,
|
pub fewshot_source_ids: Option<String>,
|
||||||
|
pub content_hash: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
// --- Libraries ---
|
// --- Libraries ---
|
||||||
|
|||||||
385
src/database/reconcile.rs
Normal file
385
src/database/reconcile.rs
Normal file
@@ -0,0 +1,385 @@
|
|||||||
|
//! Reconciliation pass for hash-keyed derived data.
|
||||||
|
//!
|
||||||
|
//! As `backfill_unhashed_backlog` populates `image_exif.content_hash`
|
||||||
|
//! for legacy rows, we want the matching `tagged_photo` and
|
||||||
|
//! `photo_insights` rows — which were inserted before the hash was
|
||||||
|
//! known — to inherit the hash too. Otherwise reads keep falling back
|
||||||
|
//! to the rel_path path even when a hash is now available.
|
||||||
|
//!
|
||||||
|
//! Two passes:
|
||||||
|
//! 1. **Hash backfill** — for every `tagged_photo` / `photo_insights`
|
||||||
|
//! row with NULL `content_hash`, look up the matching
|
||||||
|
//! `image_exif.content_hash` and write it. SQL-only; idempotent;
|
||||||
|
//! a no-op once everything is hashed.
|
||||||
|
//! 2. **Insight scalar merge** — when multiple `photo_insights` rows
|
||||||
|
//! share a `content_hash` with `is_current = true`, only the
|
||||||
|
//! earliest `generated_at` keeps `is_current = true` (per the
|
||||||
|
//! "earliest wins" rule in CLAUDE.md → "Multi-library data
|
||||||
|
//! model"). Others are demoted, not deleted, so they remain
|
||||||
|
//! visible in history endpoints.
|
||||||
|
//!
|
||||||
|
//! Tags are set-valued under the policy (union on read), so there's no
|
||||||
|
//! analogous "collapse" pass — duplicate `(tag_id, content_hash)` rows
|
||||||
|
//! across libraries are harmless and correctly de-duped at read time
|
||||||
|
//! by the existing `DISTINCT` queries.
|
||||||
|
//!
|
||||||
|
//! The pass operates on the database alone — no filesystem access —
|
||||||
|
//! so it doesn't need the library availability gate.
|
||||||
|
|
||||||
|
// The lib doesn't call into this module directly — the watcher (in the
|
||||||
|
// bin) does. Dead-code analysis at the lib level can't see that, so
|
||||||
|
// suppress at the module level. Tests still exercise every function.
|
||||||
|
#![allow(dead_code)]
|
||||||
|
|
||||||
|
use diesel::prelude::*;
|
||||||
|
use diesel::sql_query;
|
||||||
|
use diesel::sqlite::SqliteConnection;
|
||||||
|
use log::{debug, info, warn};
|
||||||
|
|
||||||
|
/// Counters for one reconciliation tick. The watcher inspects these to
/// decide whether the tick deserves an info-level log line or can stay
/// quiet at debug.
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub struct ReconcileStats {
    pub tagged_photo_hashes_filled: usize,
    pub photo_insights_hashes_filled: usize,
    pub photo_insights_demoted: usize,
}

impl ReconcileStats {
    /// True when any reconciliation pass touched at least one row.
    pub fn changed(&self) -> bool {
        let counters = [
            self.tagged_photo_hashes_filled,
            self.photo_insights_hashes_filled,
            self.photo_insights_demoted,
        ];
        counters.iter().any(|&n| n > 0)
    }
}
|
||||||
|
|
||||||
|
/// Run the reconciliation pass. Idempotent — safe to call on every
|
||||||
|
/// watcher tick. Errors are logged but never propagated; reconciliation
|
||||||
|
/// is best-effort and a transient DB hiccup must not stall the watcher.
|
||||||
|
pub fn run(conn: &mut SqliteConnection) -> ReconcileStats {
|
||||||
|
let mut stats = ReconcileStats::default();
|
||||||
|
|
||||||
|
stats.tagged_photo_hashes_filled = match backfill_tagged_photo_hashes(conn) {
|
||||||
|
Ok(n) => n,
|
||||||
|
Err(e) => {
|
||||||
|
warn!("reconcile: tagged_photo hash backfill failed: {:?}", e);
|
||||||
|
0
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
stats.photo_insights_hashes_filled = match backfill_photo_insights_hashes(conn) {
|
||||||
|
Ok(n) => n,
|
||||||
|
Err(e) => {
|
||||||
|
warn!("reconcile: photo_insights hash backfill failed: {:?}", e);
|
||||||
|
0
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
stats.photo_insights_demoted = match collapse_insight_currents(conn) {
|
||||||
|
Ok(n) => n,
|
||||||
|
Err(e) => {
|
||||||
|
warn!("reconcile: photo_insights scalar merge failed: {:?}", e);
|
||||||
|
0
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
if stats.changed() {
|
||||||
|
info!(
|
||||||
|
"reconcile: filled {} tagged_photo hash(es), {} photo_insights hash(es); demoted {} non-current insight row(s)",
|
||||||
|
stats.tagged_photo_hashes_filled,
|
||||||
|
stats.photo_insights_hashes_filled,
|
||||||
|
stats.photo_insights_demoted,
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
debug!("reconcile: no changes this tick");
|
||||||
|
}
|
||||||
|
|
||||||
|
stats
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Populate `tagged_photo.content_hash` for any row that still has
|
||||||
|
/// NULL by joining on `rel_path` against `image_exif`. tagged_photo
|
||||||
|
/// doesn't carry `library_id`, so a path that exists under multiple
|
||||||
|
/// libraries with different content is genuinely ambiguous; we pick
|
||||||
|
/// any non-null hash for that path. Same trade-off as the migration
|
||||||
|
/// backfill — see `migrations/2026-05-01-000000_hash_keyed_derived_data`.
|
||||||
|
fn backfill_tagged_photo_hashes(conn: &mut SqliteConnection) -> QueryResult<usize> {
|
||||||
|
sql_query(
|
||||||
|
"UPDATE tagged_photo \
|
||||||
|
SET content_hash = ( \
|
||||||
|
SELECT content_hash FROM image_exif \
|
||||||
|
WHERE image_exif.rel_path = tagged_photo.rel_path \
|
||||||
|
AND image_exif.content_hash IS NOT NULL \
|
||||||
|
LIMIT 1 \
|
||||||
|
) \
|
||||||
|
WHERE content_hash IS NULL \
|
||||||
|
AND EXISTS ( \
|
||||||
|
SELECT 1 FROM image_exif \
|
||||||
|
WHERE image_exif.rel_path = tagged_photo.rel_path \
|
||||||
|
AND image_exif.content_hash IS NOT NULL \
|
||||||
|
)",
|
||||||
|
)
|
||||||
|
.execute(conn)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Populate `photo_insights.content_hash` from `image_exif`, keyed on
|
||||||
|
/// `(library_id, rel_path)`. Unambiguous because photo_insights carries
|
||||||
|
/// library_id.
|
||||||
|
fn backfill_photo_insights_hashes(conn: &mut SqliteConnection) -> QueryResult<usize> {
|
||||||
|
sql_query(
|
||||||
|
"UPDATE photo_insights \
|
||||||
|
SET content_hash = ( \
|
||||||
|
SELECT content_hash FROM image_exif \
|
||||||
|
WHERE image_exif.library_id = photo_insights.library_id \
|
||||||
|
AND image_exif.rel_path = photo_insights.rel_path \
|
||||||
|
AND image_exif.content_hash IS NOT NULL \
|
||||||
|
LIMIT 1 \
|
||||||
|
) \
|
||||||
|
WHERE content_hash IS NULL \
|
||||||
|
AND EXISTS ( \
|
||||||
|
SELECT 1 FROM image_exif \
|
||||||
|
WHERE image_exif.library_id = photo_insights.library_id \
|
||||||
|
AND image_exif.rel_path = photo_insights.rel_path \
|
||||||
|
AND image_exif.content_hash IS NOT NULL \
|
||||||
|
)",
|
||||||
|
)
|
||||||
|
.execute(conn)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Scalar-merge step: when multiple rows share a `content_hash` and
|
||||||
|
/// claim `is_current = true`, demote all but the earliest by
|
||||||
|
/// `generated_at` (ties broken by lowest id, deterministic).
|
||||||
|
///
|
||||||
|
/// Demoted rows keep their data — only `is_current` flips. Clients that
|
||||||
|
/// hit `/insights/history` still see the full sequence; only the
|
||||||
|
/// "current" pointer is unique per hash.
|
||||||
|
fn collapse_insight_currents(conn: &mut SqliteConnection) -> QueryResult<usize> {
|
||||||
|
sql_query(
|
||||||
|
"UPDATE photo_insights \
|
||||||
|
SET is_current = 0 \
|
||||||
|
WHERE is_current = 1 \
|
||||||
|
AND content_hash IS NOT NULL \
|
||||||
|
AND id NOT IN ( \
|
||||||
|
SELECT MIN(p2.id) FROM photo_insights p2 \
|
||||||
|
WHERE p2.is_current = 1 \
|
||||||
|
AND p2.content_hash = photo_insights.content_hash \
|
||||||
|
AND p2.generated_at = ( \
|
||||||
|
SELECT MIN(p3.generated_at) FROM photo_insights p3 \
|
||||||
|
WHERE p3.is_current = 1 \
|
||||||
|
AND p3.content_hash = p2.content_hash \
|
||||||
|
) \
|
||||||
|
)",
|
||||||
|
)
|
||||||
|
.execute(conn)
|
||||||
|
}
|
||||||
|
|
||||||
|
#[cfg(test)]
mod tests {
    use super::*;
    use crate::database::test::in_memory_db_connection;

    // Insert a libraries row so FK checks pass for image_exif inserts.
    fn ensure_library(conn: &mut SqliteConnection, library_id: i32) {
        // Migration seeds library id=1; tests that reference id>1 must
        // create those rows themselves, otherwise FK enforcement (added
        // in the tags-edit migration) rejects image_exif inserts.
        diesel::sql_query(
            "INSERT OR IGNORE INTO libraries (id, name, root_path, created_at) \
             VALUES (?, 'test-' || ?, '/tmp/test-' || ?, 0)",
        )
        .bind::<diesel::sql_types::Integer, _>(library_id)
        .bind::<diesel::sql_types::Integer, _>(library_id)
        .bind::<diesel::sql_types::Integer, _>(library_id)
        .execute(conn)
        .unwrap();
    }

    // Insert a minimal image_exif fixture row; content_hash may be None
    // to model a file whose hash backlog hasn't drained yet.
    fn insert_image_exif(
        conn: &mut SqliteConnection,
        library_id: i32,
        rel_path: &str,
        content_hash: Option<&str>,
    ) {
        use crate::database::schema::image_exif;
        ensure_library(conn, library_id);
        diesel::sql_query(
            "INSERT INTO image_exif (library_id, rel_path, created_time, last_modified, content_hash) \
             VALUES (?, ?, 0, 0, ?)",
        )
        .bind::<diesel::sql_types::Integer, _>(library_id)
        .bind::<diesel::sql_types::Text, _>(rel_path)
        .bind::<diesel::sql_types::Nullable<diesel::sql_types::Text>, _>(content_hash)
        .execute(conn)
        .unwrap();
        // Keep clippy happy that the import is used.
        let _ = image_exif::table;
    }

    // Insert a tagged_photo row with content_hash left NULL (the
    // pre-migration shape the reconcile pass has to repair).
    fn insert_tagged_photo(conn: &mut SqliteConnection, rel_path: &str, tag_id: i32) {
        diesel::sql_query(
            "INSERT INTO tagged_photo (rel_path, tag_id, created_time) VALUES (?, ?, 0)",
        )
        .bind::<diesel::sql_types::Text, _>(rel_path)
        .bind::<diesel::sql_types::Integer, _>(tag_id)
        .execute(conn)
        .unwrap();
    }

    // Insert a tags row so tagged_photo FKs resolve.
    fn insert_tag(conn: &mut SqliteConnection, id: i32, name: &str) {
        diesel::sql_query("INSERT INTO tags (id, name, created_time) VALUES (?, ?, 0)")
            .bind::<diesel::sql_types::Integer, _>(id)
            .bind::<diesel::sql_types::Text, _>(name)
            .execute(conn)
            .unwrap();
    }

    // Insert a photo_insights row (content_hash NULL) and return its id.
    fn insert_insight(
        conn: &mut SqliteConnection,
        library_id: i32,
        rel_path: &str,
        generated_at: i64,
        is_current: bool,
    ) -> i32 {
        ensure_library(conn, library_id);
        diesel::sql_query(
            "INSERT INTO photo_insights (library_id, rel_path, title, summary, generated_at, model_version, is_current, backend) \
             VALUES (?, ?, 't', 's', ?, 'v', ?, 'local')",
        )
        .bind::<diesel::sql_types::Integer, _>(library_id)
        .bind::<diesel::sql_types::Text, _>(rel_path)
        .bind::<diesel::sql_types::BigInt, _>(generated_at)
        .bind::<diesel::sql_types::Bool, _>(is_current)
        .execute(conn)
        .unwrap();
        // SQLite's rowid of the insert above — safe here because the
        // connection is exclusive to this test.
        diesel::sql_query("SELECT last_insert_rowid() AS id")
            .get_result::<TestId>(conn)
            .map(|r| r.id)
            .unwrap()
    }

    // Row shape for reading last_insert_rowid() via sql_query.
    #[derive(QueryableByName)]
    struct TestId {
        #[diesel(sql_type = diesel::sql_types::Integer)]
        id: i32,
    }

    // Row shape for SELECTs that fetch only content_hash.
    #[derive(QueryableByName, Debug)]
    struct HashOnly {
        #[diesel(sql_type = diesel::sql_types::Nullable<diesel::sql_types::Text>)]
        content_hash: Option<String>,
    }

    // Row shape for SELECTs that fetch (id, is_current).
    #[derive(QueryableByName, Debug)]
    struct CurrentRow {
        #[diesel(sql_type = diesel::sql_types::Integer)]
        id: i32,
        #[diesel(sql_type = diesel::sql_types::Bool)]
        is_current: bool,
    }

    #[test]
    fn backfill_fills_tagged_photo_hash_when_image_exif_has_one() {
        let mut conn = in_memory_db_connection();
        insert_tag(&mut conn, 1, "vacation");
        insert_tagged_photo(&mut conn, "trip/IMG.jpg", 1);
        // No image_exif row yet — backfill no-op.
        let stats = run(&mut conn);
        assert_eq!(stats.tagged_photo_hashes_filled, 0);

        // image_exif row appears with a hash; next reconcile fills it.
        insert_image_exif(&mut conn, 1, "trip/IMG.jpg", Some("hashabc"));
        let stats = run(&mut conn);
        assert_eq!(stats.tagged_photo_hashes_filled, 1);

        let row = diesel::sql_query(
            "SELECT content_hash FROM tagged_photo WHERE rel_path = 'trip/IMG.jpg'",
        )
        .get_result::<HashOnly>(&mut conn)
        .unwrap();
        assert_eq!(row.content_hash.as_deref(), Some("hashabc"));

        // Idempotent: a second run is a no-op.
        let stats = run(&mut conn);
        assert_eq!(stats.tagged_photo_hashes_filled, 0);
    }

    #[test]
    fn backfill_skips_tagged_photo_when_image_exif_has_no_hash() {
        let mut conn = in_memory_db_connection();
        insert_tag(&mut conn, 1, "vacation");
        insert_tagged_photo(&mut conn, "trip/IMG.jpg", 1);
        // image_exif exists but its hash is null.
        insert_image_exif(&mut conn, 1, "trip/IMG.jpg", None);

        let stats = run(&mut conn);
        assert_eq!(stats.tagged_photo_hashes_filled, 0);
    }

    #[test]
    fn backfill_fills_photo_insights_hash_scoped_by_library() {
        let mut conn = in_memory_db_connection();
        // Row in library 1 only — must not be filled by a hash from
        // library 2's same-rel_path entry.
        insert_image_exif(&mut conn, 1, "shared.jpg", Some("hash-lib1"));
        let id1 = insert_insight(&mut conn, 1, "shared.jpg", 100, true);

        let stats = run(&mut conn);
        assert_eq!(stats.photo_insights_hashes_filled, 1);

        let row = diesel::sql_query(
            "SELECT content_hash FROM photo_insights WHERE id = ?",
        )
        .bind::<diesel::sql_types::Integer, _>(id1)
        .get_result::<HashOnly>(&mut conn)
        .unwrap();
        assert_eq!(row.content_hash.as_deref(), Some("hash-lib1"));
    }

    #[test]
    fn collapse_keeps_earliest_is_current_per_hash() {
        let mut conn = in_memory_db_connection();
        // Two libraries, same content_hash via image_exif. Insights
        // were generated independently in each library, both currently
        // is_current = true. The earlier one wins.
        insert_image_exif(&mut conn, 1, "a.jpg", Some("h1"));
        insert_image_exif(&mut conn, 2, "a.jpg", Some("h1"));
        let earlier = insert_insight(&mut conn, 1, "a.jpg", 100, true);
        let later = insert_insight(&mut conn, 2, "a.jpg", 200, true);

        // First pass fills the content_hash; second collapses.
        let stats = run(&mut conn);
        assert_eq!(stats.photo_insights_hashes_filled, 2);
        assert_eq!(stats.photo_insights_demoted, 1);

        let rows = diesel::sql_query(
            "SELECT id, is_current FROM photo_insights ORDER BY id",
        )
        .get_results::<CurrentRow>(&mut conn)
        .unwrap();
        let earlier_row = rows.iter().find(|r| r.id == earlier).unwrap();
        let later_row = rows.iter().find(|r| r.id == later).unwrap();
        assert!(earlier_row.is_current, "earlier insight should remain current");
        assert!(!later_row.is_current, "later insight should be demoted");

        // Idempotent.
        let stats = run(&mut conn);
        assert_eq!(stats.photo_insights_demoted, 0);
    }

    #[test]
    fn collapse_does_not_demote_a_solo_current_row() {
        let mut conn = in_memory_db_connection();
        insert_image_exif(&mut conn, 1, "a.jpg", Some("h1"));
        let solo = insert_insight(&mut conn, 1, "a.jpg", 100, true);

        let stats = run(&mut conn);
        assert_eq!(stats.photo_insights_demoted, 0);

        let row = diesel::sql_query(
            "SELECT id, is_current FROM photo_insights WHERE id = ?",
        )
        .bind::<diesel::sql_types::Integer, _>(solo)
        .get_result::<CurrentRow>(&mut conn)
        .unwrap();
        assert!(row.is_current);
    }
}
|
||||||
@@ -178,6 +178,7 @@ diesel::table! {
|
|||||||
approved -> Nullable<Bool>,
|
approved -> Nullable<Bool>,
|
||||||
backend -> Text,
|
backend -> Text,
|
||||||
fewshot_source_ids -> Nullable<Text>,
|
fewshot_source_ids -> Nullable<Text>,
|
||||||
|
content_hash -> Nullable<Text>,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -199,6 +200,7 @@ diesel::table! {
|
|||||||
rel_path -> Text,
|
rel_path -> Text,
|
||||||
tag_id -> Integer,
|
tag_id -> Integer,
|
||||||
created_time -> BigInt,
|
created_time -> BigInt,
|
||||||
|
content_hash -> Nullable<Text>,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
11
src/main.rs
11
src/main.rs
@@ -2063,6 +2063,17 @@ fn watch_files(
|
|||||||
update_media_counts(Path::new(&lib.root_path), &excluded_dirs);
|
update_media_counts(Path::new(&lib.root_path), &excluded_dirs);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Reconciliation: cross-library, so it runs once per tick
|
||||||
|
// outside the per-library loop. Idempotent — fast no-op when
|
||||||
|
// there's nothing to do. Operates on the database alone, no
|
||||||
|
// filesystem dependency, so it doesn't need a health gate.
|
||||||
|
// See database::reconcile and CLAUDE.md "Multi-library data
|
||||||
|
// model" for the rules.
|
||||||
|
{
|
||||||
|
let mut conn = image_api::database::connect();
|
||||||
|
let _ = image_api::database::reconcile::run(&mut conn);
|
||||||
|
}
|
||||||
|
|
||||||
if is_full_scan {
|
if is_full_scan {
|
||||||
last_full_scan = now;
|
last_full_scan = now;
|
||||||
}
|
}
|
||||||
|
|||||||
28
src/tags.rs
28
src/tags.rs
@@ -507,6 +507,11 @@ pub struct InsertTaggedPhoto {
|
|||||||
#[diesel(column_name = rel_path)]
|
#[diesel(column_name = rel_path)]
|
||||||
pub photo_name: String,
|
pub photo_name: String,
|
||||||
pub created_time: i64,
|
pub created_time: i64,
|
||||||
|
/// Hash-keyed identity. The DAO populates this from
|
||||||
|
/// `image_exif.content_hash` at insert time when known; the
|
||||||
|
/// reconciliation pass backfills rows inserted before the hash
|
||||||
|
/// landed. See CLAUDE.md "Multi-library data model".
|
||||||
|
pub content_hash: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Queryable, Clone, Debug)]
|
#[derive(Queryable, Clone, Debug)]
|
||||||
@@ -520,6 +525,8 @@ pub struct TaggedPhoto {
|
|||||||
pub tag_id: i32,
|
pub tag_id: i32,
|
||||||
#[allow(dead_code)] // Part of API contract
|
#[allow(dead_code)] // Part of API contract
|
||||||
pub created_time: i64,
|
pub created_time: i64,
|
||||||
|
#[allow(dead_code)]
|
||||||
|
pub content_hash: Option<String>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
#[derive(Debug, Deserialize)]
|
||||||
@@ -958,11 +965,31 @@ impl TagDao for SqliteTagDao {
|
|||||||
KeyValue::new("tag_id", tag_id.to_string()),
|
KeyValue::new("tag_id", tag_id.to_string()),
|
||||||
]);
|
]);
|
||||||
|
|
||||||
|
// Eagerly populate content_hash so this tag follows the bytes,
|
||||||
|
// not the path (see CLAUDE.md "Multi-library data model").
|
||||||
|
// None is fine — the reconciliation pass will backfill once
|
||||||
|
// image_exif has a hash for this file. We deliberately don't
|
||||||
|
// require library_id here: the tag handler is library-
|
||||||
|
// agnostic by design, and any matching image_exif row's hash
|
||||||
|
// is acceptable. If the path resolves to different bytes in
|
||||||
|
// different libraries, reconciliation per-library refines.
|
||||||
|
let content_hash: Option<String> = {
|
||||||
|
use crate::database::schema::image_exif as ie;
|
||||||
|
ie::table
|
||||||
|
.filter(ie::rel_path.eq(path))
|
||||||
|
.filter(ie::content_hash.is_not_null())
|
||||||
|
.select(ie::content_hash)
|
||||||
|
.first::<Option<String>>(conn.deref_mut())
|
||||||
|
.ok()
|
||||||
|
.flatten()
|
||||||
|
};
|
||||||
|
|
||||||
diesel::insert_into(tagged_photo::table)
|
diesel::insert_into(tagged_photo::table)
|
||||||
.values(InsertTaggedPhoto {
|
.values(InsertTaggedPhoto {
|
||||||
tag_id,
|
tag_id,
|
||||||
photo_name: path.to_string(),
|
photo_name: path.to_string(),
|
||||||
created_time: Utc::now().timestamp(),
|
created_time: Utc::now().timestamp(),
|
||||||
|
content_hash,
|
||||||
})
|
})
|
||||||
.execute(conn.deref_mut())
|
.execute(conn.deref_mut())
|
||||||
.with_context(|| format!("Unable to tag file {:?} in sqlite", path))
|
.with_context(|| format!("Unable to tag file {:?} in sqlite", path))
|
||||||
@@ -1367,6 +1394,7 @@ mod tests {
|
|||||||
tag_id: tag.id,
|
tag_id: tag.id,
|
||||||
created_time: Utc::now().timestamp(),
|
created_time: Utc::now().timestamp(),
|
||||||
photo_name: path.to_string(),
|
photo_name: path.to_string(),
|
||||||
|
content_hash: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
if self.tagged_photos.borrow().contains_key(path) {
|
if self.tagged_photos.borrow().contains_key(path) {
|
||||||
|
|||||||
Reference in New Issue
Block a user