clip-search: backlog drain + /photos/search endpoint
Wires the persistence layer for CLIP semantic search. The watcher's per-tick drain encodes any image_exif row with a known content_hash but no clip_embedding via Apollo (cap CLIP_BACKLOG_MAX_PER_TICK, default 32). On a query, /photos/search encodes the text via Apollo and reranks every stored embedding in-memory.

ExifDao additions:
- list_clip_unencoded_candidates — partial-index scan for drain
- backfill_clip_embedding — touches only the two new columns
- list_clip_index — dedup'd (hash, embedding) pull for search

clip_watch::run_clip_encoding_pass is the parallel fan-out — tokio runtime per pass with CLIP_ENCODE_CONCURRENCY (default 4). No marker rows for permanent failures yet; per-tick cap bounds the retry cost.

/photos/search params: q, limit, threshold (default 0.20), library, model_version. Response is intentionally minimal (path + score) so the frontend joins against existing photo-metadata routes lazily.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -470,6 +470,61 @@ pub trait ExifDao: Sync + Send {
|
||||
source: &str,
|
||||
) -> Result<(), DbError>;
|
||||
|
||||
/// Find image_exif rows needing a CLIP embedding for semantic search:
|
||||
/// `clip_embedding IS NULL AND content_hash IS NOT NULL`, ordered by id
|
||||
/// ASC, limited. Hash-less rows wait for `backfill_unhashed_backlog` to
|
||||
/// hash them first — embedding a row we can't key on bytes is wasted
|
||||
/// work that the next library/move detection would invalidate. Backed
|
||||
/// by the partial index `idx_image_exif_clip_backfill`.
|
||||
///
|
||||
/// Returns `(rel_path, content_hash)` for the given library only. Video
|
||||
/// rows are returned too (the query applies no file-type filter);
|
||||
/// the caller filters them out via `file_types::is_image_file` before
|
||||
/// sending to Apollo, mirroring `face_watch::filter_excluded`.
|
||||
///
|
||||
/// **Model upgrades** (re-encoding everything on a new
|
||||
/// `APOLLO_CLIP_MODEL`) are handled out-of-band — run
|
||||
/// `UPDATE image_exif SET clip_embedding = NULL
|
||||
/// WHERE clip_model_version != '<new model>';`
|
||||
/// and the drain picks up the freshly-nulled rows on the next tick.
|
||||
/// Mixing in-flight model versions in a single query is intentionally
|
||||
/// not the drain's problem.
///
/// At most `limit` rows are returned per call.
///
/// # Errors
///
/// The SQLite implementation reports every query failure as
/// `DbErrorKind::QueryError`. NOTE(review): confirm other implementors
/// follow the same mapping.
|
||||
fn list_clip_unencoded_candidates(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
library_id: i32,
|
||||
limit: i64,
|
||||
) -> Result<Vec<(String, String)>, DbError>;
|
||||
|
||||
/// Persist a CLIP embedding for an existing row. Touches
|
||||
/// `clip_embedding` and `clip_model_version` only — leaves every
|
||||
/// other column alone so the drain can't accidentally clobber EXIF /
|
||||
/// hash / date-resolver state that other paths have written.
///
/// The target row is addressed by `(library_id, rel_path)`. In the
/// SQLite implementation a write that matches zero rows is not an
/// error: it is logged at debug level and `Ok(())` is returned.
///
/// # Errors
///
/// The SQLite implementation reports a failed update as
/// `DbErrorKind::UpdateError`.
|
||||
fn backfill_clip_embedding(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
library_id: i32,
|
||||
rel_path: &str,
|
||||
embedding: &[u8],
|
||||
model_version: &str,
|
||||
) -> Result<(), DbError>;
|
||||
|
||||
/// Load every `(content_hash, clip_embedding)` pair from the live
|
||||
/// image_exif rows for the given libraries, optionally filtered to a
|
||||
/// single `model_version` (cosine sim across mixed geometries is
|
||||
/// meaningless). Used by `/photos/search` to rerank against the query
|
||||
/// embedding in-memory.
|
||||
///
|
||||
/// Returns one pair per content_hash. If a hash appears under more
|
||||
/// than one library, the first row wins (rows are loaded ORDER BY id
|
||||
/// ASC). Hash-less and embedding-less rows are filtered server-side.
///
/// In the SQLite implementation an empty `library_ids` slice applies no
/// library filter at all, so rows from every library are returned.
///
/// # Errors
///
/// The SQLite implementation reports every query failure as
/// `DbErrorKind::QueryError`.
|
||||
fn list_clip_index(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
library_ids: &[i32],
|
||||
model_version: Option<&str>,
|
||||
) -> Result<Vec<(String, Vec<u8>)>, DbError>;
|
||||
|
||||
/// Operator-driven date_taken override (POST /image/exif/date). Snapshots
|
||||
/// the prior `(date_taken, date_taken_source)` into the `original_*`
|
||||
/// pair on first override, then writes the new value with
|
||||
@@ -1387,6 +1442,146 @@ impl ExifDao for SqliteExifDao {
|
||||
})
|
||||
}
|
||||
|
||||
fn list_clip_unencoded_candidates(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
library_id_val: i32,
|
||||
limit: i64,
|
||||
) -> Result<Vec<(String, String)>, DbError> {
|
||||
trace_db_call(
|
||||
context,
|
||||
"query",
|
||||
"list_clip_unencoded_candidates",
|
||||
|_span| {
|
||||
use schema::image_exif::dsl::*;
|
||||
|
||||
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
|
||||
|
||||
// Partial index `idx_image_exif_clip_backfill` covers the
|
||||
// (clip_embedding IS NULL AND content_hash IS NOT NULL)
|
||||
// filter; the planner hits it directly. ORDER BY id ASC
|
||||
// keeps drain progress monotone across ticks.
|
||||
image_exif
|
||||
.filter(library_id.eq(library_id_val))
|
||||
.filter(clip_embedding.is_null())
|
||||
.filter(content_hash.is_not_null())
|
||||
.select((rel_path, content_hash.assume_not_null()))
|
||||
.order(id.asc())
|
||||
.limit(limit)
|
||||
.load::<(String, String)>(connection.deref_mut())
|
||||
.map_err(|_| anyhow::anyhow!("Query error"))
|
||||
},
|
||||
)
|
||||
.map_err(|_| DbError::new(DbErrorKind::QueryError))
|
||||
}
|
||||
|
||||
fn backfill_clip_embedding(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
library_id_val: i32,
|
||||
rel_path_val: &str,
|
||||
embedding: &[u8],
|
||||
model_version: &str,
|
||||
) -> Result<(), DbError> {
|
||||
trace_db_call(context, "update", "backfill_clip_embedding", |_span| {
|
||||
use schema::image_exif::dsl::*;
|
||||
|
||||
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
|
||||
|
||||
let result = diesel::update(
|
||||
image_exif
|
||||
.filter(library_id.eq(library_id_val))
|
||||
.filter(rel_path.eq(rel_path_val)),
|
||||
)
|
||||
.set((
|
||||
clip_embedding.eq(embedding),
|
||||
clip_model_version.eq(model_version),
|
||||
))
|
||||
.execute(connection.deref_mut());
|
||||
|
||||
match result {
|
||||
Ok(rows) => {
|
||||
if rows == 0 {
|
||||
// Same race as backfill_date_taken — row vanished
|
||||
// between the candidate query and this write. Not
|
||||
// a hard error; the drain re-scans next tick.
|
||||
log::debug!(
|
||||
"backfill_clip_embedding: 0 rows matched lib={} {} \
|
||||
(row likely retired by missing-file scan)",
|
||||
library_id_val,
|
||||
rel_path_val
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
Err(e) => Err(anyhow::anyhow!(
|
||||
"diesel update failed (lib={}, rel_path={}, model={}): {}",
|
||||
library_id_val,
|
||||
rel_path_val,
|
||||
model_version,
|
||||
e
|
||||
)),
|
||||
}
|
||||
})
|
||||
.map_err(|e| {
|
||||
log::warn!("backfill_clip_embedding: {}", e);
|
||||
DbError::new(DbErrorKind::UpdateError)
|
||||
})
|
||||
}
|
||||
|
||||
fn list_clip_index(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
library_ids_val: &[i32],
|
||||
model_version_filter: Option<&str>,
|
||||
) -> Result<Vec<(String, Vec<u8>)>, DbError> {
|
||||
trace_db_call(context, "query", "list_clip_index", |_span| {
|
||||
use schema::image_exif::dsl::*;
|
||||
|
||||
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
|
||||
|
||||
// Build the base filter. content_hash + clip_embedding both
|
||||
// need to be present for the row to be searchable.
|
||||
let mut query = image_exif
|
||||
.filter(content_hash.is_not_null())
|
||||
.filter(clip_embedding.is_not_null())
|
||||
.into_boxed();
|
||||
if !library_ids_val.is_empty() {
|
||||
query = query.filter(library_id.eq_any(library_ids_val));
|
||||
}
|
||||
if let Some(mv) = model_version_filter {
|
||||
query = query.filter(clip_model_version.eq(mv));
|
||||
}
|
||||
|
||||
// Order by id ASC so cross-library duplicates pick the
|
||||
// earliest-ingested row (stable across calls; the in-memory
|
||||
// matrix gets a deterministic row order). Group-by on
|
||||
// content_hash via post-filter — Diesel doesn't expose a
|
||||
// clean DISTINCT ON in this query shape.
|
||||
let rows: Vec<(String, Vec<u8>)> = query
|
||||
.select((
|
||||
content_hash.assume_not_null(),
|
||||
clip_embedding.assume_not_null(),
|
||||
))
|
||||
.order(id.asc())
|
||||
.load::<(String, Vec<u8>)>(connection.deref_mut())
|
||||
.map_err(|_| anyhow::anyhow!("Query error"))?;
|
||||
|
||||
// Dedupe by hash, keeping the first occurrence. Cheap; sized
|
||||
// to ~14k entries on this library.
|
||||
let mut seen: std::collections::HashSet<String> =
|
||||
std::collections::HashSet::with_capacity(rows.len());
|
||||
let mut out = Vec::with_capacity(rows.len());
|
||||
for (h, e) in rows {
|
||||
if seen.insert(h.clone()) {
|
||||
out.push((h, e));
|
||||
}
|
||||
}
|
||||
Ok(out)
|
||||
})
|
||||
.map_err(|_| DbError::new(DbErrorKind::QueryError))
|
||||
}
|
||||
|
||||
fn set_manual_date_taken(
|
||||
&mut self,
|
||||
context: &opentelemetry::Context,
|
||||
|
||||
Reference in New Issue
Block a user