clip-search: backlog drain + /photos/search endpoint

Wires the persistence layer for CLIP semantic search. The watcher's
per-tick drain encodes any image_exif row with a known content_hash
but no clip_embedding via Apollo (cap CLIP_BACKLOG_MAX_PER_TICK,
default 32). On a query, /photos/search encodes the text via Apollo
and reranks every stored embedding in-memory.

ExifDao additions:
- list_clip_unencoded_candidates — partial-index scan for drain
- backfill_clip_embedding — touches only the two new columns
- list_clip_index — dedup'd (hash, embedding) pull for search

clip_watch::run_clip_encoding_pass is the parallel fan-out — tokio
runtime per pass with CLIP_ENCODE_CONCURRENCY (default 4). No marker
rows for permanent failures yet; per-tick cap bounds the retry cost.

/photos/search params: q, limit, threshold (default 0.20), library,
model_version. Response is intentionally minimal (path + score) so
the frontend joins against existing photo-metadata routes lazily.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Cameron Cordes
2026-05-14 14:00:41 -04:00
parent 8d9e76cf15
commit 32195ed89e
9 changed files with 875 additions and 0 deletions

View File

@@ -470,6 +470,61 @@ pub trait ExifDao: Sync + Send {
source: &str,
) -> Result<(), DbError>;
/// Find image_exif rows needing a CLIP embedding for semantic search:
/// `clip_embedding IS NULL AND content_hash IS NOT NULL`, ordered by id
/// ASC and capped at `limit` rows. Hash-less rows wait for
/// `backfill_unhashed_backlog` to hash them first — embedding a row we
/// can't key on bytes is wasted work that the next library/move
/// detection would invalidate. Backed by the partial index
/// `idx_image_exif_clip_backfill`.
///
/// Returns `(rel_path, content_hash)` for the given library only. Video
/// rows are returned too (the query applies no file-type filter);
/// the caller filters them out via `file_types::is_image_file` before
/// sending to Apollo, mirroring `face_watch::filter_excluded`.
///
/// **Model upgrades** (re-encoding everything on a new
/// `APOLLO_CLIP_MODEL`) are handled out-of-band — run
///
/// ```sql
/// UPDATE image_exif SET clip_embedding = NULL
/// WHERE clip_model_version != '<new model>';
/// ```
///
/// and the drain picks up the freshly-nulled rows on the next tick.
/// Mixing in-flight model versions in a single query is intentionally
/// not the drain's problem.
///
/// # Errors
///
/// Returns a `DbError` when the underlying query fails.
fn list_clip_unencoded_candidates(
&mut self,
context: &opentelemetry::Context,
library_id: i32,
limit: i64,
) -> Result<Vec<(String, String)>, DbError>;
/// Persist a CLIP embedding for an existing row, addressed by
/// `(library_id, rel_path)`. Touches `clip_embedding` and
/// `clip_model_version` only — leaves every other column alone so the
/// drain can't accidentally clobber EXIF / hash / date-resolver state
/// that other paths have written.
///
/// `embedding` is stored as an opaque byte blob; `model_version` is
/// recorded next to it so searches can be restricted to one model.
///
/// A write that matches zero rows (the row vanished between the
/// candidate query and this call) is treated as success, not an error;
/// the drain simply re-scans on the next tick.
///
/// # Errors
///
/// Returns a `DbError` when the underlying update fails.
fn backfill_clip_embedding(
&mut self,
context: &opentelemetry::Context,
library_id: i32,
rel_path: &str,
embedding: &[u8],
model_version: &str,
) -> Result<(), DbError>;
/// Load every `(content_hash, clip_embedding)` pair from the live
/// image_exif rows for the given libraries, optionally filtered to a
/// single `model_version` (cosine sim across mixed geometries is
/// meaningless). Used by `/photos/search` to rerank against the query
/// embedding in-memory.
///
/// An empty `library_ids` slice applies no library filter, so rows from
/// every library are returned. `model_version = None` likewise applies
/// no model filter.
///
/// Returns one pair per content_hash. If a hash appears on more than
/// one row (e.g. under more than one library), the row with the lowest
/// `id` wins: results are explicitly ordered by `id ASC` before
/// de-duplication. Hash-less and embedding-less rows are filtered
/// server-side.
///
/// # Errors
///
/// Returns a `DbError` when the underlying query fails.
fn list_clip_index(
&mut self,
context: &opentelemetry::Context,
library_ids: &[i32],
model_version: Option<&str>,
) -> Result<Vec<(String, Vec<u8>)>, DbError>;
/// Operator-driven date_taken override (POST /image/exif/date). Snapshots
/// the prior `(date_taken, date_taken_source)` into the `original_*`
/// pair on first override, then writes the new value with
@@ -1387,6 +1442,146 @@ impl ExifDao for SqliteExifDao {
})
}
fn list_clip_unencoded_candidates(
&mut self,
context: &opentelemetry::Context,
library_id_val: i32,
limit: i64,
) -> Result<Vec<(String, String)>, DbError> {
trace_db_call(
context,
"query",
"list_clip_unencoded_candidates",
|_span| {
use schema::image_exif::dsl::*;
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
// Partial index `idx_image_exif_clip_backfill` covers the
// (clip_embedding IS NULL AND content_hash IS NOT NULL)
// filter; the planner hits it directly. ORDER BY id ASC
// keeps drain progress monotone across ticks.
image_exif
.filter(library_id.eq(library_id_val))
.filter(clip_embedding.is_null())
.filter(content_hash.is_not_null())
.select((rel_path, content_hash.assume_not_null()))
.order(id.asc())
.limit(limit)
.load::<(String, String)>(connection.deref_mut())
.map_err(|_| anyhow::anyhow!("Query error"))
},
)
.map_err(|_| DbError::new(DbErrorKind::QueryError))
}
/// Writes `embedding` and `model_version` onto the row identified by
/// `(library_id_val, rel_path_val)`, leaving all other columns untouched.
///
/// An update that matches no row is logged at `debug` level and reported
/// as success. A Diesel failure is logged at `warn` level and surfaced as
/// `DbErrorKind::UpdateError`.
fn backfill_clip_embedding(
    &mut self,
    context: &opentelemetry::Context,
    library_id_val: i32,
    rel_path_val: &str,
    embedding: &[u8],
    model_version: &str,
) -> Result<(), DbError> {
    trace_db_call(context, "update", "backfill_clip_embedding", |_span| {
        use schema::image_exif::dsl::*;
        let mut connection = self.connection.lock().expect("Unable to get ExifDao");

        // Address exactly one row: the (library, relative path) pair.
        let row_filter = image_exif
            .filter(library_id.eq(library_id_val))
            .filter(rel_path.eq(rel_path_val));

        diesel::update(row_filter)
            .set((
                clip_embedding.eq(embedding),
                clip_model_version.eq(model_version),
            ))
            .execute(connection.deref_mut())
            .map(|rows_changed| {
                if rows_changed == 0 {
                    // Same race as backfill_date_taken — the row went away
                    // between the candidate query and this write. Not a
                    // hard error; the drain re-scans next tick.
                    log::debug!(
                        "backfill_clip_embedding: 0 rows matched lib={} {} \
                         (row likely retired by missing-file scan)",
                        library_id_val,
                        rel_path_val
                    );
                }
            })
            .map_err(|e| {
                anyhow::anyhow!(
                    "diesel update failed (lib={}, rel_path={}, model={}): {}",
                    library_id_val,
                    rel_path_val,
                    model_version,
                    e
                )
            })
    })
    .map_err(|e| {
        log::warn!("backfill_clip_embedding: {}", e);
        DbError::new(DbErrorKind::UpdateError)
    })
}
fn list_clip_index(
&mut self,
context: &opentelemetry::Context,
library_ids_val: &[i32],
model_version_filter: Option<&str>,
) -> Result<Vec<(String, Vec<u8>)>, DbError> {
trace_db_call(context, "query", "list_clip_index", |_span| {
use schema::image_exif::dsl::*;
let mut connection = self.connection.lock().expect("Unable to get ExifDao");
// Build the base filter. content_hash + clip_embedding both
// need to be present for the row to be searchable.
let mut query = image_exif
.filter(content_hash.is_not_null())
.filter(clip_embedding.is_not_null())
.into_boxed();
if !library_ids_val.is_empty() {
query = query.filter(library_id.eq_any(library_ids_val));
}
if let Some(mv) = model_version_filter {
query = query.filter(clip_model_version.eq(mv));
}
// Order by id ASC so cross-library duplicates pick the
// earliest-ingested row (stable across calls; the in-memory
// matrix gets a deterministic row order). Group-by on
// content_hash via post-filter — Diesel doesn't expose a
// clean DISTINCT ON in this query shape.
let rows: Vec<(String, Vec<u8>)> = query
.select((
content_hash.assume_not_null(),
clip_embedding.assume_not_null(),
))
.order(id.asc())
.load::<(String, Vec<u8>)>(connection.deref_mut())
.map_err(|_| anyhow::anyhow!("Query error"))?;
// Dedupe by hash, keeping the first occurrence. Cheap; sized
// to ~14k entries on this library.
let mut seen: std::collections::HashSet<String> =
std::collections::HashSet::with_capacity(rows.len());
let mut out = Vec::with_capacity(rows.len());
for (h, e) in rows {
if seen.insert(h.clone()) {
out.push((h, e));
}
}
Ok(out)
})
.map_err(|_| DbError::new(DbErrorKind::QueryError))
}
fn set_manual_date_taken(
&mut self,
context: &opentelemetry::Context,