clip-search: backlog drain + /photos/search endpoint

Wires the persistence layer for CLIP semantic search. The watcher's
per-tick drain encodes any image_exif row with a known content_hash
but no clip_embedding via Apollo (cap CLIP_BACKLOG_MAX_PER_TICK,
default 32). On a query, /photos/search encodes the text via Apollo
and reranks every stored embedding in-memory.

ExifDao additions:
- list_clip_unencoded_candidates — partial-index scan for drain
- backfill_clip_embedding — touches only the two new columns
- list_clip_index — dedup'd (hash, embedding) pull for search

clip_watch::run_clip_encoding_pass is the parallel fan-out — tokio
runtime per pass with CLIP_ENCODE_CONCURRENCY (default 4). No marker
rows for permanent failures yet; per-tick cap bounds the retry cost.

/photos/search params: q, limit, threshold (default 0.20), library,
model_version. Response is intentionally minimal (path + score) so
the frontend joins against existing photo-metadata routes lazily.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Cameron Cordes
2026-05-14 14:00:41 -04:00
parent 8d9e76cf15
commit 32195ed89e
9 changed files with 875 additions and 0 deletions

288
src/clip_search.rs Normal file
View File

@@ -0,0 +1,288 @@
//! `/photos/search?q=<text>` — CLIP semantic photo search.
//!
//! The route lives outside `files.rs` to keep that 1500+ line module
//! focused on EXIF / tag listing. The flow is:
//!
//! 1. Parse query params (`q`, `limit`, `threshold`, optional `library`).
//! 2. Call Apollo's `/api/internal/clip/encode_text` to get the query
//! vector (L2-normalized 768-d f32 for ViT-L/14).
//! 3. Load every `(content_hash, clip_embedding)` for the scope from
//! `image_exif` via `ExifDao::list_clip_index`. ~2843 MB for a 14k
//! library at ViT-L/14; loaded fresh per request — fast enough for
//! v1, optimize via an AppState cache later if needed.
//! 4. Dot product (= cosine since both sides are L2-normalized), filter
//! above `threshold`, top-K by score.
//! 5. Resolve each surviving hash back to a `(library_id, rel_path)` so
//! the frontend can render the photo / hand off to the carousel.
//!
//! Response shape is intentionally minimal — paths + score — so the
//! frontend can reuse existing PhotoGrid rendering by joining against
//! `/api/photos/match` (or calling `/image/metadata` lazily). Don't
//! bake camera/EXIF metadata into this route; it would force a fan-out
//! per result and balloon the response.
use crate::AppState;
use crate::ai::clip_client::ClipError;
use crate::database::ExifDao;
use actix_web::{HttpResponse, Result as ActixResult, web};
use base64::Engine;
use serde::{Deserialize, Serialize};
use std::sync::Mutex;
#[derive(Debug, Deserialize)]
pub struct SearchQuery {
/// Natural-language query. Required; empty triggers 400.
pub q: String,
/// Max results to return. Capped to 200 server-side; the UI almost
/// always wants ≤50. Defaults to 20.
#[serde(default = "default_limit")]
pub limit: usize,
/// Cosine-similarity floor below which results are dropped.
/// 0.20 is the rough "this is plausibly relevant" line for OpenAI
/// CLIP; tunable per call when sweeping. Defaults to 0.20.
#[serde(default = "default_threshold")]
pub threshold: f32,
/// Optional single-library scope. When omitted, every enabled
/// library is searched. Multi-select isn't supported yet — the
/// frontend wires through one or all.
pub library: Option<i32>,
/// Optional model-version filter. Defaults to the live engine's
/// version (queried lazily). Forces a strict join so mid-flight
/// model swaps can't mix geometries in a single response.
#[serde(default)]
pub model_version: Option<String>,
}
fn default_limit() -> usize {
20
}
fn default_threshold() -> f32 {
0.20
}
#[derive(Debug, Serialize)]
pub struct SearchHit {
pub library_id: i32,
pub rel_path: String,
pub content_hash: String,
/// Cosine similarity in [-1, 1]. In practice OpenAI CLIP returns
/// 0.100.40 for the typical photo library.
pub score: f32,
}
#[derive(Debug, Serialize)]
pub struct SearchResponse {
pub query: String,
pub model_version: String,
pub threshold: f32,
pub considered: usize,
pub results: Vec<SearchHit>,
}
#[derive(Debug, Serialize)]
struct SearchError {
error: String,
}
/// Decode a stored `clip_embedding` BLOB back into a `Vec<f32>`. Returns
/// `None` on malformed bytes — those rows get skipped rather than
/// failing the whole query.
fn decode_embedding(bytes: &[u8]) -> Option<Vec<f32>> {
if bytes.is_empty() || bytes.len() % 4 != 0 {
return None;
}
let mut out = Vec::with_capacity(bytes.len() / 4);
for chunk in bytes.chunks_exact(4) {
out.push(f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]));
}
Some(out)
}
#[inline]
fn dot(a: &[f32], b: &[f32]) -> f32 {
a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
}
pub async fn search_photos(
state: web::Data<AppState>,
exif_dao: web::Data<Mutex<Box<dyn ExifDao>>>,
query: web::Query<SearchQuery>,
) -> ActixResult<HttpResponse> {
let q_text = query.q.trim().to_string();
if q_text.is_empty() {
return Ok(HttpResponse::BadRequest().json(SearchError {
error: "query parameter `q` is required".into(),
}));
}
if !state.clip_client.is_enabled() {
return Ok(HttpResponse::ServiceUnavailable().json(SearchError {
error: "CLIP search is disabled (no Apollo CLIP endpoint configured)".into(),
}));
}
let limit = query.limit.min(200).max(1);
let threshold = query.threshold.clamp(-1.0, 1.0);
// 1. Encode the query text. Fast — Apollo's text encoder is ~50ms
// on CPU. Bail with a clear error message if Apollo's down so the
// user sees "service unavailable" rather than empty results.
let query_resp = match state.clip_client.encode_text(&q_text).await {
Ok(r) => r,
Err(ClipError::Permanent(e)) => {
return Ok(HttpResponse::BadRequest().json(SearchError {
error: format!("query rejected: {e}"),
}));
}
Err(ClipError::Transient(e)) => {
return Ok(HttpResponse::BadGateway().json(SearchError {
error: format!("CLIP service unavailable: {e}"),
}));
}
Err(ClipError::Disabled) => {
return Ok(HttpResponse::ServiceUnavailable().json(SearchError {
error: "CLIP service disabled".into(),
}));
}
};
// decode_embedding works on raw bytes; the wire format is b64.
let query_bytes = base64::engine::general_purpose::STANDARD
.decode(query_resp.embedding.as_bytes())
.unwrap_or_default();
let query_vec = match decode_embedding(&query_bytes) {
Some(v) => v,
None => {
return Ok(HttpResponse::BadGateway().json(SearchError {
error: "CLIP service returned a malformed query embedding".into(),
}));
}
};
// 2. Decide which library scope to search.
let library_ids: Vec<i32> = match query.library {
Some(id) => vec![id],
None => Vec::new(), // empty = all libraries
};
// 3. Pull the (hash, embedding) matrix. Lock contention here is
// bounded — one big SELECT under a mutex Arc<Mutex<dyn ExifDao>>
// and then we release before scoring. If this becomes a hotspot
// we'll cache the decoded matrix in AppState with TTL.
let ctx = opentelemetry::Context::current();
let rows: Vec<(String, Vec<u8>)> = {
let mut dao = exif_dao.lock().expect("exif dao");
match dao.list_clip_index(
&ctx,
&library_ids,
query.model_version.as_deref().or(Some(&query_resp.model_version)),
) {
Ok(r) => r,
Err(e) => {
log::warn!("clip_search: list_clip_index failed: {:?}", e);
return Ok(HttpResponse::InternalServerError().json(SearchError {
error: "failed to load search index".into(),
}));
}
}
};
let considered = rows.len();
if considered == 0 {
return Ok(HttpResponse::Ok().json(SearchResponse {
query: q_text,
model_version: query_resp.model_version,
threshold,
considered,
results: Vec::new(),
}));
}
// 4. Score. Cap the loop's transient allocation; we keep all scores
// and sort at the end. With ~14k entries the sort is microseconds.
let mut scored: Vec<(f32, String)> = Vec::with_capacity(considered);
for (hash, blob) in rows {
let Some(emb) = decode_embedding(&blob) else {
continue;
};
if emb.len() != query_vec.len() {
continue;
}
let sim = dot(&emb, &query_vec);
if sim < threshold {
continue;
}
scored.push((sim, hash));
}
scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
scored.truncate(limit);
if scored.is_empty() {
return Ok(HttpResponse::Ok().json(SearchResponse {
query: q_text,
model_version: query_resp.model_version,
threshold,
considered,
results: Vec::new(),
}));
}
// 5. Resolve each surviving hash back to a `(library_id, rel_path)`.
// `get_rel_paths_by_hash` returns every rel_path; we pick the first
// one for the result. Apollo / the UI can fetch alternatives via
// /image/metadata when needed.
let hashes: Vec<String> = scored.iter().map(|(_, h)| h.clone()).collect();
let path_map = {
let mut dao = exif_dao.lock().expect("exif dao");
match dao.get_rel_paths_for_hashes(&ctx, &hashes) {
Ok(m) => m,
Err(e) => {
log::warn!("clip_search: get_rel_paths_for_hashes failed: {:?}", e);
return Ok(HttpResponse::InternalServerError().json(SearchError {
error: "failed to resolve photo paths".into(),
}));
}
}
};
// We need (library_id, rel_path) — get_rel_paths_for_hashes only
// returns rel_paths. Cross-reference via find_by_content_hash to
// pick the library too. Single call per surviving hash; cheap at
// top-20.
let mut results = Vec::with_capacity(scored.len());
{
let mut dao = exif_dao.lock().expect("exif dao");
for (score, hash) in scored {
let row = match dao.find_by_content_hash(&ctx, &hash) {
Ok(Some(r)) => r,
Ok(None) => continue,
Err(e) => {
log::warn!(
"clip_search: find_by_content_hash failed for {}: {:?}",
hash, e
);
continue;
}
};
// Prefer get_rel_paths_for_hashes's first entry if it
// exists (it shares semantics with `image_exif`'s natural
// order), falling back to the ImageExif row.
let rel_path = path_map
.get(&hash)
.and_then(|paths| paths.first().cloned())
.unwrap_or(row.file_path);
results.push(SearchHit {
library_id: row.library_id,
rel_path,
content_hash: hash,
score,
});
}
}
Ok(HttpResponse::Ok().json(SearchResponse {
query: q_text,
model_version: query_resp.model_version,
threshold,
considered,
results,
}))
}