clip-search: backlog drain + /photos/search endpoint
Wires the persistence layer for CLIP semantic search. The watcher's per-tick drain encodes any image_exif row with a known content_hash but no clip_embedding via Apollo (cap CLIP_BACKLOG_MAX_PER_TICK, default 32). On a query, /photos/search encodes the text via Apollo and reranks every stored embedding in-memory. ExifDao additions: - list_clip_unencoded_candidates — partial-index scan for drain - backfill_clip_embedding — touches only the two new columns - list_clip_index — dedup'd (hash, embedding) pull for search clip_watch::run_clip_encoding_pass is the parallel fan-out — tokio runtime per pass with CLIP_ENCODE_CONCURRENCY (default 4). No marker rows for permanent failures yet; per-tick cap bounds the retry cost. /photos/search params: q, limit, threshold (default 0.20), library, model_version. Response is intentionally minimal (path + score) so the frontend joins against existing photo-metadata routes lazily. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
288
src/clip_search.rs
Normal file
288
src/clip_search.rs
Normal file
@@ -0,0 +1,288 @@
|
||||
//! `/photos/search?q=<text>` — CLIP semantic photo search.
|
||||
//!
|
||||
//! The route lives outside `files.rs` to keep that 1500+ line module
|
||||
//! focused on EXIF / tag listing. The flow is:
|
||||
//!
|
||||
//! 1. Parse query params (`q`, `limit`, `threshold`, optional `library`).
|
||||
//! 2. Call Apollo's `/api/internal/clip/encode_text` to get the query
|
||||
//! vector (L2-normalized 768-d f32 for ViT-L/14).
|
||||
//! 3. Load every `(content_hash, clip_embedding)` for the scope from
|
||||
//! `image_exif` via `ExifDao::list_clip_index`. ~28–43 MB for a 14k
|
||||
//! library at ViT-L/14; loaded fresh per request — fast enough for
|
||||
//! v1, optimize via an AppState cache later if needed.
|
||||
//! 4. Dot product (= cosine since both sides are L2-normalized), filter
|
||||
//! above `threshold`, top-K by score.
|
||||
//! 5. Resolve each surviving hash back to a `(library_id, rel_path)` so
|
||||
//! the frontend can render the photo / hand off to the carousel.
|
||||
//!
|
||||
//! Response shape is intentionally minimal — paths + score — so the
|
||||
//! frontend can reuse existing PhotoGrid rendering by joining against
|
||||
//! `/api/photos/match` (or calling `/image/metadata` lazily). Don't
|
||||
//! bake camera/EXIF metadata into this route; it would force a fan-out
|
||||
//! per result and balloon the response.
|
||||
|
||||
use crate::AppState;
|
||||
use crate::ai::clip_client::ClipError;
|
||||
use crate::database::ExifDao;
|
||||
use actix_web::{HttpResponse, Result as ActixResult, web};
|
||||
use base64::Engine;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::sync::Mutex;
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct SearchQuery {
|
||||
/// Natural-language query. Required; empty triggers 400.
|
||||
pub q: String,
|
||||
/// Max results to return. Capped to 200 server-side; the UI almost
|
||||
/// always wants ≤50. Defaults to 20.
|
||||
#[serde(default = "default_limit")]
|
||||
pub limit: usize,
|
||||
/// Cosine-similarity floor below which results are dropped.
|
||||
/// 0.20 is the rough "this is plausibly relevant" line for OpenAI
|
||||
/// CLIP; tunable per call when sweeping. Defaults to 0.20.
|
||||
#[serde(default = "default_threshold")]
|
||||
pub threshold: f32,
|
||||
/// Optional single-library scope. When omitted, every enabled
|
||||
/// library is searched. Multi-select isn't supported yet — the
|
||||
/// frontend wires through one or all.
|
||||
pub library: Option<i32>,
|
||||
/// Optional model-version filter. Defaults to the live engine's
|
||||
/// version (queried lazily). Forces a strict join so mid-flight
|
||||
/// model swaps can't mix geometries in a single response.
|
||||
#[serde(default)]
|
||||
pub model_version: Option<String>,
|
||||
}
|
||||
|
||||
fn default_limit() -> usize {
|
||||
20
|
||||
}
|
||||
|
||||
fn default_threshold() -> f32 {
|
||||
0.20
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct SearchHit {
|
||||
pub library_id: i32,
|
||||
pub rel_path: String,
|
||||
pub content_hash: String,
|
||||
/// Cosine similarity in [-1, 1]. In practice OpenAI CLIP returns
|
||||
/// 0.10–0.40 for the typical photo library.
|
||||
pub score: f32,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
pub struct SearchResponse {
|
||||
pub query: String,
|
||||
pub model_version: String,
|
||||
pub threshold: f32,
|
||||
pub considered: usize,
|
||||
pub results: Vec<SearchHit>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
struct SearchError {
|
||||
error: String,
|
||||
}
|
||||
|
||||
/// Decode a stored `clip_embedding` BLOB back into a `Vec<f32>`. Returns
|
||||
/// `None` on malformed bytes — those rows get skipped rather than
|
||||
/// failing the whole query.
|
||||
fn decode_embedding(bytes: &[u8]) -> Option<Vec<f32>> {
|
||||
if bytes.is_empty() || bytes.len() % 4 != 0 {
|
||||
return None;
|
||||
}
|
||||
let mut out = Vec::with_capacity(bytes.len() / 4);
|
||||
for chunk in bytes.chunks_exact(4) {
|
||||
out.push(f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]));
|
||||
}
|
||||
Some(out)
|
||||
}
|
||||
|
||||
#[inline]
|
||||
fn dot(a: &[f32], b: &[f32]) -> f32 {
|
||||
a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
|
||||
}
|
||||
|
||||
pub async fn search_photos(
|
||||
state: web::Data<AppState>,
|
||||
exif_dao: web::Data<Mutex<Box<dyn ExifDao>>>,
|
||||
query: web::Query<SearchQuery>,
|
||||
) -> ActixResult<HttpResponse> {
|
||||
let q_text = query.q.trim().to_string();
|
||||
if q_text.is_empty() {
|
||||
return Ok(HttpResponse::BadRequest().json(SearchError {
|
||||
error: "query parameter `q` is required".into(),
|
||||
}));
|
||||
}
|
||||
if !state.clip_client.is_enabled() {
|
||||
return Ok(HttpResponse::ServiceUnavailable().json(SearchError {
|
||||
error: "CLIP search is disabled (no Apollo CLIP endpoint configured)".into(),
|
||||
}));
|
||||
}
|
||||
|
||||
let limit = query.limit.min(200).max(1);
|
||||
let threshold = query.threshold.clamp(-1.0, 1.0);
|
||||
|
||||
// 1. Encode the query text. Fast — Apollo's text encoder is ~50ms
|
||||
// on CPU. Bail with a clear error message if Apollo's down so the
|
||||
// user sees "service unavailable" rather than empty results.
|
||||
let query_resp = match state.clip_client.encode_text(&q_text).await {
|
||||
Ok(r) => r,
|
||||
Err(ClipError::Permanent(e)) => {
|
||||
return Ok(HttpResponse::BadRequest().json(SearchError {
|
||||
error: format!("query rejected: {e}"),
|
||||
}));
|
||||
}
|
||||
Err(ClipError::Transient(e)) => {
|
||||
return Ok(HttpResponse::BadGateway().json(SearchError {
|
||||
error: format!("CLIP service unavailable: {e}"),
|
||||
}));
|
||||
}
|
||||
Err(ClipError::Disabled) => {
|
||||
return Ok(HttpResponse::ServiceUnavailable().json(SearchError {
|
||||
error: "CLIP service disabled".into(),
|
||||
}));
|
||||
}
|
||||
};
|
||||
// decode_embedding works on raw bytes; the wire format is b64.
|
||||
let query_bytes = base64::engine::general_purpose::STANDARD
|
||||
.decode(query_resp.embedding.as_bytes())
|
||||
.unwrap_or_default();
|
||||
let query_vec = match decode_embedding(&query_bytes) {
|
||||
Some(v) => v,
|
||||
None => {
|
||||
return Ok(HttpResponse::BadGateway().json(SearchError {
|
||||
error: "CLIP service returned a malformed query embedding".into(),
|
||||
}));
|
||||
}
|
||||
};
|
||||
|
||||
// 2. Decide which library scope to search.
|
||||
let library_ids: Vec<i32> = match query.library {
|
||||
Some(id) => vec![id],
|
||||
None => Vec::new(), // empty = all libraries
|
||||
};
|
||||
|
||||
// 3. Pull the (hash, embedding) matrix. Lock contention here is
|
||||
// bounded — one big SELECT under a mutex Arc<Mutex<dyn ExifDao>>
|
||||
// and then we release before scoring. If this becomes a hotspot
|
||||
// we'll cache the decoded matrix in AppState with TTL.
|
||||
let ctx = opentelemetry::Context::current();
|
||||
let rows: Vec<(String, Vec<u8>)> = {
|
||||
let mut dao = exif_dao.lock().expect("exif dao");
|
||||
match dao.list_clip_index(
|
||||
&ctx,
|
||||
&library_ids,
|
||||
query.model_version.as_deref().or(Some(&query_resp.model_version)),
|
||||
) {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
log::warn!("clip_search: list_clip_index failed: {:?}", e);
|
||||
return Ok(HttpResponse::InternalServerError().json(SearchError {
|
||||
error: "failed to load search index".into(),
|
||||
}));
|
||||
}
|
||||
}
|
||||
};
|
||||
let considered = rows.len();
|
||||
if considered == 0 {
|
||||
return Ok(HttpResponse::Ok().json(SearchResponse {
|
||||
query: q_text,
|
||||
model_version: query_resp.model_version,
|
||||
threshold,
|
||||
considered,
|
||||
results: Vec::new(),
|
||||
}));
|
||||
}
|
||||
|
||||
// 4. Score. Cap the loop's transient allocation; we keep all scores
|
||||
// and sort at the end. With ~14k entries the sort is microseconds.
|
||||
let mut scored: Vec<(f32, String)> = Vec::with_capacity(considered);
|
||||
for (hash, blob) in rows {
|
||||
let Some(emb) = decode_embedding(&blob) else {
|
||||
continue;
|
||||
};
|
||||
if emb.len() != query_vec.len() {
|
||||
continue;
|
||||
}
|
||||
let sim = dot(&emb, &query_vec);
|
||||
if sim < threshold {
|
||||
continue;
|
||||
}
|
||||
scored.push((sim, hash));
|
||||
}
|
||||
scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
|
||||
scored.truncate(limit);
|
||||
|
||||
if scored.is_empty() {
|
||||
return Ok(HttpResponse::Ok().json(SearchResponse {
|
||||
query: q_text,
|
||||
model_version: query_resp.model_version,
|
||||
threshold,
|
||||
considered,
|
||||
results: Vec::new(),
|
||||
}));
|
||||
}
|
||||
|
||||
// 5. Resolve each surviving hash back to a `(library_id, rel_path)`.
|
||||
// `get_rel_paths_by_hash` returns every rel_path; we pick the first
|
||||
// one for the result. Apollo / the UI can fetch alternatives via
|
||||
// /image/metadata when needed.
|
||||
let hashes: Vec<String> = scored.iter().map(|(_, h)| h.clone()).collect();
|
||||
let path_map = {
|
||||
let mut dao = exif_dao.lock().expect("exif dao");
|
||||
match dao.get_rel_paths_for_hashes(&ctx, &hashes) {
|
||||
Ok(m) => m,
|
||||
Err(e) => {
|
||||
log::warn!("clip_search: get_rel_paths_for_hashes failed: {:?}", e);
|
||||
return Ok(HttpResponse::InternalServerError().json(SearchError {
|
||||
error: "failed to resolve photo paths".into(),
|
||||
}));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// We need (library_id, rel_path) — get_rel_paths_for_hashes only
|
||||
// returns rel_paths. Cross-reference via find_by_content_hash to
|
||||
// pick the library too. Single call per surviving hash; cheap at
|
||||
// top-20.
|
||||
let mut results = Vec::with_capacity(scored.len());
|
||||
{
|
||||
let mut dao = exif_dao.lock().expect("exif dao");
|
||||
for (score, hash) in scored {
|
||||
let row = match dao.find_by_content_hash(&ctx, &hash) {
|
||||
Ok(Some(r)) => r,
|
||||
Ok(None) => continue,
|
||||
Err(e) => {
|
||||
log::warn!(
|
||||
"clip_search: find_by_content_hash failed for {}: {:?}",
|
||||
hash, e
|
||||
);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
// Prefer get_rel_paths_for_hashes's first entry if it
|
||||
// exists (it shares semantics with `image_exif`'s natural
|
||||
// order), falling back to the ImageExif row.
|
||||
let rel_path = path_map
|
||||
.get(&hash)
|
||||
.and_then(|paths| paths.first().cloned())
|
||||
.unwrap_or(row.file_path);
|
||||
results.push(SearchHit {
|
||||
library_id: row.library_id,
|
||||
rel_path,
|
||||
content_hash: hash,
|
||||
score,
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
Ok(HttpResponse::Ok().json(SearchResponse {
|
||||
query: q_text,
|
||||
model_version: query_resp.model_version,
|
||||
threshold,
|
||||
considered,
|
||||
results,
|
||||
}))
|
||||
}
|
||||
Reference in New Issue
Block a user