clip-search: backlog drain + /photos/search endpoint

Wires the persistence layer for CLIP semantic search. The watcher's per-tick drain encodes any image_exif row with a known content_hash but no clip_embedding via Apollo (cap CLIP_BACKLOG_MAX_PER_TICK, default 32). On a query, /photos/search encodes the text via Apollo and reranks every stored embedding in-memory. ExifDao additions: - list_clip_unencoded_candidates — partial-index scan for drain - backfill_clip_embedding — touches only the two new columns - list_clip_index — dedup'd (hash, embedding) pull for search clip_watch::run_clip_encoding_pass is the parallel fan-out — tokio runtime per pass with CLIP_ENCODE_CONCURRENCY (default 4). No marker rows for permanent failures yet; per-tick cap bounds the retry cost. /photos/search params: q, limit, threshold (default 0.20), library, model_version. Response is intentionally minimal (path + score) so the frontend joins against existing photo-metadata routes lazily. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-14 14:00:41 -04:00
parent 8d9e76cf15
commit 32195ed89e
9 changed files with 875 additions and 0 deletions
--- a/src/clip_search.rs
+++ b/src/clip_search.rs
@@ -0,0 +1,288 @@
+//! `/photos/search?q=<text>` — CLIP semantic photo search.
+//!
+//! The route lives outside `files.rs` to keep that 1500+ line module
+//! focused on EXIF / tag listing. The flow is:
+//!
+//! 1. Parse query params (`q`, `limit`, `threshold`, optional `library`).
+//! 2. Call Apollo's `/api/internal/clip/encode_text` to get the query
+//!    vector (L2-normalized 768-d f32 for ViT-L/14).
+//! 3. Load every `(content_hash, clip_embedding)` for the scope from
+//!    `image_exif` via `ExifDao::list_clip_index`. ~28–43 MB for a 14k
+//!    library at ViT-L/14; loaded fresh per request — fast enough for
+//!    v1, optimize via an AppState cache later if needed.
+//! 4. Dot product (= cosine since both sides are L2-normalized), filter
+//!    above `threshold`, top-K by score.
+//! 5. Resolve each surviving hash back to a `(library_id, rel_path)` so
+//!    the frontend can render the photo / hand off to the carousel.
+//!
+//! Response shape is intentionally minimal — paths + score — so the
+//! frontend can reuse existing PhotoGrid rendering by joining against
+//! `/api/photos/match` (or calling `/image/metadata` lazily). Don't
+//! bake camera/EXIF metadata into this route; it would force a fan-out
+//! per result and balloon the response.
+
+use crate::AppState;
+use crate::ai::clip_client::ClipError;
+use crate::database::ExifDao;
+use actix_web::{HttpResponse, Result as ActixResult, web};
+use base64::Engine;
+use serde::{Deserialize, Serialize};
+use std::sync::Mutex;
+
+#[derive(Debug, Deserialize)]
+pub struct SearchQuery {
+    /// Natural-language query. Required; empty triggers 400.
+    pub q: String,
+    /// Max results to return. Capped to 200 server-side; the UI almost
+    /// always wants ≤50. Defaults to 20.
+    #[serde(default = "default_limit")]
+    pub limit: usize,
+    /// Cosine-similarity floor below which results are dropped.
+    /// 0.20 is the rough "this is plausibly relevant" line for OpenAI
+    /// CLIP; tunable per call when sweeping. Defaults to 0.20.
+    #[serde(default = "default_threshold")]
+    pub threshold: f32,
+    /// Optional single-library scope. When omitted, every enabled
+    /// library is searched. Multi-select isn't supported yet — the
+    /// frontend wires through one or all.
+    pub library: Option<i32>,
+    /// Optional model-version filter. Defaults to the live engine's
+    /// version (queried lazily). Forces a strict join so mid-flight
+    /// model swaps can't mix geometries in a single response.
+    #[serde(default)]
+    pub model_version: Option<String>,
+}
+
+fn default_limit() -> usize {
+    20
+}
+
+fn default_threshold() -> f32 {
+    0.20
+}
+
+#[derive(Debug, Serialize)]
+pub struct SearchHit {
+    pub library_id: i32,
+    pub rel_path: String,
+    pub content_hash: String,
+    /// Cosine similarity in [-1, 1]. In practice OpenAI CLIP returns
+    /// 0.10–0.40 for the typical photo library.
+    pub score: f32,
+}
+
+#[derive(Debug, Serialize)]
+pub struct SearchResponse {
+    pub query: String,
+    pub model_version: String,
+    pub threshold: f32,
+    pub considered: usize,
+    pub results: Vec<SearchHit>,
+}
+
+#[derive(Debug, Serialize)]
+struct SearchError {
+    error: String,
+}
+
+/// Decode a stored `clip_embedding` BLOB back into a `Vec<f32>`. Returns
+/// `None` on malformed bytes — those rows get skipped rather than
+/// failing the whole query.
+fn decode_embedding(bytes: &[u8]) -> Option<Vec<f32>> {
+    if bytes.is_empty() || bytes.len() % 4 != 0 {
+        return None;
+    }
+    let mut out = Vec::with_capacity(bytes.len() / 4);
+    for chunk in bytes.chunks_exact(4) {
+        out.push(f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]));
+    }
+    Some(out)
+}
+
+#[inline]
+fn dot(a: &[f32], b: &[f32]) -> f32 {
+    a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
+}
+
+pub async fn search_photos(
+    state: web::Data<AppState>,
+    exif_dao: web::Data<Mutex<Box<dyn ExifDao>>>,
+    query: web::Query<SearchQuery>,
+) -> ActixResult<HttpResponse> {
+    let q_text = query.q.trim().to_string();
+    if q_text.is_empty() {
+        return Ok(HttpResponse::BadRequest().json(SearchError {
+            error: "query parameter `q` is required".into(),
+        }));
+    }
+    if !state.clip_client.is_enabled() {
+        return Ok(HttpResponse::ServiceUnavailable().json(SearchError {
+            error: "CLIP search is disabled (no Apollo CLIP endpoint configured)".into(),
+        }));
+    }
+
+    let limit = query.limit.min(200).max(1);
+    let threshold = query.threshold.clamp(-1.0, 1.0);
+
+    // 1. Encode the query text. Fast — Apollo's text encoder is ~50ms
+    // on CPU. Bail with a clear error message if Apollo's down so the
+    // user sees "service unavailable" rather than empty results.
+    let query_resp = match state.clip_client.encode_text(&q_text).await {
+        Ok(r) => r,
+        Err(ClipError::Permanent(e)) => {
+            return Ok(HttpResponse::BadRequest().json(SearchError {
+                error: format!("query rejected: {e}"),
+            }));
+        }
+        Err(ClipError::Transient(e)) => {
+            return Ok(HttpResponse::BadGateway().json(SearchError {
+                error: format!("CLIP service unavailable: {e}"),
+            }));
+        }
+        Err(ClipError::Disabled) => {
+            return Ok(HttpResponse::ServiceUnavailable().json(SearchError {
+                error: "CLIP service disabled".into(),
+            }));
+        }
+    };
+    // decode_embedding works on raw bytes; the wire format is b64.
+    let query_bytes = base64::engine::general_purpose::STANDARD
+        .decode(query_resp.embedding.as_bytes())
+        .unwrap_or_default();
+    let query_vec = match decode_embedding(&query_bytes) {
+        Some(v) => v,
+        None => {
+            return Ok(HttpResponse::BadGateway().json(SearchError {
+                error: "CLIP service returned a malformed query embedding".into(),
+            }));
+        }
+    };
+
+    // 2. Decide which library scope to search.
+    let library_ids: Vec<i32> = match query.library {
+        Some(id) => vec![id],
+        None => Vec::new(), // empty = all libraries
+    };
+
+    // 3. Pull the (hash, embedding) matrix. Lock contention here is
+    // bounded — one big SELECT under a mutex Arc<Mutex<dyn ExifDao>>
+    // and then we release before scoring. If this becomes a hotspot
+    // we'll cache the decoded matrix in AppState with TTL.
+    let ctx = opentelemetry::Context::current();
+    let rows: Vec<(String, Vec<u8>)> = {
+        let mut dao = exif_dao.lock().expect("exif dao");
+        match dao.list_clip_index(
+            &ctx,
+            &library_ids,
+            query.model_version.as_deref().or(Some(&query_resp.model_version)),
+        ) {
+            Ok(r) => r,
+            Err(e) => {
+                log::warn!("clip_search: list_clip_index failed: {:?}", e);
+                return Ok(HttpResponse::InternalServerError().json(SearchError {
+                    error: "failed to load search index".into(),
+                }));
+            }
+        }
+    };
+    let considered = rows.len();
+    if considered == 0 {
+        return Ok(HttpResponse::Ok().json(SearchResponse {
+            query: q_text,
+            model_version: query_resp.model_version,
+            threshold,
+            considered,
+            results: Vec::new(),
+        }));
+    }
+
+    // 4. Score. Cap the loop's transient allocation; we keep all scores
+    // and sort at the end. With ~14k entries the sort is microseconds.
+    let mut scored: Vec<(f32, String)> = Vec::with_capacity(considered);
+    for (hash, blob) in rows {
+        let Some(emb) = decode_embedding(&blob) else {
+            continue;
+        };
+        if emb.len() != query_vec.len() {
+            continue;
+        }
+        let sim = dot(&emb, &query_vec);
+        if sim < threshold {
+            continue;
+        }
+        scored.push((sim, hash));
+    }
+    scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
+    scored.truncate(limit);
+
+    if scored.is_empty() {
+        return Ok(HttpResponse::Ok().json(SearchResponse {
+            query: q_text,
+            model_version: query_resp.model_version,
+            threshold,
+            considered,
+            results: Vec::new(),
+        }));
+    }
+
+    // 5. Resolve each surviving hash back to a `(library_id, rel_path)`.
+    // `get_rel_paths_by_hash` returns every rel_path; we pick the first
+    // one for the result. Apollo / the UI can fetch alternatives via
+    // /image/metadata when needed.
+    let hashes: Vec<String> = scored.iter().map(|(_, h)| h.clone()).collect();
+    let path_map = {
+        let mut dao = exif_dao.lock().expect("exif dao");
+        match dao.get_rel_paths_for_hashes(&ctx, &hashes) {
+            Ok(m) => m,
+            Err(e) => {
+                log::warn!("clip_search: get_rel_paths_for_hashes failed: {:?}", e);
+                return Ok(HttpResponse::InternalServerError().json(SearchError {
+                    error: "failed to resolve photo paths".into(),
+                }));
+            }
+        }
+    };
+
+    // We need (library_id, rel_path) — get_rel_paths_for_hashes only
+    // returns rel_paths. Cross-reference via find_by_content_hash to
+    // pick the library too. Single call per surviving hash; cheap at
+    // top-20.
+    let mut results = Vec::with_capacity(scored.len());
+    {
+        let mut dao = exif_dao.lock().expect("exif dao");
+        for (score, hash) in scored {
+            let row = match dao.find_by_content_hash(&ctx, &hash) {
+                Ok(Some(r)) => r,
+                Ok(None) => continue,
+                Err(e) => {
+                    log::warn!(
+                        "clip_search: find_by_content_hash failed for {}: {:?}",
+                        hash, e
+                    );
+                    continue;
+                }
+            };
+            // Prefer get_rel_paths_for_hashes's first entry if it
+            // exists (it shares semantics with `image_exif`'s natural
+            // order), falling back to the ImageExif row.
+            let rel_path = path_map
+                .get(&hash)
+                .and_then(|paths| paths.first().cloned())
+                .unwrap_or(row.file_path);
+            results.push(SearchHit {
+                library_id: row.library_id,
+                rel_path,
+                content_hash: hash,
+                score,
+            });
+        }
+    }
+
+    Ok(HttpResponse::Ok().json(SearchResponse {
+        query: q_text,
+        model_version: query_resp.model_version,
+        threshold,
+        considered,
+        results,
+    }))
+}