Files
ImageApi/src/clip_search.rs
Cameron Cordes 87093a63d7 clip-search: accept library_ids (multi-select whitelist) on /photos/search
Previously the endpoint only accepted `library=<id>` (single id) — multi-
select scopes had to be filtered upstream by Apollo, which kept the
filter logic out of FileViewer-React's reach (it calls ImageApi
directly and got no scoping for 2+ active libraries).

Adds `library_ids` (comma-separated id list, e.g. `?library_ids=1,3`).
Parsed inside the existing scope decision: `library_ids` wins when
both are supplied; either / both empty falls back to "every enabled
library" (historical default). Malformed entries return 400.

Dedupes ids while preserving order so a stray `library_ids=1,1,3`
doesn't double-pass to the DAO. The single-id path still works
unchanged for older clients.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 09:30:46 -04:00

353 lines
13 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
//! `/photos/search?q=<text>` — CLIP semantic photo search.
//!
//! The route lives outside `files.rs` to keep that 1500+ line module
//! focused on EXIF / tag listing. The flow is:
//!
//! 1. Parse query params (`q`, `limit`, `threshold`, optional `library`).
//! 2. Call Apollo's `/api/internal/clip/encode_text` to get the query
//! vector (L2-normalized 768-d f32 for ViT-L/14).
//! 3. Load every `(content_hash, clip_embedding)` for the scope from
//! `image_exif` via `ExifDao::list_clip_index`. ~2843 MB for a 14k
//! library at ViT-L/14; loaded fresh per request — fast enough for
//! v1, optimize via an AppState cache later if needed.
//! 4. Dot product (= cosine since both sides are L2-normalized), filter
//! above `threshold`, top-K by score.
//! 5. Resolve each surviving hash back to a `(library_id, rel_path)` so
//! the frontend can render the photo / hand off to the carousel.
//!
//! Response shape is intentionally minimal — paths + score — so the
//! frontend can reuse existing PhotoGrid rendering by joining against
//! `/api/photos/match` (or calling `/image/metadata` lazily). Don't
//! bake camera/EXIF metadata into this route; it would force a fan-out
//! per result and balloon the response.
use crate::AppState;
use crate::ai::clip_client::ClipError;
use crate::database::ExifDao;
use actix_web::{HttpResponse, Result as ActixResult, web};
use base64::Engine;
use serde::{Deserialize, Serialize};
use std::sync::Mutex;
#[derive(Debug, Deserialize)]
pub struct SearchQuery {
/// Natural-language query. Required; empty triggers 400.
pub q: String,
/// Max results to return in this page. Capped to 200 server-side.
/// Defaults to 20. Pair with `offset` for pagination.
#[serde(default = "default_limit")]
pub limit: usize,
/// Zero-based offset into the sorted-and-filtered result set. The
/// scoring loop still runs over the full embedding matrix on every
/// page (cheap at personal-library scale — sub-100ms — and avoids
/// stateful pagination cursors). Defaults to 0.
#[serde(default)]
pub offset: usize,
/// Cosine-similarity floor below which results are dropped.
/// 0.20 is the rough "this is plausibly relevant" line for OpenAI
/// CLIP; tunable per call when sweeping. Defaults to 0.20.
#[serde(default = "default_threshold")]
pub threshold: f32,
/// Optional single-library scope. Legacy param — new clients pass
/// `library_ids` instead so multi-select scopes (Apollo's HUD library
/// chips, FileViewer-React's library picker) actually filter. Kept
/// for back-compat; `library_ids` wins when both are supplied.
pub library: Option<i32>,
/// Optional multi-library scope, comma-separated id list
/// (`?library_ids=1,3`). Empty / omitted = every enabled library
/// (the historical default). Apollo and FileViewer-React both send
/// this when 2+ libraries are selected; the single-library case
/// works through either param interchangeably.
pub library_ids: Option<String>,
/// Optional model-version filter. Defaults to the live engine's
/// version (queried lazily). Forces a strict join so mid-flight
/// model swaps can't mix geometries in a single response.
#[serde(default)]
pub model_version: Option<String>,
}
fn default_limit() -> usize {
20
}
fn default_threshold() -> f32 {
0.20
}
#[derive(Debug, Serialize)]
pub struct SearchHit {
pub library_id: i32,
pub rel_path: String,
pub content_hash: String,
/// Cosine similarity in [-1, 1]. In practice OpenAI CLIP returns
/// 0.100.40 for the typical photo library.
pub score: f32,
}
#[derive(Debug, Serialize)]
pub struct SearchResponse {
pub query: String,
pub model_version: String,
pub threshold: f32,
/// Total embeddings scored (= every photo in scope with a stored
/// embedding). Same value across pages of the same query.
pub considered: usize,
/// Count of results above threshold, before pagination. Lets the
/// client decide whether a "Load more" button is meaningful and
/// stop fetching when ``offset + results.len() >= total_matching``.
pub total_matching: usize,
pub offset: usize,
pub results: Vec<SearchHit>,
}
#[derive(Debug, Serialize)]
struct SearchError {
error: String,
}
/// Decode a stored `clip_embedding` BLOB back into a `Vec<f32>`. Returns
/// `None` on malformed bytes — those rows get skipped rather than
/// failing the whole query.
fn decode_embedding(bytes: &[u8]) -> Option<Vec<f32>> {
if bytes.is_empty() || bytes.len() % 4 != 0 {
return None;
}
let mut out = Vec::with_capacity(bytes.len() / 4);
for chunk in bytes.chunks_exact(4) {
out.push(f32::from_le_bytes([chunk[0], chunk[1], chunk[2], chunk[3]]));
}
Some(out)
}
#[inline]
fn dot(a: &[f32], b: &[f32]) -> f32 {
a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
}
pub async fn search_photos(
state: web::Data<AppState>,
exif_dao: web::Data<Mutex<Box<dyn ExifDao>>>,
query: web::Query<SearchQuery>,
) -> ActixResult<HttpResponse> {
let q_text = query.q.trim().to_string();
if q_text.is_empty() {
return Ok(HttpResponse::BadRequest().json(SearchError {
error: "query parameter `q` is required".into(),
}));
}
if !state.clip_client.is_enabled() {
return Ok(HttpResponse::ServiceUnavailable().json(SearchError {
error: "CLIP search is disabled (no Apollo CLIP endpoint configured)".into(),
}));
}
let limit = query.limit.clamp(1, 200);
let offset = query.offset;
let threshold = query.threshold.clamp(-1.0, 1.0);
// 1. Encode the query text. Fast — Apollo's text encoder is ~50ms
// on CPU. Bail with a clear error message if Apollo's down so the
// user sees "service unavailable" rather than empty results.
let query_resp = match state.clip_client.encode_text(&q_text).await {
Ok(r) => r,
Err(ClipError::Permanent(e)) => {
return Ok(HttpResponse::BadRequest().json(SearchError {
error: format!("query rejected: {e}"),
}));
}
Err(ClipError::Transient(e)) => {
return Ok(HttpResponse::BadGateway().json(SearchError {
error: format!("CLIP service unavailable: {e}"),
}));
}
Err(ClipError::Disabled) => {
return Ok(HttpResponse::ServiceUnavailable().json(SearchError {
error: "CLIP service disabled".into(),
}));
}
};
// decode_embedding works on raw bytes; the wire format is b64.
let query_bytes = base64::engine::general_purpose::STANDARD
.decode(query_resp.embedding.as_bytes())
.unwrap_or_default();
let query_vec = match decode_embedding(&query_bytes) {
Some(v) => v,
None => {
return Ok(HttpResponse::BadGateway().json(SearchError {
error: "CLIP service returned a malformed query embedding".into(),
}));
}
};
// 2. Decide which library scope to search. `library_ids` (multi)
// wins over the legacy `library` (single) when both are present;
// either / both empty falls back to "every enabled library".
let library_ids: Vec<i32> = if let Some(raw) = query.library_ids.as_deref() {
let mut out: Vec<i32> = Vec::new();
for piece in raw.split(',') {
let trimmed = piece.trim();
if trimmed.is_empty() {
continue;
}
match trimmed.parse::<i32>() {
Ok(id) => {
if !out.contains(&id) {
out.push(id);
}
}
Err(_) => {
return Ok(HttpResponse::BadRequest().json(SearchError {
error: format!("invalid library_ids entry: {trimmed:?}"),
}));
}
}
}
out
} else if let Some(id) = query.library {
vec![id]
} else {
Vec::new()
};
// 3. Pull the (hash, embedding) matrix. Lock contention here is
// bounded — one big SELECT under a mutex Arc<Mutex<dyn ExifDao>>
// and then we release before scoring. If this becomes a hotspot
// we'll cache the decoded matrix in AppState with TTL.
let ctx = opentelemetry::Context::current();
let rows: Vec<(String, Vec<u8>)> = {
let mut dao = exif_dao.lock().expect("exif dao");
match dao.list_clip_index(
&ctx,
&library_ids,
query
.model_version
.as_deref()
.or(Some(&query_resp.model_version)),
) {
Ok(r) => r,
Err(e) => {
log::warn!("clip_search: list_clip_index failed: {:?}", e);
return Ok(HttpResponse::InternalServerError().json(SearchError {
error: "failed to load search index".into(),
}));
}
}
};
let considered = rows.len();
if considered == 0 {
return Ok(HttpResponse::Ok().json(SearchResponse {
query: q_text,
model_version: query_resp.model_version,
threshold,
considered,
total_matching: 0,
offset,
results: Vec::new(),
}));
}
// 4. Score. Cap the loop's transient allocation; we keep all scores
// and sort at the end. With ~14k entries the sort is microseconds.
let mut scored: Vec<(f32, String)> = Vec::with_capacity(considered);
for (hash, blob) in rows {
let Some(emb) = decode_embedding(&blob) else {
continue;
};
if emb.len() != query_vec.len() {
continue;
}
let sim = dot(&emb, &query_vec);
if sim < threshold {
continue;
}
scored.push((sim, hash));
}
scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
let total_matching = scored.len();
// Pagination — slice the sorted list at `[offset, offset+limit)`.
// Offsets past the end produce empty pages rather than an error so
// the client can stop fetching naturally on "load more" past the end.
let scored: Vec<(f32, String)> = if offset >= total_matching {
Vec::new()
} else {
let end = (offset + limit).min(total_matching);
scored[offset..end].to_vec()
};
if scored.is_empty() {
return Ok(HttpResponse::Ok().json(SearchResponse {
query: q_text,
model_version: query_resp.model_version,
threshold,
considered,
total_matching,
offset,
results: Vec::new(),
}));
}
// 5. Resolve each surviving hash back to a `(library_id, rel_path)`.
// `get_rel_paths_by_hash` returns every rel_path; we pick the first
// one for the result. Apollo / the UI can fetch alternatives via
// /image/metadata when needed.
let hashes: Vec<String> = scored.iter().map(|(_, h)| h.clone()).collect();
let path_map = {
let mut dao = exif_dao.lock().expect("exif dao");
match dao.get_rel_paths_for_hashes(&ctx, &hashes) {
Ok(m) => m,
Err(e) => {
log::warn!("clip_search: get_rel_paths_for_hashes failed: {:?}", e);
return Ok(HttpResponse::InternalServerError().json(SearchError {
error: "failed to resolve photo paths".into(),
}));
}
}
};
// We need (library_id, rel_path) — get_rel_paths_for_hashes only
// returns rel_paths. Cross-reference via find_by_content_hash to
// pick the library too. Single call per surviving hash; cheap at
// top-20.
let mut results = Vec::with_capacity(scored.len());
{
let mut dao = exif_dao.lock().expect("exif dao");
for (score, hash) in scored {
let row = match dao.find_by_content_hash(&ctx, &hash) {
Ok(Some(r)) => r,
Ok(None) => continue,
Err(e) => {
log::warn!(
"clip_search: find_by_content_hash failed for {}: {:?}",
hash,
e
);
continue;
}
};
// Prefer get_rel_paths_for_hashes's first entry if it
// exists (it shares semantics with `image_exif`'s natural
// order), falling back to the ImageExif row.
let rel_path = path_map
.get(&hash)
.and_then(|paths| paths.first().cloned())
.unwrap_or(row.file_path);
results.push(SearchHit {
library_id: row.library_id,
rel_path,
content_hash: hash,
score,
});
}
}
Ok(HttpResponse::Ok().json(SearchResponse {
query: q_text,
model_version: query_resp.model_version,
threshold,
considered,
total_matching,
offset,
results,
}))
}