Merge pull request 'Feature/unified nl search' (#106) from feature/unified-nl-search into master
Reviewed-on: #106
This commit was merged in pull request #106.
This commit is contained in:
@@ -80,6 +80,16 @@ AGENTIC_CHAT_MAX_ITERATIONS=6
|
||||
# LLAMA_SWAP_ALLOWED_MODELS=chat,vision,embed
|
||||
# LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180
|
||||
|
||||
# ── Unified search translation model (optional) ─────────────────────────
|
||||
# /photos/search/unified runs one small LLM call to translate a natural-
|
||||
# language query into structured filters + a semantic term, then CLIP-ranks.
|
||||
# That step needs an LLM AND CLIP available at once. On a tight VRAM budget a
|
||||
# large chat model can't co-reside with CLIP, so pin a small, fast model here
|
||||
# (it can stay loaded alongside CLIP and the chat model). Precedence:
|
||||
# UNIFIED_SEARCH_MODEL > the client's selected model > the configured default.
|
||||
# Use the configured backend (LLM_BACKEND); local only — no hybrid.
|
||||
# UNIFIED_SEARCH_MODEL=qwen3-0.6b
|
||||
|
||||
# ── Text-to-speech (optional, requires LLAMA_SWAP_URL) ───────────────────
|
||||
# TTS routes through the same llama-swap proxy (a Chatterbox model id), so it
|
||||
# only needs LLAMA_SWAP_URL — it does NOT require LLM_BACKEND=llamacpp.
|
||||
|
||||
@@ -41,6 +41,10 @@ pub struct SamplingOverrides {
|
||||
pub top_p: Option<f32>,
|
||||
pub top_k: Option<i32>,
|
||||
pub min_p: Option<f32>,
|
||||
/// Reasoning toggle. Only the llama.cpp backend honors it (forwarded as
|
||||
/// `chat_template_kwargs.enable_thinking`); other backends ignore it.
|
||||
/// `None` leaves the model/template default in place.
|
||||
pub enable_thinking: Option<bool>,
|
||||
}
|
||||
|
||||
impl SamplingOverrides {
|
||||
@@ -124,6 +128,7 @@ mod tests {
|
||||
top_p: None,
|
||||
top_k: None,
|
||||
min_p: None,
|
||||
enable_thinking: None,
|
||||
};
|
||||
assert!(!empty.has_sampling());
|
||||
|
||||
@@ -134,6 +139,7 @@ mod tests {
|
||||
top_p: None,
|
||||
top_k: None,
|
||||
min_p: None,
|
||||
enable_thinking: None,
|
||||
};
|
||||
assert!(with_temp.has_sampling());
|
||||
}
|
||||
|
||||
@@ -191,11 +191,13 @@ impl ClipClient {
|
||||
let resp = match self.client.post(&url).json(&body).send().await {
|
||||
Ok(r) => r,
|
||||
Err(e) if e.is_timeout() || e.is_connect() => {
|
||||
log::warn!("clip encode_text network error to {url}: {e}");
|
||||
return Err(ClipError::Transient(anyhow::anyhow!(
|
||||
"clip client network: {e}"
|
||||
)));
|
||||
}
|
||||
Err(e) => {
|
||||
log::warn!("clip encode_text request error to {url}: {e}");
|
||||
return Err(ClipError::Transient(anyhow::anyhow!(
|
||||
"clip client request: {e}"
|
||||
)));
|
||||
@@ -210,6 +212,7 @@ impl ClipClient {
|
||||
return Ok(body);
|
||||
}
|
||||
let body_text = resp.text().await.unwrap_or_default();
|
||||
log::warn!("clip encode_text HTTP {status} from {url}: {body_text}");
|
||||
Err(classify_error_response(status.as_u16(), &body_text))
|
||||
}
|
||||
|
||||
|
||||
@@ -40,6 +40,12 @@ pub struct GeneratePhotoInsightRequest {
|
||||
pub top_k: Option<i32>,
|
||||
#[serde(default)]
|
||||
pub min_p: Option<f32>,
|
||||
/// Reasoning toggle for thinking-capable models. Forwarded to the
|
||||
/// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored
|
||||
/// by other backends and the non-agentic (Ollama) path. Only the agentic
|
||||
/// endpoint routes through llama.cpp. None defers to the template default.
|
||||
#[serde(default)]
|
||||
pub enable_thinking: Option<bool>,
|
||||
/// `"local"` (default, Ollama with images) | `"hybrid"` (local vision +
|
||||
/// OpenRouter chat). Only respected by the agentic endpoint.
|
||||
#[serde(default)]
|
||||
@@ -868,6 +874,7 @@ pub async fn generate_agentic_insight_handler(
|
||||
request.top_p,
|
||||
request.top_k,
|
||||
request.min_p,
|
||||
request.enable_thinking,
|
||||
max_iterations,
|
||||
request.backend.clone(),
|
||||
fewshot_examples,
|
||||
@@ -1169,6 +1176,11 @@ pub struct ChatTurnHttpRequest {
|
||||
pub top_k: Option<i32>,
|
||||
#[serde(default)]
|
||||
pub min_p: Option<f32>,
|
||||
/// Reasoning toggle for thinking-capable models. Forwarded to the
|
||||
/// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored
|
||||
/// by other backends. None defers to the model/template default.
|
||||
#[serde(default)]
|
||||
pub enable_thinking: Option<bool>,
|
||||
#[serde(default)]
|
||||
pub max_iterations: Option<usize>,
|
||||
/// Per-turn system-prompt override. Ephemeral in append mode,
|
||||
@@ -1247,6 +1259,7 @@ pub async fn chat_turn_handler(
|
||||
top_p: request.top_p,
|
||||
top_k: request.top_k,
|
||||
min_p: request.min_p,
|
||||
enable_thinking: request.enable_thinking,
|
||||
max_iterations: request.max_iterations,
|
||||
system_prompt: request.system_prompt.clone(),
|
||||
persona_id: request.persona_id.clone(),
|
||||
@@ -1473,6 +1486,7 @@ pub async fn chat_stream_handler(
|
||||
top_p: request.top_p,
|
||||
top_k: request.top_k,
|
||||
min_p: request.min_p,
|
||||
enable_thinking: request.enable_thinking,
|
||||
max_iterations: request.max_iterations,
|
||||
system_prompt: request.system_prompt.clone(),
|
||||
persona_id: request.persona_id.clone(),
|
||||
@@ -1618,6 +1632,7 @@ pub async fn turn_async_handler(
|
||||
top_p: request.top_p,
|
||||
top_k: request.top_k,
|
||||
min_p: request.min_p,
|
||||
enable_thinking: request.enable_thinking,
|
||||
max_iterations: request.max_iterations,
|
||||
system_prompt: request.system_prompt.clone(),
|
||||
persona_id: request.persona_id.clone(),
|
||||
|
||||
@@ -70,6 +70,10 @@ pub struct ChatTurnRequest {
|
||||
pub top_p: Option<f32>,
|
||||
pub top_k: Option<i32>,
|
||||
pub min_p: Option<f32>,
|
||||
/// Reasoning toggle for thinking-capable models. Forwarded to the
|
||||
/// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored
|
||||
/// by other backends. None defers to the model/template default.
|
||||
pub enable_thinking: Option<bool>,
|
||||
pub max_iterations: Option<usize>,
|
||||
/// Per-turn system-prompt override. In append mode (default), applied
|
||||
/// ephemerally — original system message restored before persistence.
|
||||
@@ -344,6 +348,7 @@ impl InsightChatService {
|
||||
top_p: req.top_p,
|
||||
top_k: req.top_k,
|
||||
min_p: req.min_p,
|
||||
enable_thinking: req.enable_thinking,
|
||||
};
|
||||
let backend = self.generator.resolve_backend(kind, &overrides).await?;
|
||||
let model_used = backend.model().to_string();
|
||||
@@ -847,6 +852,7 @@ impl InsightChatService {
|
||||
top_p: req.top_p,
|
||||
top_k: req.top_k,
|
||||
min_p: req.min_p,
|
||||
enable_thinking: req.enable_thinking,
|
||||
};
|
||||
let backend = self.generator.resolve_backend(kind, &overrides).await?;
|
||||
let model_used = backend.model().to_string();
|
||||
@@ -1017,6 +1023,7 @@ impl InsightChatService {
|
||||
top_p: req.top_p,
|
||||
top_k: req.top_k,
|
||||
min_p: req.min_p,
|
||||
enable_thinking: req.enable_thinking,
|
||||
};
|
||||
let backend = self.generator.resolve_backend(kind, &overrides).await?;
|
||||
let model_used = backend.model().to_string();
|
||||
@@ -1425,6 +1432,7 @@ impl InsightChatService {
|
||||
top_p: req.top_p,
|
||||
top_k: req.top_k,
|
||||
min_p: req.min_p,
|
||||
enable_thinking: req.enable_thinking,
|
||||
};
|
||||
let backend = self.generator.resolve_backend(kind, &overrides).await?;
|
||||
let model_used = backend.model().to_string();
|
||||
@@ -1607,6 +1615,7 @@ impl InsightChatService {
|
||||
top_p: req.top_p,
|
||||
top_k: req.top_k,
|
||||
min_p: req.min_p,
|
||||
enable_thinking: req.enable_thinking,
|
||||
};
|
||||
let backend = self.generator.resolve_backend(kind, &overrides).await?;
|
||||
let model_used = backend.model().to_string();
|
||||
|
||||
@@ -3933,6 +3933,7 @@ Return ONLY the summary, nothing else."#,
|
||||
if let Some(ctx) = overrides.num_ctx {
|
||||
c.set_num_ctx(Some(ctx));
|
||||
}
|
||||
c.set_enable_thinking(overrides.enable_thinking);
|
||||
Box::new(c)
|
||||
} else {
|
||||
// Pure Ollama local.
|
||||
@@ -4064,6 +4065,7 @@ Return ONLY the summary, nothing else."#,
|
||||
top_p: Option<f32>,
|
||||
top_k: Option<i32>,
|
||||
min_p: Option<f32>,
|
||||
enable_thinking: Option<bool>,
|
||||
max_iterations: usize,
|
||||
backend: Option<String>,
|
||||
fewshot_examples: Vec<Vec<ChatMessage>>,
|
||||
@@ -4091,6 +4093,7 @@ Return ONLY the summary, nothing else."#,
|
||||
top_p,
|
||||
top_k,
|
||||
min_p,
|
||||
enable_thinking,
|
||||
};
|
||||
let backend = self.resolve_backend(kind, &overrides).await?;
|
||||
span.set_attribute(KeyValue::new("model", backend.model().to_string()));
|
||||
|
||||
@@ -64,6 +64,12 @@ pub struct LlamaCppClient {
|
||||
top_p: Option<f32>,
|
||||
top_k: Option<i32>,
|
||||
min_p: Option<f32>,
|
||||
/// When `Some`, forwarded to llama-server as
|
||||
/// `chat_template_kwargs: {"enable_thinking": <bool>}`. The Jinja chat
|
||||
/// template (e.g. Qwen3) reads this to gate its reasoning block. `None`
|
||||
/// omits the key entirely, leaving the template's own default. Templates
|
||||
/// that don't reference the key ignore it, so sending it is harmless.
|
||||
enable_thinking: Option<bool>,
|
||||
}
|
||||
|
||||
impl LlamaCppClient {
|
||||
@@ -89,6 +95,7 @@ impl LlamaCppClient {
|
||||
top_p: None,
|
||||
top_k: None,
|
||||
min_p: None,
|
||||
enable_thinking: None,
|
||||
}
|
||||
}
|
||||
|
||||
@@ -104,6 +111,12 @@ impl LlamaCppClient {
|
||||
self.num_ctx = num_ctx;
|
||||
}
|
||||
|
||||
/// Set the reasoning toggle forwarded as `chat_template_kwargs.enable_thinking`.
|
||||
/// `None` leaves the chat template's own default in place.
|
||||
pub fn set_enable_thinking(&mut self, enable_thinking: Option<bool>) {
|
||||
self.enable_thinking = enable_thinking;
|
||||
}
|
||||
|
||||
pub fn set_sampling_params(
|
||||
&mut self,
|
||||
temperature: Option<f32>,
|
||||
@@ -458,6 +471,12 @@ impl LlamaCppClient {
|
||||
// via -c, so we silently drop the override here. The config.yaml
|
||||
// entry is the source of truth for context size.
|
||||
let _ = self.num_ctx;
|
||||
// Reasoning toggle for thinking-capable templates (Qwen3 et al.).
|
||||
// llama-server forwards chat_template_kwargs into the Jinja render
|
||||
// (requires --jinja); templates that ignore the key are unaffected.
|
||||
if let Some(think) = self.enable_thinking {
|
||||
v.push(("chat_template_kwargs", json!({ "enable_thinking": think })));
|
||||
}
|
||||
v
|
||||
}
|
||||
|
||||
|
||||
@@ -10,6 +10,7 @@ pub mod insight_generator;
|
||||
pub mod llamacpp;
|
||||
pub mod llm_client;
|
||||
pub mod local_llm;
|
||||
pub mod nl_query;
|
||||
pub mod ollama;
|
||||
pub mod openrouter;
|
||||
pub mod pronunciation;
|
||||
|
||||
@@ -0,0 +1,408 @@
|
||||
//! Natural-language → structured-query translation for unified photo search.
|
||||
//!
|
||||
//! The unified search endpoint (`/photos/search/unified`, Phase 2) needs to
|
||||
//! turn a free-text query like *"sunset photos in Italy from last summer"*
|
||||
//! into the structured filter the existing `/photos` engine understands plus
|
||||
//! a semantic term for CLIP ranking. That translation is a single grounded
|
||||
//! LLM call, isolated here so it can be unit-tested without a network or the
|
||||
//! full `InsightGenerator`.
|
||||
//!
|
||||
//! Two-stage design:
|
||||
//! 1. The LLM emits a [`RawNlQuery`] — references are by *name* (tags) and
|
||||
//! dates as ISO strings, never numeric ids it could hallucinate.
|
||||
//! 2. [`resolve_raw_query`] maps names against the real tag vocabulary and
|
||||
//! converts ISO dates to unix seconds, producing a [`StructuredQuery`].
|
||||
//! A tag the model invents that isn't in the vocab is surfaced in
|
||||
//! `unmatched_tags` (the caller folds it back into the semantic term)
|
||||
//! rather than silently dropped — this is the anti-noise guard.
|
||||
//!
|
||||
//! Geocoding of `place` and person filtering are intentionally *not* handled
|
||||
//! here: `place` stays as text for the caller to forward-geocode (async, see
|
||||
//! `geo::forward_geocode`), and person filtering is deferred until a
|
||||
//! person→photos resolver exists.
|
||||
|
||||
use crate::ai::llm_client::{ChatMessage, LlmClient, Tool, strip_think_blocks};
|
||||
use anyhow::{Result, anyhow};
|
||||
use serde::{Deserialize, Serialize};
|
||||
|
||||
/// Raw query object as emitted by the LLM. Tag references are by name
|
||||
/// (resolved against the real vocab in Rust); dates are ISO `YYYY-MM-DD`.
|
||||
/// Every field is optional so a partial / minimal model response still
|
||||
/// deserializes.
|
||||
#[derive(Debug, Clone, Default, Deserialize, PartialEq)]
|
||||
pub struct RawNlQuery {
|
||||
/// Visual/scene description handed to CLIP for ranking. The descriptive
|
||||
/// remainder after structured filters are peeled off.
|
||||
#[serde(default)]
|
||||
pub semantic: Option<String>,
|
||||
/// Tag names the photos must have. Matched case-insensitively against
|
||||
/// the supplied vocabulary; non-matches land in `unmatched_tags`.
|
||||
#[serde(default)]
|
||||
pub tags: Vec<String>,
|
||||
/// Tag names the photos must NOT have.
|
||||
#[serde(default)]
|
||||
pub exclude_tags: Vec<String>,
|
||||
#[serde(default)]
|
||||
pub camera_make: Option<String>,
|
||||
#[serde(default)]
|
||||
pub camera_model: Option<String>,
|
||||
#[serde(default)]
|
||||
pub lens_model: Option<String>,
|
||||
/// Free-text place/location name to forward-geocode (e.g. "Italy").
|
||||
#[serde(default)]
|
||||
pub place: Option<String>,
|
||||
/// Inclusive start date, ISO `YYYY-MM-DD`.
|
||||
#[serde(default)]
|
||||
pub date_from: Option<String>,
|
||||
/// Inclusive end date, ISO `YYYY-MM-DD`.
|
||||
#[serde(default)]
|
||||
pub date_to: Option<String>,
|
||||
/// "photo" | "video" — normalized in [`resolve_raw_query`].
|
||||
#[serde(default)]
|
||||
pub media_type: Option<String>,
|
||||
}
|
||||
|
||||
/// Resolved structured query: tag names mapped to ids against the real
|
||||
/// vocab, ISO dates converted to unix seconds. `place` stays as text for the
|
||||
/// caller to forward-geocode into a gps circle. Serializable so the endpoint
|
||||
/// can echo it back to the client as "this is how I read your query"
|
||||
/// (editable filter chips).
|
||||
#[derive(Debug, Clone, Default, PartialEq, Serialize)]
|
||||
pub struct StructuredQuery {
|
||||
pub semantic: Option<String>,
|
||||
pub tag_ids: Vec<i32>,
|
||||
pub exclude_tag_ids: Vec<i32>,
|
||||
/// Tag names the model produced that don't exist in the vocabulary.
|
||||
/// The caller folds these back into the semantic term so the concept
|
||||
/// isn't lost — and surfacing them keeps a hallucinated tag from
|
||||
/// silently filtering the whole library to nothing.
|
||||
pub unmatched_tags: Vec<String>,
|
||||
pub camera_make: Option<String>,
|
||||
pub camera_model: Option<String>,
|
||||
pub lens_model: Option<String>,
|
||||
/// Raw place name awaiting forward-geocoding by the caller.
|
||||
pub place: Option<String>,
|
||||
pub date_from: Option<i64>,
|
||||
pub date_to: Option<i64>,
|
||||
/// Normalized to "photo" | "video"; `None` means no media-type filter.
|
||||
pub media_type: Option<String>,
|
||||
}
|
||||
|
||||
/// Convert an ISO `YYYY-MM-DD` date to a unix timestamp (seconds). With
|
||||
/// `end_of_day`, returns 23:59:59 of that day so a `date_to` filter is
|
||||
/// inclusive of the whole day; otherwise 00:00:00. Returns `None` for any
|
||||
/// unparseable input (the filter is simply omitted rather than erroring).
|
||||
pub fn iso_to_unix(date: &str, end_of_day: bool) -> Option<i64> {
|
||||
let d = chrono::NaiveDate::parse_from_str(date.trim(), "%Y-%m-%d").ok()?;
|
||||
let time = if end_of_day {
|
||||
chrono::NaiveTime::from_hms_opt(23, 59, 59)?
|
||||
} else {
|
||||
chrono::NaiveTime::from_hms_opt(0, 0, 0)?
|
||||
};
|
||||
Some(d.and_time(time).and_utc().timestamp())
|
||||
}
|
||||
|
||||
/// Normalize a free-form media-type string to the engine's vocabulary.
|
||||
/// Anything that isn't clearly photo or video (including "all") yields
|
||||
/// `None` — no filter.
|
||||
fn normalize_media_type(raw: &str) -> Option<String> {
|
||||
match raw.trim().to_lowercase().as_str() {
|
||||
"photo" | "photos" | "image" | "images" | "picture" | "pictures" => {
|
||||
Some("photo".to_string())
|
||||
}
|
||||
"video" | "videos" | "movie" | "movies" | "clip" | "clips" => Some("video".to_string()),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
|
||||
/// Resolve a raw LLM query against the real tag vocabulary, producing the
|
||||
/// structured filter. Pure — no network, no LLM — so it carries the
|
||||
/// correctness-critical mapping logic under unit test.
|
||||
///
|
||||
/// `tag_vocab` is `(tag_id, tag_name)` pairs (the shape `TagDao::get_all_tags`
|
||||
/// yields once the count is dropped). Matching is case-insensitive and exact
|
||||
/// on the trimmed name.
|
||||
pub fn resolve_raw_query(raw: RawNlQuery, tag_vocab: &[(i32, String)]) -> StructuredQuery {
|
||||
// Case-insensitive name → id lookup. Built once per call.
|
||||
let lookup: std::collections::HashMap<String, i32> = tag_vocab
|
||||
.iter()
|
||||
.map(|(id, name)| (name.trim().to_lowercase(), *id))
|
||||
.collect();
|
||||
|
||||
let resolve_names = |names: &[String], ids: &mut Vec<i32>, unmatched: &mut Vec<String>| {
|
||||
for name in names {
|
||||
let key = name.trim().to_lowercase();
|
||||
if key.is_empty() {
|
||||
continue;
|
||||
}
|
||||
match lookup.get(&key) {
|
||||
Some(id) if !ids.contains(id) => ids.push(*id),
|
||||
Some(_) => {} // duplicate, already collected
|
||||
None => {
|
||||
if !unmatched.iter().any(|u| u.eq_ignore_ascii_case(name)) {
|
||||
unmatched.push(name.trim().to_string());
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
let mut tag_ids = Vec::new();
|
||||
let mut unmatched_tags = Vec::new();
|
||||
resolve_names(&raw.tags, &mut tag_ids, &mut unmatched_tags);
|
||||
|
||||
// Excluded tags that don't match a real tag are simply ignored — you
|
||||
// can't exclude a tag that doesn't exist, and folding them into
|
||||
// `semantic` would make no sense.
|
||||
let mut exclude_tag_ids = Vec::new();
|
||||
let mut exclude_unmatched = Vec::new();
|
||||
resolve_names(
|
||||
&raw.exclude_tags,
|
||||
&mut exclude_tag_ids,
|
||||
&mut exclude_unmatched,
|
||||
);
|
||||
|
||||
let clean = |s: Option<String>| s.map(|v| v.trim().to_string()).filter(|v| !v.is_empty());
|
||||
|
||||
StructuredQuery {
|
||||
semantic: clean(raw.semantic),
|
||||
tag_ids,
|
||||
exclude_tag_ids,
|
||||
unmatched_tags,
|
||||
camera_make: clean(raw.camera_make),
|
||||
camera_model: clean(raw.camera_model),
|
||||
lens_model: clean(raw.lens_model),
|
||||
place: clean(raw.place),
|
||||
date_from: raw.date_from.as_deref().and_then(|d| iso_to_unix(d, false)),
|
||||
date_to: raw.date_to.as_deref().and_then(|d| iso_to_unix(d, true)),
|
||||
media_type: raw.media_type.as_deref().and_then(normalize_media_type),
|
||||
}
|
||||
}
|
||||
|
||||
/// Build the grounded system prompt. The model is told the current date (so
|
||||
/// "last summer" resolves) and the exact tag vocabulary (so it uses real
|
||||
/// tags or routes the concept to `semantic` instead of inventing one).
|
||||
fn build_system_prompt(tag_vocab: &[(i32, String)], today: chrono::NaiveDate) -> String {
|
||||
// Cap the vocab dump so a huge library doesn't blow the context window;
|
||||
// the most-used tags are the ones a query is likely to reference.
|
||||
const MAX_TAGS: usize = 400;
|
||||
let mut names: Vec<&str> = tag_vocab.iter().map(|(_, n)| n.as_str()).collect();
|
||||
names.sort_unstable();
|
||||
names.dedup();
|
||||
let shown = names.len().min(MAX_TAGS);
|
||||
let vocab = names[..shown].join(", ");
|
||||
let truncation = if names.len() > MAX_TAGS {
|
||||
format!(" (showing {MAX_TAGS} of {} tags)", names.len())
|
||||
} else {
|
||||
String::new()
|
||||
};
|
||||
|
||||
format!(
|
||||
"You translate a user's natural-language photo-search request into a JSON \
|
||||
filter. Today's date is {today}. Respond with ONLY a JSON object, no prose, no \
|
||||
code fences.\n\n\
|
||||
Schema (all fields optional):\n\
|
||||
{{\n \
|
||||
\"semantic\": string|null, // visual scene/subject for image similarity search\n \
|
||||
\"tags\": string[], // ONLY names from the tag list below\n \
|
||||
\"exclude_tags\": string[], // ONLY names from the tag list below\n \
|
||||
\"camera_make\": string|null,\n \
|
||||
\"camera_model\": string|null,\n \
|
||||
\"lens_model\": string|null,\n \
|
||||
\"place\": string|null, // a location name to look up (city, country, landmark)\n \
|
||||
\"date_from\": \"YYYY-MM-DD\"|null, // inclusive\n \
|
||||
\"date_to\": \"YYYY-MM-DD\"|null, // inclusive\n \
|
||||
\"media_type\": \"photo\"|\"video\"|null\n\
|
||||
}}\n\n\
|
||||
Rules:\n\
|
||||
- Put descriptive/visual concepts (\"sunset\", \"crowded beach\", \"red car\") in \"semantic\".\n\
|
||||
- Only use \"tags\"/\"exclude_tags\" values that appear EXACTLY in the tag list. If a \
|
||||
concept isn't a listed tag, put it in \"semantic\" instead — never invent a tag.\n\
|
||||
- Resolve relative dates against today's date (\"last summer\", \"2023\", \"last month\").\n\
|
||||
- Put place/location names in \"place\" (not \"semantic\").\n\
|
||||
- Omit (use null / empty array) anything the request doesn't mention.\n\n\
|
||||
Available tags{truncation}: {vocab}"
|
||||
)
|
||||
}
|
||||
|
||||
/// Extract the JSON object from a model response that may include a leading
|
||||
/// `<think>` block, code fences, or trailing prose. Strips the think block
|
||||
/// first (so reasoning that mentions braces can't fool the scan), then
|
||||
/// returns the substring from the first `{` to the last `}` inclusive — or
|
||||
/// the trimmed text if no braces are found (which then fails to parse with a
|
||||
/// clear error).
|
||||
fn extract_json(raw: &str) -> String {
|
||||
let s = strip_think_blocks(raw);
|
||||
let start = s.find('{');
|
||||
let end = s.rfind('}');
|
||||
match (start, end) {
|
||||
(Some(a), Some(b)) if b >= a => s[a..=b].to_string(),
|
||||
_ => s.trim().to_string(),
|
||||
}
|
||||
}
|
||||
|
||||
/// Parse a model response string into a [`StructuredQuery`], resolving names
|
||||
/// against the vocab. Separated from the LLM call so it's unit-testable.
|
||||
pub fn parse_response(response: &str, tag_vocab: &[(i32, String)]) -> Result<StructuredQuery> {
|
||||
let json = extract_json(response);
|
||||
let raw: RawNlQuery = serde_json::from_str(&json)
|
||||
.map_err(|e| anyhow!("failed to parse NL query JSON: {e}; raw response: {response:?}"))?;
|
||||
Ok(resolve_raw_query(raw, tag_vocab))
|
||||
}
|
||||
|
||||
/// Translate a natural-language query into a [`StructuredQuery`] via one
|
||||
/// grounded LLM call. The `client` is any configured backend (the unified
|
||||
/// endpoint passes the resolved chat backend); `tag_vocab` grounds the tag
|
||||
/// mapping; `today` anchors relative-date resolution.
|
||||
pub async fn translate_nl_query(
|
||||
client: &dyn LlmClient,
|
||||
nl: &str,
|
||||
tag_vocab: &[(i32, String)],
|
||||
today: chrono::NaiveDate,
|
||||
) -> Result<StructuredQuery> {
|
||||
let system = build_system_prompt(tag_vocab, today);
|
||||
let messages = vec![ChatMessage::system(system), ChatMessage::user(nl)];
|
||||
let (msg, _, _) = client.chat_with_tools(messages, Vec::<Tool>::new()).await?;
|
||||
parse_response(&msg.content, tag_vocab)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
fn vocab() -> Vec<(i32, String)> {
|
||||
vec![
|
||||
(1, "beach".to_string()),
|
||||
(2, "Sunset".to_string()), // mixed case to exercise case-insensitivity
|
||||
(3, "family".to_string()),
|
||||
]
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iso_to_unix_start_and_end_of_day() {
|
||||
// 2023-01-01 UTC midnight = 1672531200.
|
||||
assert_eq!(iso_to_unix("2023-01-01", false), Some(1_672_531_200));
|
||||
// End of that day is 86399 seconds later.
|
||||
assert_eq!(
|
||||
iso_to_unix("2023-01-01", true),
|
||||
Some(1_672_531_200 + 86_399)
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn iso_to_unix_rejects_garbage() {
|
||||
assert_eq!(iso_to_unix("last summer", false), None);
|
||||
assert_eq!(iso_to_unix("2023-13-99", false), None);
|
||||
assert_eq!(iso_to_unix("", false), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn resolve_matches_tags_case_insensitively() {
|
||||
let raw = RawNlQuery {
|
||||
tags: vec!["BEACH".to_string(), "sunset".to_string()],
|
||||
..Default::default()
|
||||
};
|
||||
let q = resolve_raw_query(raw, &vocab());
|
||||
assert_eq!(q.tag_ids, vec![1, 2]);
|
||||
assert!(q.unmatched_tags.is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn resolve_surfaces_unmatched_tags_not_silently_dropped() {
|
||||
// A hallucinated / non-vocab tag must be surfaced so the caller can
|
||||
// fold it into semantic — never silently used as a hard filter.
|
||||
let raw = RawNlQuery {
|
||||
tags: vec!["beach".to_string(), "golden hour".to_string()],
|
||||
..Default::default()
|
||||
};
|
||||
let q = resolve_raw_query(raw, &vocab());
|
||||
assert_eq!(q.tag_ids, vec![1]);
|
||||
assert_eq!(q.unmatched_tags, vec!["golden hour".to_string()]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn resolve_dedups_repeated_tags() {
|
||||
let raw = RawNlQuery {
|
||||
tags: vec![
|
||||
"beach".to_string(),
|
||||
"Beach".to_string(),
|
||||
"beach".to_string(),
|
||||
],
|
||||
..Default::default()
|
||||
};
|
||||
let q = resolve_raw_query(raw, &vocab());
|
||||
assert_eq!(q.tag_ids, vec![1]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn resolve_normalizes_media_type_and_dates() {
|
||||
let raw = RawNlQuery {
|
||||
media_type: Some("Videos".to_string()),
|
||||
date_from: Some("2023-06-01".to_string()),
|
||||
date_to: Some("2023-06-30".to_string()),
|
||||
..Default::default()
|
||||
};
|
||||
let q = resolve_raw_query(raw, &vocab());
|
||||
assert_eq!(q.media_type.as_deref(), Some("video"));
|
||||
assert_eq!(q.date_from, iso_to_unix("2023-06-01", false));
|
||||
assert_eq!(q.date_to, iso_to_unix("2023-06-30", true));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn resolve_media_type_all_is_no_filter() {
|
||||
let raw = RawNlQuery {
|
||||
media_type: Some("all".to_string()),
|
||||
..Default::default()
|
||||
};
|
||||
assert_eq!(resolve_raw_query(raw, &vocab()).media_type, None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn resolve_trims_and_empties_to_none() {
|
||||
let raw = RawNlQuery {
|
||||
semantic: Some(" ".to_string()),
|
||||
camera_make: Some(" Fujifilm ".to_string()),
|
||||
place: Some("".to_string()),
|
||||
..Default::default()
|
||||
};
|
||||
let q = resolve_raw_query(raw, &vocab());
|
||||
assert_eq!(q.semantic, None);
|
||||
assert_eq!(q.camera_make.as_deref(), Some("Fujifilm"));
|
||||
assert_eq!(q.place, None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_response_handles_code_fences_and_prose() {
|
||||
let resp = "Here is the filter:\n```json\n{\"semantic\":\"sunset\",\"tags\":[\"beach\"]}\n```\nDone.";
|
||||
let q = parse_response(resp, &vocab()).expect("parse");
|
||||
assert_eq!(q.semantic.as_deref(), Some("sunset"));
|
||||
assert_eq!(q.tag_ids, vec![1]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_response_handles_think_block_then_json() {
|
||||
let resp = "<think>user wants beach sunsets</think>{\"tags\":[\"beach\",\"sunset\"]}";
|
||||
let q = parse_response(resp, &vocab()).expect("parse");
|
||||
assert_eq!(q.tag_ids, vec![1, 2]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parse_response_errors_on_non_json() {
|
||||
assert!(parse_response("I cannot help with that.", &vocab()).is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn build_system_prompt_includes_date_and_vocab() {
|
||||
let today = chrono::NaiveDate::from_ymd_opt(2026, 6, 14).unwrap();
|
||||
let prompt = build_system_prompt(&vocab(), today);
|
||||
assert!(
|
||||
prompt.contains("2026-06-14"),
|
||||
"prompt should state today's date"
|
||||
);
|
||||
assert!(prompt.contains("beach"), "prompt should list the vocab");
|
||||
assert!(
|
||||
prompt.contains("never invent a tag"),
|
||||
"prompt should warn against inventing tags"
|
||||
);
|
||||
}
|
||||
}
|
||||
@@ -336,6 +336,7 @@ async fn main() -> anyhow::Result<()> {
|
||||
args.top_p,
|
||||
args.top_k,
|
||||
args.min_p,
|
||||
None, // enable_thinking: leave model/template default
|
||||
args.max_iterations,
|
||||
None,
|
||||
Vec::new(),
|
||||
|
||||
+214
-184
@@ -124,65 +124,161 @@ fn dot(a: &[f32], b: &[f32]) -> f32 {
|
||||
a.iter().zip(b.iter()).map(|(x, y)| x * y).sum()
|
||||
}
|
||||
|
||||
pub async fn search_photos(
|
||||
state: web::Data<AppState>,
|
||||
exif_dao: web::Data<Mutex<Box<dyn ExifDao>>>,
|
||||
query: web::Query<SearchQuery>,
|
||||
) -> ActixResult<HttpResponse> {
|
||||
let q_text = query.q.trim().to_string();
|
||||
if q_text.is_empty() {
|
||||
return Ok(HttpResponse::BadRequest().json(SearchError {
|
||||
error: "query parameter `q` is required".into(),
|
||||
}));
|
||||
}
|
||||
/// Failure modes of [`score_photos`]. Carries enough to let each caller pick
|
||||
/// an appropriate HTTP status (the CLIP service being down is a 502, a
|
||||
/// disabled feature is a 503, a rejected query is a 400, a DB failure 500).
|
||||
pub enum ScoreError {
|
||||
/// CLIP search isn't configured at all (no Apollo endpoint).
|
||||
Disabled,
|
||||
/// The query was rejected by the encoder (client error).
|
||||
Rejected(String),
|
||||
/// The CLIP service is transiently unavailable (upstream error).
|
||||
Unavailable(String),
|
||||
/// The encoder returned an embedding we couldn't decode.
|
||||
MalformedEmbedding,
|
||||
/// A database / index load failure.
|
||||
Internal(String),
|
||||
}
|
||||
|
||||
/// Result of scoring the whole library against a query embedding: the
|
||||
/// resolved model version, how many embeddings were considered, and every
|
||||
/// `(score, content_hash)` above threshold, sorted by descending score.
|
||||
/// Pagination and path resolution are the caller's job (see [`resolve_hits`])
|
||||
/// so this core can be reused for both the plain search endpoint and the
|
||||
/// unified endpoint (which filters by hash before paginating).
|
||||
pub struct ScoredPhotos {
|
||||
pub model_version: String,
|
||||
pub considered: usize,
|
||||
/// `(cosine_score, content_hash)` pairs, descending by score.
|
||||
pub hits: Vec<(f32, String)>,
|
||||
}
|
||||
|
||||
/// Encode `q_text` via CLIP and score it against every stored embedding in
|
||||
/// the given library scope. Returns all matches above `threshold`, sorted by
|
||||
/// descending similarity. Pure of HTTP concerns so it's shared by
|
||||
/// `search_photos` and the unified search endpoint.
|
||||
pub async fn score_photos(
|
||||
state: &AppState,
|
||||
exif_dao: &Mutex<Box<dyn ExifDao>>,
|
||||
q_text: &str,
|
||||
library_ids: &[i32],
|
||||
threshold: f32,
|
||||
model_version: Option<&str>,
|
||||
) -> Result<ScoredPhotos, ScoreError> {
|
||||
if !state.clip_client.is_enabled() {
|
||||
return Ok(HttpResponse::ServiceUnavailable().json(SearchError {
|
||||
error: "CLIP search is disabled (no Apollo CLIP endpoint configured)".into(),
|
||||
}));
|
||||
return Err(ScoreError::Disabled);
|
||||
}
|
||||
|
||||
let limit = query.limit.clamp(1, 200);
|
||||
let offset = query.offset;
|
||||
let threshold = query.threshold.clamp(-1.0, 1.0);
|
||||
|
||||
// 1. Encode the query text. Fast — Apollo's text encoder is ~50ms
|
||||
// on CPU. Bail with a clear error message if Apollo's down so the
|
||||
// user sees "service unavailable" rather than empty results.
|
||||
let query_resp = match state.clip_client.encode_text(&q_text).await {
|
||||
// 1. Encode the query text. Fast — Apollo's text encoder is ~50ms on CPU.
|
||||
let query_resp = match state.clip_client.encode_text(q_text).await {
|
||||
Ok(r) => r,
|
||||
Err(ClipError::Permanent(e)) => {
|
||||
return Ok(HttpResponse::BadRequest().json(SearchError {
|
||||
error: format!("query rejected: {e}"),
|
||||
}));
|
||||
}
|
||||
Err(ClipError::Transient(e)) => {
|
||||
return Ok(HttpResponse::BadGateway().json(SearchError {
|
||||
error: format!("CLIP service unavailable: {e}"),
|
||||
}));
|
||||
}
|
||||
Err(ClipError::Disabled) => {
|
||||
return Ok(HttpResponse::ServiceUnavailable().json(SearchError {
|
||||
error: "CLIP service disabled".into(),
|
||||
}));
|
||||
}
|
||||
Err(ClipError::Permanent(e)) => return Err(ScoreError::Rejected(e.to_string())),
|
||||
Err(ClipError::Transient(e)) => return Err(ScoreError::Unavailable(e.to_string())),
|
||||
Err(ClipError::Disabled) => return Err(ScoreError::Disabled),
|
||||
};
|
||||
// decode_embedding works on raw bytes; the wire format is b64.
|
||||
let query_bytes = base64::engine::general_purpose::STANDARD
|
||||
.decode(query_resp.embedding.as_bytes())
|
||||
.unwrap_or_default();
|
||||
let query_vec = match decode_embedding(&query_bytes) {
|
||||
Some(v) => v,
|
||||
None => {
|
||||
return Ok(HttpResponse::BadGateway().json(SearchError {
|
||||
error: "CLIP service returned a malformed query embedding".into(),
|
||||
}));
|
||||
}
|
||||
};
|
||||
let query_vec = decode_embedding(&query_bytes).ok_or(ScoreError::MalformedEmbedding)?;
|
||||
|
||||
// 2. Decide which library scope to search. `library_ids` (multi)
|
||||
// wins over the legacy `library` (single) when both are present;
|
||||
// either / both empty falls back to "every enabled library".
|
||||
let library_ids: Vec<i32> = if let Some(raw) = query.library_ids.as_deref() {
|
||||
// 2. Pull the (hash, embedding) matrix under the dao lock, release
|
||||
// before scoring. The caller-supplied `model_version` (or the live
|
||||
// engine's) forces a strict join so a mid-flight model swap can't mix
|
||||
// geometries.
|
||||
let ctx = opentelemetry::Context::current();
|
||||
let rows: Vec<(String, Vec<u8>)> = {
|
||||
let mut dao = exif_dao.lock().expect("exif dao");
|
||||
dao.list_clip_index(
|
||||
&ctx,
|
||||
library_ids,
|
||||
model_version.or(Some(&query_resp.model_version)),
|
||||
)
|
||||
.map_err(|e| {
|
||||
log::warn!("clip_search: list_clip_index failed: {:?}", e);
|
||||
ScoreError::Internal("failed to load search index".into())
|
||||
})?
|
||||
};
|
||||
let considered = rows.len();
|
||||
|
||||
// 3. Score. Keep all matches and sort at the end (~microseconds at 14k).
|
||||
let mut hits: Vec<(f32, String)> = Vec::with_capacity(considered);
|
||||
for (hash, blob) in rows {
|
||||
let Some(emb) = decode_embedding(&blob) else {
|
||||
continue;
|
||||
};
|
||||
if emb.len() != query_vec.len() {
|
||||
continue;
|
||||
}
|
||||
let sim = dot(&emb, &query_vec);
|
||||
if sim < threshold {
|
||||
continue;
|
||||
}
|
||||
hits.push((sim, hash));
|
||||
}
|
||||
hits.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
|
||||
|
||||
Ok(ScoredPhotos {
|
||||
model_version: query_resp.model_version,
|
||||
considered,
|
||||
hits,
|
||||
})
|
||||
}
|
||||
|
||||
/// Resolve a page of `(score, content_hash)` pairs back to [`SearchHit`]s
|
||||
/// (each carrying `library_id` + `rel_path`). Hashes that no longer resolve
|
||||
/// to a row are skipped. Shared by both endpoints.
|
||||
pub fn resolve_hits(
|
||||
exif_dao: &Mutex<Box<dyn ExifDao>>,
|
||||
scored: &[(f32, String)],
|
||||
) -> Vec<SearchHit> {
|
||||
if scored.is_empty() {
|
||||
return Vec::new();
|
||||
}
|
||||
let ctx = opentelemetry::Context::current();
|
||||
let hashes: Vec<String> = scored.iter().map(|(_, h)| h.clone()).collect();
|
||||
let mut dao = exif_dao.lock().expect("exif dao");
|
||||
let path_map = dao
|
||||
.get_rel_paths_for_hashes(&ctx, &hashes)
|
||||
.unwrap_or_else(|e| {
|
||||
log::warn!("clip_search: get_rel_paths_for_hashes failed: {:?}", e);
|
||||
std::collections::HashMap::new()
|
||||
});
|
||||
|
||||
let mut results = Vec::with_capacity(scored.len());
|
||||
for (score, hash) in scored {
|
||||
let row = match dao.find_by_content_hash(&ctx, hash) {
|
||||
Ok(Some(r)) => r,
|
||||
Ok(None) => continue,
|
||||
Err(e) => {
|
||||
log::warn!("clip_search: find_by_content_hash failed for {hash}: {e:?}");
|
||||
continue;
|
||||
}
|
||||
};
|
||||
// Prefer get_rel_paths_for_hashes's first entry (shares image_exif's
|
||||
// natural order), falling back to the ImageExif row.
|
||||
let rel_path = path_map
|
||||
.get(hash)
|
||||
.and_then(|paths| paths.first().cloned())
|
||||
.unwrap_or(row.file_path);
|
||||
results.push(SearchHit {
|
||||
library_id: row.library_id,
|
||||
rel_path,
|
||||
content_hash: hash.clone(),
|
||||
score: *score,
|
||||
});
|
||||
}
|
||||
results
|
||||
}
|
||||
|
||||
/// Parse the `library_ids` (multi) / `library` (single) scope params into a
|
||||
/// deduped id list. Empty = "every enabled library". Shared so the unified
|
||||
/// endpoint scopes CLIP identically.
|
||||
pub fn parse_library_scope(
|
||||
library_ids: Option<&str>,
|
||||
library: Option<i32>,
|
||||
) -> Result<Vec<i32>, String> {
|
||||
if let Some(raw) = library_ids {
|
||||
let mut out: Vec<i32> = Vec::new();
|
||||
for piece in raw.split(',') {
|
||||
let trimmed = piece.trim();
|
||||
@@ -195,158 +291,92 @@ pub async fn search_photos(
|
||||
out.push(id);
|
||||
}
|
||||
}
|
||||
Err(_) => {
|
||||
return Ok(HttpResponse::BadRequest().json(SearchError {
|
||||
error: format!("invalid library_ids entry: {trimmed:?}"),
|
||||
}));
|
||||
}
|
||||
Err(_) => return Err(format!("invalid library_ids entry: {trimmed:?}")),
|
||||
}
|
||||
}
|
||||
out
|
||||
} else if let Some(id) = query.library {
|
||||
vec![id]
|
||||
Ok(out)
|
||||
} else if let Some(id) = library {
|
||||
Ok(vec![id])
|
||||
} else {
|
||||
Vec::new()
|
||||
};
|
||||
Ok(Vec::new())
|
||||
}
|
||||
}
|
||||
|
||||
// 3. Pull the (hash, embedding) matrix. Lock contention here is
|
||||
// bounded — one big SELECT under a mutex Arc<Mutex<dyn ExifDao>>
|
||||
// and then we release before scoring. If this becomes a hotspot
|
||||
// we'll cache the decoded matrix in AppState with TTL.
|
||||
let ctx = opentelemetry::Context::current();
|
||||
let rows: Vec<(String, Vec<u8>)> = {
|
||||
let mut dao = exif_dao.lock().expect("exif dao");
|
||||
match dao.list_clip_index(
|
||||
&ctx,
|
||||
&library_ids,
|
||||
query
|
||||
.model_version
|
||||
.as_deref()
|
||||
.or(Some(&query_resp.model_version)),
|
||||
) {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
log::warn!("clip_search: list_clip_index failed: {:?}", e);
|
||||
return Ok(HttpResponse::InternalServerError().json(SearchError {
|
||||
error: "failed to load search index".into(),
|
||||
}));
|
||||
}
|
||||
}
|
||||
};
|
||||
let considered = rows.len();
|
||||
if considered == 0 {
|
||||
return Ok(HttpResponse::Ok().json(SearchResponse {
|
||||
query: q_text,
|
||||
model_version: query_resp.model_version,
|
||||
threshold,
|
||||
considered,
|
||||
total_matching: 0,
|
||||
offset,
|
||||
results: Vec::new(),
|
||||
pub async fn search_photos(
|
||||
state: web::Data<AppState>,
|
||||
exif_dao: web::Data<Mutex<Box<dyn ExifDao>>>,
|
||||
query: web::Query<SearchQuery>,
|
||||
) -> ActixResult<HttpResponse> {
|
||||
let q_text = query.q.trim().to_string();
|
||||
if q_text.is_empty() {
|
||||
return Ok(HttpResponse::BadRequest().json(SearchError {
|
||||
error: "query parameter `q` is required".into(),
|
||||
}));
|
||||
}
|
||||
|
||||
// 4. Score. Cap the loop's transient allocation; we keep all scores
|
||||
// and sort at the end. With ~14k entries the sort is microseconds.
|
||||
let mut scored: Vec<(f32, String)> = Vec::with_capacity(considered);
|
||||
for (hash, blob) in rows {
|
||||
let Some(emb) = decode_embedding(&blob) else {
|
||||
continue;
|
||||
};
|
||||
if emb.len() != query_vec.len() {
|
||||
continue;
|
||||
}
|
||||
let sim = dot(&emb, &query_vec);
|
||||
if sim < threshold {
|
||||
continue;
|
||||
}
|
||||
scored.push((sim, hash));
|
||||
}
|
||||
scored.sort_by(|a, b| b.0.partial_cmp(&a.0).unwrap_or(std::cmp::Ordering::Equal));
|
||||
let total_matching = scored.len();
|
||||
// Pagination — slice the sorted list at `[offset, offset+limit)`.
|
||||
// Offsets past the end produce empty pages rather than an error so
|
||||
// the client can stop fetching naturally on "load more" past the end.
|
||||
let scored: Vec<(f32, String)> = if offset >= total_matching {
|
||||
let limit = query.limit.clamp(1, 200);
|
||||
let offset = query.offset;
|
||||
let threshold = query.threshold.clamp(-1.0, 1.0);
|
||||
|
||||
let library_ids = match parse_library_scope(query.library_ids.as_deref(), query.library) {
|
||||
Ok(ids) => ids,
|
||||
Err(msg) => return Ok(HttpResponse::BadRequest().json(SearchError { error: msg })),
|
||||
};
|
||||
|
||||
let scored = match score_photos(
|
||||
&state,
|
||||
&exif_dao,
|
||||
&q_text,
|
||||
&library_ids,
|
||||
threshold,
|
||||
query.model_version.as_deref(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(s) => s,
|
||||
Err(e) => return Ok(score_error_response(e)),
|
||||
};
|
||||
|
||||
let total_matching = scored.hits.len();
|
||||
// Pagination — slice the sorted list at `[offset, offset+limit)`. Offsets
|
||||
// past the end produce empty pages so "load more" stops naturally.
|
||||
let page: Vec<(f32, String)> = if offset >= total_matching {
|
||||
Vec::new()
|
||||
} else {
|
||||
let end = (offset + limit).min(total_matching);
|
||||
scored[offset..end].to_vec()
|
||||
scored.hits[offset..end].to_vec()
|
||||
};
|
||||
|
||||
if scored.is_empty() {
|
||||
return Ok(HttpResponse::Ok().json(SearchResponse {
|
||||
query: q_text,
|
||||
model_version: query_resp.model_version,
|
||||
threshold,
|
||||
considered,
|
||||
total_matching,
|
||||
offset,
|
||||
results: Vec::new(),
|
||||
}));
|
||||
}
|
||||
|
||||
// 5. Resolve each surviving hash back to a `(library_id, rel_path)`.
|
||||
// `get_rel_paths_by_hash` returns every rel_path; we pick the first
|
||||
// one for the result. Apollo / the UI can fetch alternatives via
|
||||
// /image/metadata when needed.
|
||||
let hashes: Vec<String> = scored.iter().map(|(_, h)| h.clone()).collect();
|
||||
let path_map = {
|
||||
let mut dao = exif_dao.lock().expect("exif dao");
|
||||
match dao.get_rel_paths_for_hashes(&ctx, &hashes) {
|
||||
Ok(m) => m,
|
||||
Err(e) => {
|
||||
log::warn!("clip_search: get_rel_paths_for_hashes failed: {:?}", e);
|
||||
return Ok(HttpResponse::InternalServerError().json(SearchError {
|
||||
error: "failed to resolve photo paths".into(),
|
||||
}));
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// We need (library_id, rel_path) — get_rel_paths_for_hashes only
|
||||
// returns rel_paths. Cross-reference via find_by_content_hash to
|
||||
// pick the library too. Single call per surviving hash; cheap at
|
||||
// top-20.
|
||||
let mut results = Vec::with_capacity(scored.len());
|
||||
{
|
||||
let mut dao = exif_dao.lock().expect("exif dao");
|
||||
for (score, hash) in scored {
|
||||
let row = match dao.find_by_content_hash(&ctx, &hash) {
|
||||
Ok(Some(r)) => r,
|
||||
Ok(None) => continue,
|
||||
Err(e) => {
|
||||
log::warn!(
|
||||
"clip_search: find_by_content_hash failed for {}: {:?}",
|
||||
hash,
|
||||
e
|
||||
);
|
||||
continue;
|
||||
}
|
||||
};
|
||||
// Prefer get_rel_paths_for_hashes's first entry if it
|
||||
// exists (it shares semantics with `image_exif`'s natural
|
||||
// order), falling back to the ImageExif row.
|
||||
let rel_path = path_map
|
||||
.get(&hash)
|
||||
.and_then(|paths| paths.first().cloned())
|
||||
.unwrap_or(row.file_path);
|
||||
results.push(SearchHit {
|
||||
library_id: row.library_id,
|
||||
rel_path,
|
||||
content_hash: hash,
|
||||
score,
|
||||
});
|
||||
}
|
||||
}
|
||||
let results = resolve_hits(&exif_dao, &page);
|
||||
|
||||
Ok(HttpResponse::Ok().json(SearchResponse {
|
||||
query: q_text,
|
||||
model_version: query_resp.model_version,
|
||||
model_version: scored.model_version,
|
||||
threshold,
|
||||
considered,
|
||||
considered: scored.considered,
|
||||
total_matching,
|
||||
offset,
|
||||
results,
|
||||
}))
|
||||
}
|
||||
|
||||
/// Map a [`ScoreError`] to the HTTP response `search_photos` historically
|
||||
/// returned for each failure mode. Reused by the unified endpoint.
|
||||
pub fn score_error_response(e: ScoreError) -> HttpResponse {
|
||||
match e {
|
||||
ScoreError::Disabled => HttpResponse::ServiceUnavailable().json(SearchError {
|
||||
error: "CLIP search is disabled (no Apollo CLIP endpoint configured)".into(),
|
||||
}),
|
||||
ScoreError::Rejected(msg) => HttpResponse::BadRequest().json(SearchError {
|
||||
error: format!("query rejected: {msg}"),
|
||||
}),
|
||||
ScoreError::Unavailable(msg) => HttpResponse::BadGateway().json(SearchError {
|
||||
error: format!("CLIP service unavailable: {msg}"),
|
||||
}),
|
||||
ScoreError::MalformedEmbedding => HttpResponse::BadGateway().json(SearchError {
|
||||
error: "CLIP service returned a malformed query embedding".into(),
|
||||
}),
|
||||
ScoreError::Internal(msg) => {
|
||||
HttpResponse::InternalServerError().json(SearchError { error: msg })
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
+172
@@ -1,4 +1,5 @@
|
||||
/// Geographic calculation utilities for GPS-based search
|
||||
use serde::Deserialize;
|
||||
use std::f64;
|
||||
|
||||
/// Calculate distance between two GPS coordinates using the Haversine formula.
|
||||
@@ -61,6 +62,140 @@ pub fn gps_bounding_box(lat: f64, lon: f64, radius_km: f64) -> (f64, f64, f64, f
|
||||
)
|
||||
}
|
||||
|
||||
/// A place resolved from a free-text query via forward geocoding.
|
||||
///
|
||||
/// The filter pipeline searches a *circle* (`gps_lat`/`gps_lon`/
|
||||
/// `gps_radius_km`), but a place can be anything from a single address to
|
||||
/// a whole country. We collapse Nominatim's bounding box into the smallest
|
||||
/// circle that circumscribes it (see [`bbox_to_circle`]) so "Portland" and
|
||||
/// "Italy" both map onto the existing circle filter without a schema change.
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub struct GeoPlace {
|
||||
/// Nominatim's canonical name for the match (e.g. "Italia").
|
||||
pub display_name: String,
|
||||
/// Centroid latitude in decimal degrees.
|
||||
pub lat: f64,
|
||||
/// Centroid longitude in decimal degrees.
|
||||
pub lon: f64,
|
||||
/// Radius (km) of a circle centred on the centroid that covers the
|
||||
/// matched area. Floored to [`MIN_PLACE_RADIUS_KM`] so a point result
|
||||
/// (whose bounding box is microscopic) still yields a usable circle.
|
||||
pub radius_km: f64,
|
||||
}
|
||||
|
||||
/// Floor for a geocoded place's radius. Point results (a street address)
|
||||
/// come back with a near-zero bounding box; without a floor the circle
|
||||
/// filter would match nothing.
|
||||
pub const MIN_PLACE_RADIUS_KM: f64 = 0.5;
|
||||
|
||||
/// Collapse a bounding box into the centroid + circumscribing radius.
|
||||
///
|
||||
/// Input is Nominatim's `boundingbox` order: `(south_lat, north_lat,
|
||||
/// west_lon, east_lon)`. The radius is the *largest* great-circle distance
|
||||
/// from the centroid to any of the four corners, so the resulting circle
|
||||
/// fully covers the box. (The corners aren't equidistant on a sphere —
|
||||
/// longitude lines converge toward the poles, so the equator-facing edge's
|
||||
/// corners are farthest; taking the max guarantees coverage in either
|
||||
/// hemisphere.)
|
||||
///
|
||||
/// Pure and exact (no flooring) so it can be unit-tested directly; callers
|
||||
/// apply [`MIN_PLACE_RADIUS_KM`] when turning the result into a filter.
|
||||
pub fn bbox_to_circle(south: f64, north: f64, west: f64, east: f64) -> (f64, f64, f64) {
|
||||
let center_lat = (south + north) / 2.0;
|
||||
let center_lon = (west + east) / 2.0;
|
||||
let radius_km = [(south, west), (south, east), (north, west), (north, east)]
|
||||
.iter()
|
||||
.map(|(clat, clon)| haversine_distance(center_lat, center_lon, *clat, *clon))
|
||||
.fold(0.0_f64, f64::max);
|
||||
(center_lat, center_lon, radius_km)
|
||||
}
|
||||
|
||||
/// Raw Nominatim `/search` result. `lat`/`lon` arrive as strings and
|
||||
/// `boundingbox` as a 4-element string array `[south, north, west, east]`.
|
||||
#[derive(Deserialize)]
|
||||
struct NominatimSearchResult {
|
||||
lat: String,
|
||||
lon: String,
|
||||
display_name: String,
|
||||
boundingbox: Option<[String; 4]>,
|
||||
}
|
||||
|
||||
/// Forward-geocode a free-text place name to a [`GeoPlace`] via the public
|
||||
/// OpenStreetMap Nominatim `/search` endpoint.
|
||||
///
|
||||
/// Mirrors `InsightGenerator::reverse_geocode`'s error posture: any network,
|
||||
/// HTTP, or parse failure returns `None` rather than propagating, so a flaky
|
||||
/// geocoder degrades the query to "no location filter" instead of failing it.
|
||||
///
|
||||
/// Nominatim's usage policy requires a `User-Agent` and rate-limits to ~1
|
||||
/// request/second; callers doing this interactively should cache results.
|
||||
pub async fn forward_geocode(query: &str) -> Option<GeoPlace> {
|
||||
let q = query.trim();
|
||||
if q.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
let client = reqwest::Client::new();
|
||||
let response = match client
|
||||
.get("https://nominatim.openstreetmap.org/search")
|
||||
.query(&[("format", "json"), ("limit", "1"), ("q", q)])
|
||||
.header("User-Agent", "ImageAPI/1.0") // Nominatim requires User-Agent
|
||||
.send()
|
||||
.await
|
||||
{
|
||||
Ok(resp) => resp,
|
||||
Err(e) => {
|
||||
log::warn!("Forward geocoding network error for {q:?}: {e}");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
if !response.status().is_success() {
|
||||
log::warn!(
|
||||
"Forward geocoding HTTP error for {q:?}: {}",
|
||||
response.status()
|
||||
);
|
||||
return None;
|
||||
}
|
||||
|
||||
let results: Vec<NominatimSearchResult> = match response.json().await {
|
||||
Ok(r) => r,
|
||||
Err(e) => {
|
||||
log::warn!("Forward geocoding JSON parse error for {q:?}: {e}");
|
||||
return None;
|
||||
}
|
||||
};
|
||||
|
||||
let top = results.into_iter().next()?;
|
||||
let lat: f64 = top.lat.parse().ok()?;
|
||||
let lon: f64 = top.lon.parse().ok()?;
|
||||
|
||||
// Prefer the bounding box (handles large places); fall back to a
|
||||
// point + floor radius when Nominatim omits it.
|
||||
let (center_lat, center_lon, radius_km) = match &top.boundingbox {
|
||||
Some([s, n, w, e]) => match (s.parse(), n.parse(), w.parse(), e.parse()) {
|
||||
(Ok(s), Ok(n), Ok(w), Ok(e)) => bbox_to_circle(s, n, w, e),
|
||||
_ => (lat, lon, 0.0),
|
||||
},
|
||||
None => (lat, lon, 0.0),
|
||||
};
|
||||
|
||||
let place = GeoPlace {
|
||||
display_name: top.display_name,
|
||||
lat: center_lat,
|
||||
lon: center_lon,
|
||||
radius_km: radius_km.max(MIN_PLACE_RADIUS_KM),
|
||||
};
|
||||
log::info!(
|
||||
"Forward geocoded {q:?} -> {} ({:.4}, {:.4}, r={:.1}km)",
|
||||
place.display_name,
|
||||
place.lat,
|
||||
place.lon,
|
||||
place.radius_km
|
||||
);
|
||||
Some(place)
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
@@ -118,4 +253,41 @@ mod tests {
|
||||
distance
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bbox_to_circle_centroid() {
|
||||
// Symmetric box around (10, 20): centroid should land dead centre.
|
||||
let (lat, lon, radius) = bbox_to_circle(9.0, 11.0, 19.0, 21.0);
|
||||
assert!((lat - 10.0).abs() < 1e-9, "centroid lat, got {lat}");
|
||||
assert!((lon - 20.0).abs() < 1e-9, "centroid lon, got {lon}");
|
||||
assert!(radius > 0.0, "radius should be positive, got {radius}");
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bbox_to_circle_covers_corner() {
|
||||
// The radius must reach every corner of the box. Verify the
|
||||
// centroid-to-corner distance equals the returned radius for all
|
||||
// four corners (they're symmetric, so all equal).
|
||||
let (south, north, west, east) = (40.0, 42.0, -74.0, -72.0);
|
||||
let (lat, lon, radius) = bbox_to_circle(south, north, west, east);
|
||||
for (clat, clon) in [(south, west), (south, east), (north, west), (north, east)] {
|
||||
let d = haversine_distance(lat, lon, clat, clon);
|
||||
assert!(
|
||||
d <= radius + 1e-6,
|
||||
"corner ({clat},{clon}) at {d}km should be within radius {radius}km"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_bbox_to_circle_country_vs_city_scale() {
|
||||
// A country-sized box yields a far larger radius than a city-sized
|
||||
// one — confirming the bbox approach scales with place size.
|
||||
let (_, _, country) = bbox_to_circle(35.5, 47.1, 6.6, 18.5); // ~Italy
|
||||
let (_, _, city) = bbox_to_circle(45.4, 45.6, -122.8, -122.5); // ~Portland
|
||||
assert!(
|
||||
country > city * 10.0,
|
||||
"country radius {country}km should dwarf city radius {city}km"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -35,6 +35,7 @@ pub mod tags;
|
||||
#[cfg(test)]
|
||||
pub mod testhelpers;
|
||||
pub mod thumbnails;
|
||||
pub mod unified_search;
|
||||
pub mod utils;
|
||||
pub mod video;
|
||||
|
||||
|
||||
@@ -54,6 +54,7 @@ mod perceptual_hash;
|
||||
mod state;
|
||||
mod tags;
|
||||
mod thumbnails;
|
||||
mod unified_search;
|
||||
mod utils;
|
||||
mod video;
|
||||
mod watcher;
|
||||
@@ -333,6 +334,13 @@ fn main() -> std::io::Result<()> {
|
||||
web::resource("/photos/search")
|
||||
.route(web::get().to(clip_search::search_photos)),
|
||||
)
|
||||
.service(
|
||||
// Unified natural-language search: LLM translates the
|
||||
// query into structured filters + a semantic term, then
|
||||
// filters constrain and CLIP ranks. See src/unified_search.rs.
|
||||
web::resource("/photos/search/unified")
|
||||
.route(web::get().to(unified_search::unified_search::<SqliteTagDao>)),
|
||||
)
|
||||
.service(web::resource("/file/move").post(move_file::<RealFileSystem>))
|
||||
.service(handlers::image::get_image)
|
||||
.service(handlers::image::upload_image)
|
||||
|
||||
@@ -309,6 +309,7 @@ pub async fn generate_script_agentic(
|
||||
top_p: None,
|
||||
top_k: None,
|
||||
min_p: None,
|
||||
enable_thinking: None,
|
||||
},
|
||||
)
|
||||
.await
|
||||
|
||||
@@ -0,0 +1,521 @@
|
||||
//! `/photos/search/unified?q=<natural language>` — unified NL photo search.
|
||||
//!
|
||||
//! One free-text box that composes the two existing engines instead of making
|
||||
//! the user pick between them:
|
||||
//! 1. A grounded local-LLM call ([`crate::ai::nl_query`]) translates the
|
||||
//! query into a structured filter + a semantic term.
|
||||
//! 2. Structured filters (tags / EXIF / geo / date / media-type) define the
|
||||
//! candidate set; the semantic term ranks within it via CLIP.
|
||||
//!
|
||||
//! Path A (orchestration): we reuse `clip_search`'s scoring core and the
|
||||
//! existing `ExifDao` / `TagDao` queries, joining on `content_hash`. EXIF rows
|
||||
//! are the universal candidate carrier — each has `(library_id, file_path,
|
||||
//! content_hash, date_taken)` — so the structured filter is just a predicate
|
||||
//! over them, and the CLIP hits (which key on `content_hash`) intersect by
|
||||
//! hash. No new schema, no surgery on `list_photos`.
|
||||
//!
|
||||
//! Degenerate cases collapse to the existing behavior: semantic-only → plain
|
||||
//! CLIP search; filters-only → a date-sorted filtered listing.
|
||||
//!
|
||||
//! Person filtering is intentionally deferred (no person→photos resolver yet).
|
||||
|
||||
use crate::AppState;
|
||||
use crate::ai::backend::{BackendKind, SamplingOverrides};
|
||||
use crate::ai::nl_query::{StructuredQuery, translate_nl_query};
|
||||
use crate::clip_search::{
|
||||
SearchHit, parse_library_scope, resolve_hits, score_error_response, score_photos,
|
||||
};
|
||||
use crate::data::Claims;
|
||||
use crate::database::ExifDao;
|
||||
use crate::file_types::{is_image_file, is_video_file};
|
||||
use crate::geo::{forward_geocode, gps_bounding_box, haversine_distance};
|
||||
use crate::tags::TagDao;
|
||||
use actix_web::HttpResponse;
|
||||
use actix_web::web::{Data, Query};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashSet;
|
||||
use std::path::Path;
|
||||
use std::sync::Mutex;
|
||||
|
||||
#[derive(Debug, Deserialize)]
|
||||
pub struct UnifiedQuery {
|
||||
/// Natural-language query. Required; empty triggers 400.
|
||||
pub q: String,
|
||||
#[serde(default = "default_limit")]
|
||||
pub limit: usize,
|
||||
#[serde(default)]
|
||||
pub offset: usize,
|
||||
/// CLIP cosine floor for the semantic ranking stage. Same default as the
|
||||
/// plain search endpoint.
|
||||
#[serde(default = "default_threshold")]
|
||||
pub threshold: f32,
|
||||
/// Legacy single-library scope (see clip_search).
|
||||
pub library: Option<i32>,
|
||||
/// Multi-library scope, comma-separated ids.
|
||||
pub library_ids: Option<String>,
|
||||
/// Optional model override. The client passes the user's currently-selected
|
||||
/// local model so the translation step reuses a model that's already loaded
|
||||
/// (avoids a llama-swap eviction / cold start). Falls back to the configured
|
||||
/// default local model when absent. Local only — no hybrid here.
|
||||
pub model: Option<String>,
|
||||
}
|
||||
|
||||
fn default_limit() -> usize {
|
||||
20
|
||||
}
|
||||
fn default_threshold() -> f32 {
|
||||
0.20
|
||||
}
|
||||
|
||||
/// A geocoded place echoed back so the client can show / edit the location
|
||||
/// filter it actually searched.
|
||||
#[derive(Debug, Serialize)]
|
||||
struct ResolvedPlace {
|
||||
display_name: String,
|
||||
lat: f64,
|
||||
lon: f64,
|
||||
radius_km: f64,
|
||||
}
|
||||
|
||||
/// How the server interpreted the NL query — echoed to the client to render
|
||||
/// editable filter chips. tag ids map to the client's existing tag list.
|
||||
#[derive(Debug, Serialize)]
|
||||
struct Interpreted {
|
||||
semantic: Option<String>,
|
||||
tag_ids: Vec<i32>,
|
||||
exclude_tag_ids: Vec<i32>,
|
||||
/// Words the model treated as tags that don't exist in the vocab; folded
|
||||
/// into the semantic term and surfaced here so the UI can explain it.
|
||||
unmatched_tags: Vec<String>,
|
||||
camera_make: Option<String>,
|
||||
camera_model: Option<String>,
|
||||
lens_model: Option<String>,
|
||||
date_from: Option<i64>,
|
||||
date_to: Option<i64>,
|
||||
media_type: Option<String>,
|
||||
place: Option<ResolvedPlace>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
struct UnifiedResponse {
|
||||
query: String,
|
||||
interpreted: Interpreted,
|
||||
/// CLIP model version used for ranking; `None` when the query had no
|
||||
/// semantic term (filters-only).
|
||||
model_version: Option<String>,
|
||||
/// Embeddings scored by CLIP (0 when filters-only).
|
||||
considered: usize,
|
||||
/// Matches before pagination.
|
||||
total_matching: usize,
|
||||
offset: usize,
|
||||
results: Vec<SearchHit>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
struct ErrorBody {
|
||||
error: String,
|
||||
}
|
||||
|
||||
fn bad_request(msg: impl Into<String>) -> HttpResponse {
|
||||
HttpResponse::BadRequest().json(ErrorBody { error: msg.into() })
|
||||
}
|
||||
|
||||
/// Combine the model's semantic term with any tag words that didn't match the
|
||||
/// vocab, so a hallucinated/non-vocab tag becomes a soft semantic signal
|
||||
/// rather than being dropped.
|
||||
fn effective_semantic(sq: &StructuredQuery) -> Option<String> {
|
||||
let mut parts: Vec<String> = Vec::new();
|
||||
if let Some(s) = sq.semantic.as_deref() {
|
||||
parts.push(s.to_string());
|
||||
}
|
||||
parts.extend(sq.unmatched_tags.iter().cloned());
|
||||
if parts.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(parts.join(" "))
|
||||
}
|
||||
}
|
||||
|
||||
pub async fn unified_search<TagD: TagDao>(
|
||||
_: Claims,
|
||||
state: Data<AppState>,
|
||||
exif_dao: Data<Mutex<Box<dyn ExifDao>>>,
|
||||
tag_dao: Data<Mutex<TagD>>,
|
||||
query: Query<UnifiedQuery>,
|
||||
) -> HttpResponse {
|
||||
let nl = query.q.trim().to_string();
|
||||
if nl.is_empty() {
|
||||
return bad_request("query parameter `q` is required");
|
||||
}
|
||||
|
||||
let limit = query.limit.clamp(1, 200);
|
||||
let offset = query.offset;
|
||||
let threshold = query.threshold.clamp(-1.0, 1.0);
|
||||
|
||||
let library_ids = match parse_library_scope(query.library_ids.as_deref(), query.library) {
|
||||
Ok(ids) => ids,
|
||||
Err(msg) => return bad_request(msg),
|
||||
};
|
||||
|
||||
let ctx = opentelemetry::Context::current();
|
||||
|
||||
// ── 1. Translate the NL query, grounded on the real tag vocabulary ──
|
||||
let tag_vocab: Vec<(i32, String)> = {
|
||||
let mut dao = tag_dao.lock().expect("tag dao");
|
||||
match dao.get_all_tags(&ctx, None) {
|
||||
Ok(tags) => tags.into_iter().map(|(_, t)| (t.id, t.name)).collect(),
|
||||
Err(e) => {
|
||||
log::warn!("unified_search: get_all_tags failed: {e:?}");
|
||||
Vec::new()
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Respect env/config for the LLM backend (LLM_BACKEND → ollama or
|
||||
// llama-swap); local only, no hybrid, per the feature's design.
|
||||
//
|
||||
// Translation-model precedence:
|
||||
// 1. UNIFIED_SEARCH_MODEL env — pin a small, fast model that can stay
|
||||
// co-resident with CLIP (and the chat model) so translation never
|
||||
// evicts them. This is the recommended setup on a tight VRAM budget.
|
||||
// 2. the client-selected model — routes translation to whatever the user
|
||||
// already has loaded (no swap) when no dedicated model is pinned.
|
||||
// 3. None → resolve_backend uses the configured default local model.
|
||||
let translation_model = std::env::var("UNIFIED_SEARCH_MODEL")
|
||||
.ok()
|
||||
.filter(|m| !m.trim().is_empty())
|
||||
.or_else(|| query.model.clone())
|
||||
.filter(|m| !m.trim().is_empty());
|
||||
let overrides = SamplingOverrides {
|
||||
model: translation_model,
|
||||
num_ctx: None,
|
||||
temperature: None,
|
||||
top_p: None,
|
||||
top_k: None,
|
||||
min_p: None,
|
||||
enable_thinking: None,
|
||||
};
|
||||
let backend = match state
|
||||
.insight_generator
|
||||
.resolve_backend(BackendKind::Local, &overrides)
|
||||
.await
|
||||
{
|
||||
Ok(b) => b,
|
||||
Err(e) => {
|
||||
log::warn!("unified_search: resolve_backend failed: {e:?}");
|
||||
return HttpResponse::ServiceUnavailable().json(ErrorBody {
|
||||
error: "LLM backend unavailable".into(),
|
||||
});
|
||||
}
|
||||
};
|
||||
log::info!("unified_search: translating with model={}", backend.model());
|
||||
|
||||
let today = chrono::Utc::now().date_naive();
|
||||
let sq = match translate_nl_query(backend.chat(), &nl, &tag_vocab, today).await {
|
||||
Ok(sq) => sq,
|
||||
Err(e) => {
|
||||
log::warn!("unified_search: translate_nl_query failed: {e:?}");
|
||||
return HttpResponse::BadGateway().json(ErrorBody {
|
||||
error: "could not interpret the query".into(),
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
// ── 2. Forward-geocode the place name into a gps circle ──
|
||||
let resolved_place = match sq.place.as_deref() {
|
||||
Some(p) => forward_geocode(p).await.map(|g| ResolvedPlace {
|
||||
display_name: g.display_name,
|
||||
lat: g.lat,
|
||||
lon: g.lon,
|
||||
radius_km: g.radius_km,
|
||||
}),
|
||||
None => None,
|
||||
};
|
||||
let gps = resolved_place.as_ref().map(|p| (p.lat, p.lon, p.radius_km));
|
||||
|
||||
let semantic = effective_semantic(&sq);
|
||||
|
||||
let has_exif_filter = sq.camera_make.is_some()
|
||||
|| sq.camera_model.is_some()
|
||||
|| sq.lens_model.is_some()
|
||||
|| sq.date_from.is_some()
|
||||
|| sq.date_to.is_some();
|
||||
let has_struct =
|
||||
has_exif_filter || gps.is_some() || !sq.tag_ids.is_empty() || sq.media_type.is_some();
|
||||
|
||||
// Stage trace: what the model extracted + whether a structured filter is
|
||||
// active. The chips show this to the user too, but logging it makes the
|
||||
// "why no results" path debuggable from the server side.
|
||||
log::info!(
|
||||
"unified_search: q={nl:?} semantic={:?} tag_ids={:?} exclude={:?} place={:?} gps={:?} date=({:?},{:?}) media={:?} unmatched={:?} has_struct={has_struct}",
|
||||
sq.semantic,
|
||||
sq.tag_ids,
|
||||
sq.exclude_tag_ids,
|
||||
resolved_place.as_ref().map(|p| p.display_name.as_str()),
|
||||
gps,
|
||||
sq.date_from,
|
||||
sq.date_to,
|
||||
sq.media_type,
|
||||
sq.unmatched_tags,
|
||||
);
|
||||
|
||||
// ── 3. Build the structured candidate set (EXIF rows passing every
|
||||
// filter). Skipped entirely for a pure-semantic query. ──
|
||||
let mut candidate: Vec<crate::database::models::ImageExif> = Vec::new();
|
||||
let mut allowed_hashes: HashSet<String> = HashSet::new();
|
||||
if has_struct {
|
||||
// Tag membership set (rel_path only — same cross-library imprecision
|
||||
// as the existing /photos tag listing). ANY-mode: a photo matches if
|
||||
// it carries any of the named tags. ALL-mode over-constrains NL
|
||||
// queries (the model maps several words to tags and few photos carry
|
||||
// them all); the semantic term does the precision work instead.
|
||||
let tag_set: Option<HashSet<String>> = if sq.tag_ids.is_empty() {
|
||||
None
|
||||
} else {
|
||||
let mut dao = tag_dao.lock().expect("tag dao");
|
||||
match dao.get_files_with_any_tag_ids(
|
||||
sq.tag_ids.clone(),
|
||||
sq.exclude_tag_ids.clone(),
|
||||
&ctx,
|
||||
) {
|
||||
Ok(files) => Some(files.into_iter().map(|f| f.file_name).collect()),
|
||||
Err(e) => {
|
||||
log::warn!("unified_search: tag filter failed: {e:?}");
|
||||
Some(HashSet::new())
|
||||
}
|
||||
}
|
||||
};
|
||||
log::info!(
|
||||
"unified_search: tag_ids={:?} -> tag_set_files={:?}",
|
||||
sq.tag_ids,
|
||||
tag_set.as_ref().map(|s| s.len())
|
||||
);
|
||||
|
||||
// EXIF query handles camera/lens/gps-box/date. With no EXIF filters
|
||||
// it returns the whole table, which we then narrow by the predicates
|
||||
// below (tags / media / scope). Fine at personal-library scale.
|
||||
let gps_bounds = gps.map(|(lat, lon, r)| gps_bounding_box(lat, lon, r));
|
||||
let rows = {
|
||||
let mut dao = exif_dao.lock().expect("exif dao");
|
||||
dao.query_by_exif(
|
||||
&ctx,
|
||||
None, // scope filtered in-Rust to support multi-library
|
||||
sq.camera_make.as_deref(),
|
||||
sq.camera_model.as_deref(),
|
||||
sq.lens_model.as_deref(),
|
||||
gps_bounds,
|
||||
sq.date_from,
|
||||
sq.date_to,
|
||||
)
|
||||
.unwrap_or_else(|e| {
|
||||
log::warn!("unified_search: query_by_exif failed: {e:?}");
|
||||
Vec::new()
|
||||
})
|
||||
};
|
||||
|
||||
candidate = rows
|
||||
.into_iter()
|
||||
.filter(|row| {
|
||||
// Library scope.
|
||||
if !library_ids.is_empty() && !library_ids.contains(&row.library_id) {
|
||||
return false;
|
||||
}
|
||||
// Precise GPS distance (the EXIF query only did a coarse box).
|
||||
if let Some((lat, lon, radius_km)) = gps {
|
||||
match (row.gps_latitude, row.gps_longitude) {
|
||||
(Some(plat), Some(plon)) => {
|
||||
if haversine_distance(lat, lon, plat as f64, plon as f64) > radius_km {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
_ => return false,
|
||||
}
|
||||
}
|
||||
// Media type.
|
||||
if let Some(mt) = sq.media_type.as_deref() {
|
||||
let p = Path::new(&row.file_path);
|
||||
let ok = if mt == "video" {
|
||||
is_video_file(p)
|
||||
} else {
|
||||
is_image_file(p)
|
||||
};
|
||||
if !ok {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
// Tag membership.
|
||||
if let Some(ts) = &tag_set
|
||||
&& !ts.contains(&row.file_path)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
true
|
||||
})
|
||||
.collect();
|
||||
|
||||
allowed_hashes = candidate
|
||||
.iter()
|
||||
.filter_map(|r| r.content_hash.clone())
|
||||
.collect();
|
||||
log::info!(
|
||||
"unified_search: candidate_rows={} allowed_hashes={}",
|
||||
candidate.len(),
|
||||
allowed_hashes.len()
|
||||
);
|
||||
}
|
||||
|
||||
// ── 4. Rank ──
|
||||
match semantic {
|
||||
Some(ref sem) => {
|
||||
// When structured filters are present they ARE the constraint —
|
||||
// CLIP only ranks within the candidate set. So drop the global
|
||||
// similarity threshold (it's tuned for whole-library search and
|
||||
// would pre-discard filter-matching photos that scored just under
|
||||
// it — e.g. a 2022 beach photo at 0.18 — before the intersection
|
||||
// ever runs). With no filters, keep the user's threshold for the
|
||||
// plain semantic case.
|
||||
let clip_threshold = if has_struct { -1.0 } else { threshold };
|
||||
let scored = match score_photos(
|
||||
&state,
|
||||
&exif_dao,
|
||||
sem,
|
||||
&library_ids,
|
||||
clip_threshold,
|
||||
None,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(s) => s,
|
||||
Err(e) => return score_error_response(e),
|
||||
};
|
||||
let considered = scored.considered;
|
||||
let clip_hits = scored.hits.len();
|
||||
let hits: Vec<(f32, String)> = if has_struct {
|
||||
scored
|
||||
.hits
|
||||
.into_iter()
|
||||
.filter(|(_, h)| allowed_hashes.contains(h))
|
||||
.collect()
|
||||
} else {
|
||||
scored.hits
|
||||
};
|
||||
log::info!(
|
||||
"unified_search: clip considered={considered} hits={clip_hits} after_struct_filter={}",
|
||||
hits.len()
|
||||
);
|
||||
let total_matching = hits.len();
|
||||
let page = paginate(&hits, offset, limit);
|
||||
let results = resolve_hits(&exif_dao, &page);
|
||||
HttpResponse::Ok().json(UnifiedResponse {
|
||||
query: nl,
|
||||
interpreted: interpreted(&sq, resolved_place),
|
||||
model_version: Some(scored.model_version),
|
||||
considered: scored.considered,
|
||||
total_matching,
|
||||
offset,
|
||||
results,
|
||||
})
|
||||
}
|
||||
None => {
|
||||
// Filters-only: no semantic term. Require at least one filter,
|
||||
// then return the candidate set newest-first.
|
||||
if !has_struct {
|
||||
return bad_request("query had no searchable terms");
|
||||
}
|
||||
candidate.sort_by(|a, b| b.date_taken.cmp(&a.date_taken));
|
||||
let total_matching = candidate.len();
|
||||
log::info!("unified_search: filters-only matches={total_matching}");
|
||||
let end = (offset + limit).min(total_matching);
|
||||
let results: Vec<SearchHit> = if offset >= total_matching {
|
||||
Vec::new()
|
||||
} else {
|
||||
candidate[offset..end]
|
||||
.iter()
|
||||
.map(|r| SearchHit {
|
||||
library_id: r.library_id,
|
||||
rel_path: r.file_path.clone(),
|
||||
content_hash: r.content_hash.clone().unwrap_or_default(),
|
||||
score: 0.0,
|
||||
})
|
||||
.collect()
|
||||
};
|
||||
HttpResponse::Ok().json(UnifiedResponse {
|
||||
query: nl,
|
||||
interpreted: interpreted(&sq, resolved_place),
|
||||
model_version: None,
|
||||
considered: 0,
|
||||
total_matching,
|
||||
offset,
|
||||
results,
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Slice a sorted hit list at `[offset, offset+limit)`, tolerating
|
||||
/// out-of-range offsets (empty page).
|
||||
fn paginate(hits: &[(f32, String)], offset: usize, limit: usize) -> Vec<(f32, String)> {
|
||||
if offset >= hits.len() {
|
||||
return Vec::new();
|
||||
}
|
||||
let end = (offset + limit).min(hits.len());
|
||||
hits[offset..end].to_vec()
|
||||
}
|
||||
|
||||
fn interpreted(sq: &StructuredQuery, place: Option<ResolvedPlace>) -> Interpreted {
|
||||
Interpreted {
|
||||
semantic: sq.semantic.clone(),
|
||||
tag_ids: sq.tag_ids.clone(),
|
||||
exclude_tag_ids: sq.exclude_tag_ids.clone(),
|
||||
unmatched_tags: sq.unmatched_tags.clone(),
|
||||
camera_make: sq.camera_make.clone(),
|
||||
camera_model: sq.camera_model.clone(),
|
||||
lens_model: sq.lens_model.clone(),
|
||||
date_from: sq.date_from,
|
||||
date_to: sq.date_to,
|
||||
media_type: sq.media_type.clone(),
|
||||
place,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
use crate::ai::nl_query::StructuredQuery;
|
||||
|
||||
#[test]
|
||||
fn effective_semantic_combines_semantic_and_unmatched() {
|
||||
let sq = StructuredQuery {
|
||||
semantic: Some("sunset".into()),
|
||||
unmatched_tags: vec!["golden hour".into()],
|
||||
..Default::default()
|
||||
};
|
||||
assert_eq!(
|
||||
effective_semantic(&sq).as_deref(),
|
||||
Some("sunset golden hour")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn effective_semantic_none_when_empty() {
|
||||
let sq = StructuredQuery::default();
|
||||
assert_eq!(effective_semantic(&sq), None);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn effective_semantic_unmatched_only() {
|
||||
let sq = StructuredQuery {
|
||||
unmatched_tags: vec!["disco".into()],
|
||||
..Default::default()
|
||||
};
|
||||
assert_eq!(effective_semantic(&sq).as_deref(), Some("disco"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn paginate_handles_out_of_range_offset() {
|
||||
let hits = vec![(0.9, "a".to_string()), (0.8, "b".to_string())];
|
||||
assert_eq!(paginate(&hits, 5, 10).len(), 0);
|
||||
assert_eq!(paginate(&hits, 0, 1).len(), 1);
|
||||
assert_eq!(paginate(&hits, 1, 10).len(), 1);
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user