diff --git a/.env.example b/.env.example index 718f6bd..f7a1004 100644 --- a/.env.example +++ b/.env.example @@ -53,6 +53,33 @@ AGENTIC_CHAT_MAX_ITERATIONS=6 # OPENROUTER_HTTP_REFERER=https://your-site.example # OPENROUTER_APP_TITLE=ImageApi +# ── AI Insights — local backend switch ────────────────────────────────── +# Picks which local LLM stack the server uses for chat, vision describe, +# and embeddings. `ollama` (default) uses the OLLAMA_* settings above; +# `llamacpp` uses the LLAMA_SWAP_* settings below. The switch is global +# and applies to both `backend=local` and `backend=hybrid` (hybrid keeps +# chat on OpenRouter but still uses this stack for the describe pass). +# Don't flip mid-deploy without re-embedding existing index rows — +# mixed vector spaces break similarity search. +# LLM_BACKEND=ollama + +# ── AI Insights — llama.cpp / llama-swap (optional) ───────────────────── +# Set LLAMA_SWAP_URL plus LLM_BACKEND=llamacpp to swap the local stack +# off Ollama. Talks OpenAI-compatible /v1 to a llama-swap proxy fronting +# per-slot llama-server instances. Chat models receive images directly +# via content-parts (vision-capable models assumed); a separate vision +# slot is used only by the describe_photo tool and describe-image utility. +# LLAMA_SWAP_URL=http://localhost:9292/v1 +# LLAMA_SWAP_PRIMARY_MODEL=chat +# Optional dedicated vision slot for describe_image. Defaults to +# PRIMARY_MODEL so describe_photo works without extra config. +# LLAMA_SWAP_VISION_MODEL=vision +# LLAMA_SWAP_EMBEDDING_MODEL=embed +# Comma-separated allowlist surfaced by /insights/models when +# LLM_BACKEND=llamacpp. All report has_vision=true. +# LLAMA_SWAP_ALLOWED_MODELS=chat,vision,embed +# LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180 + # ── AI Insights — sibling services (optional) ─────────────────────────── # Apollo (places, face inference, CLIP encoders). Single-Apollo deploys # typically set only APOLLO_API_BASE_URL and let the face + CLIP diff --git a/CLAUDE.md b/CLAUDE.md index 7e605cc..7f1da76 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -473,7 +473,7 @@ GET /memories?path=...&recursive=true POST /insights/generate (non-agentic single-shot) POST /insights/generate/agentic (tool-calling loop; body: { file_path, backend?, model?, ... }) GET /insights?path=...&library=... -GET /insights/models (local Ollama models + capabilities) +GET /insights/models (local-backend models + capabilities; Ollama OR llama-swap based on LLM_BACKEND) GET /insights/openrouter/models (curated OpenRouter allowlist) POST /insights/rate (thumbs up/down for training data) @@ -631,6 +631,27 @@ OPENROUTER_EMBEDDING_MODEL=openai/text-embedding-3-small # Optional, embeddings OPENROUTER_HTTP_REFERER=https://your-site.example # Optional attribution header OPENROUTER_APP_TITLE=ImageApi # Optional attribution header +# Local LLM backend switch. `ollama` (default) keeps the OLLAMA_* settings +# above; `llamacpp` swaps the entire local stack (chat + vision describe + +# embeddings) over to llama-swap. The switch is global and applies to +# `backend=local` requests and to `backend=hybrid`'s describe pass (hybrid +# chat still goes to OpenRouter). Don't flip mid-deploy without +# re-embedding — mixed vector spaces break similarity search. +LLM_BACKEND=ollama + +# llama.cpp / llama-swap (used when LLM_BACKEND=llamacpp). OpenAI-compatible +# proxy hosting one or more llama-server processes. Chat models receive +# images directly via content-parts (all models assumed vision-capable). +LLAMA_SWAP_URL=http://localhost:9292/v1 # Required when LLM_BACKEND=llamacpp +LLAMA_SWAP_PRIMARY_MODEL=chat # Chat slot id (matches config.yaml) +LLAMA_SWAP_VISION_MODEL= # Dedicated vision slot for describe_image / describe_photo + # tool. Defaults to PRIMARY_MODEL when unset. +LLAMA_SWAP_EMBEDDING_MODEL=embed # Embedding slot id +LLAMA_SWAP_ALLOWED_MODELS=chat,coder # Curated allowlist surfaced by GET /insights/models + # when LLM_BACKEND=llamacpp. All report has_vision=true. + # Empty = picker shows only the configured primary model. +LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180 # Per-request timeout; bump for slow CPU offload + # Insight Chat Continuation AGENTIC_CHAT_MAX_ITERATIONS=6 # Cap on tool-calling iterations per chat turn (default 6) ``` @@ -650,10 +671,50 @@ The `OllamaClient` provides methods to query available models: This allows runtime verification of model availability before generating insights. +**Local backend switch (`LLM_BACKEND`):** + +One env var decides which "local" stack the server runs against — `ollama` +(default) or `llamacpp`. It's global on purpose: chat, vision, and +embeddings all route through the same backend, so the embedding-vector +column in SQLite stays in one vector space. Don't flip mid-deploy without +re-embedding the affected rows — similarity search will collapse. + +- `LLM_BACKEND=ollama`: chat, vision, and embeddings use Ollama. Vision + capability is probed per-model via `/api/show`. +- `LLM_BACKEND=llamacpp`: chat models receive images directly via OpenAI + content-parts (all models assumed vision-capable). Embeddings hit the + `embed` slot. A dedicated `LLAMA_SWAP_VISION_MODEL` slot (defaults to + the chat model) handles `describe_image` for the `describe_photo` tool. + Requires `LLAMA_SWAP_URL`. + +The per-request `backend=hybrid` override is orthogonal: it always sends +chat to OpenRouter (text-only, images are pre-described and inlined), but +the describe + embed passes still route through whichever `LLM_BACKEND` +is configured. + +**Backend dispatch (`ResolvedBackend`):** + +`InsightGenerator::resolve_backend(kind, overrides)` is the single entry +point that builds clients for a request. Returns a `ResolvedBackend` with +two roles: `.chat()` (the agentic/chat client) and `.local()` (local-only +utility calls: rerank, describe_image, embeddings). `BackendKind` is an +enum (`Local` | `Hybrid`) replacing the stringly-typed `"local"` / +`"hybrid"` labels. `SamplingOverrides` groups model/ctx/temp/top_p/top_k/ +min_p per-request overrides. All downstream code (`execute_tool`, +`run_streaming_agentic_loop`, etc.) takes `&ResolvedBackend` rather than +individual client references. + +`GET /insights/models` returns the local-backend models with capabilities +in the same envelope shape regardless of `LLM_BACKEND`: Ollama servers +when `ollama`, llama-swap slots (from `LLAMA_SWAP_ALLOWED_MODELS`) when +`llamacpp`. No `/insights/llamacpp/models` — the picker reads a single +endpoint. + **Hybrid Backend (OpenRouter):** - Per-request opt-in via `backend=hybrid` on `POST /insights/generate/agentic`. -- Local Ollama still describes the image (vision); the description is inlined - into the chat prompt and the agentic loop runs on OpenRouter. +- Vision describe happens before the agentic loop; the description is inlined + into the chat prompt and the agentic loop runs on OpenRouter. Vision + routes through whichever `LLM_BACKEND` is configured. - `request.model` (if provided) overrides `OPENROUTER_DEFAULT_MODEL` for that call. The mobile picker reads from `OPENROUTER_ALLOWED_MODELS`. - No live capability precheck — the operator-curated allowlist is trusted. @@ -661,6 +722,15 @@ This allows runtime verification of model availability before generating insight - `GET /insights/openrouter/models` returns `{ models, default_model, configured }` for client picker UIs. +**Cross-replay matrix (chat continuation):** +- `local → local` allowed (whether served by Ollama or llama-swap; that's + a deploy-time decision, not a request-time one). +- `hybrid → hybrid` allowed. +- `hybrid → local` allowed (the inlined description replays as text). +- `local → hybrid` rejected — the stored transcript has raw images in the + first user message and OpenRouter providers don't accept that shape + consistently. Regenerate the insight in hybrid mode instead. + **Insight Chat Continuation:** After an agentic insight is generated, the full `Vec` transcript is diff --git a/Cargo.lock b/Cargo.lock index 3f6d0c2..5d3e4ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2051,7 +2051,7 @@ dependencies = [ [[package]] name = "image-api" -version = "1.1.0" +version = "1.2.0" dependencies = [ "actix", "actix-cors", diff --git a/Cargo.toml b/Cargo.toml index 7713970..7324001 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "image-api" -version = "1.1.0" +version = "1.2.0" authors = ["Cameron Cordes "] edition = "2024" diff --git a/src/ai/backend.rs b/src/ai/backend.rs new file mode 100644 index 0000000..0515f1c --- /dev/null +++ b/src/ai/backend.rs @@ -0,0 +1,140 @@ +use anyhow::{Result, anyhow}; + +use crate::ai::llm_client::LlmClient; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BackendKind { + Local, + Hybrid, +} + +impl BackendKind { + pub fn parse(s: &str) -> Result { + match s.trim().to_lowercase().as_str() { + "local" | "" => Ok(Self::Local), + "hybrid" => Ok(Self::Hybrid), + other => Err(anyhow!( + "unknown backend '{}'; expected 'local' or 'hybrid'", + other + )), + } + } + + pub fn as_str(&self) -> &'static str { + match self { + Self::Local => "local", + Self::Hybrid => "hybrid", + } + } +} + +impl std::fmt::Display for BackendKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +pub struct SamplingOverrides { + pub model: Option, + pub num_ctx: Option, + pub temperature: Option, + pub top_p: Option, + pub top_k: Option, + pub min_p: Option, +} + +impl SamplingOverrides { + pub fn has_sampling(&self) -> bool { + self.temperature.is_some() + || self.top_p.is_some() + || self.top_k.is_some() + || self.min_p.is_some() + } +} + +pub struct ResolvedBackend { + chat: Box, + local: Box, + pub kind: BackendKind, + /// `true` when the chat model receives images directly (Ollama with + /// vision, or llamacpp). `false` for hybrid where we describe-then-inline. + pub images_inline: bool, +} + +impl ResolvedBackend { + pub fn new( + chat: Box, + local: Box, + kind: BackendKind, + images_inline: bool, + ) -> Self { + Self { + chat, + local, + kind, + images_inline, + } + } + + pub fn chat(&self) -> &dyn LlmClient { + self.chat.as_ref() + } + + pub fn local(&self) -> &dyn LlmClient { + self.local.as_ref() + } + + pub fn model(&self) -> &str { + self.chat.primary_model() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_backend_kind() { + assert_eq!(BackendKind::parse("local").unwrap(), BackendKind::Local); + assert_eq!(BackendKind::parse("hybrid").unwrap(), BackendKind::Hybrid); + assert_eq!(BackendKind::parse(" Local ").unwrap(), BackendKind::Local); + assert_eq!(BackendKind::parse("HYBRID").unwrap(), BackendKind::Hybrid); + assert_eq!(BackendKind::parse("").unwrap(), BackendKind::Local); + assert!(BackendKind::parse("vllm").is_err()); + } + + #[test] + fn backend_kind_as_str_roundtrips() { + assert_eq!( + BackendKind::parse(BackendKind::Local.as_str()).unwrap(), + BackendKind::Local + ); + assert_eq!( + BackendKind::parse(BackendKind::Hybrid.as_str()).unwrap(), + BackendKind::Hybrid + ); + } + + #[test] + fn sampling_overrides_has_sampling() { + let empty = SamplingOverrides { + model: None, + num_ctx: None, + temperature: None, + top_p: None, + top_k: None, + min_p: None, + }; + assert!(!empty.has_sampling()); + + let with_temp = SamplingOverrides { + model: None, + num_ctx: Some(4096), + temperature: Some(0.7), + top_p: None, + top_k: None, + min_p: None, + }; + assert!(with_temp.has_sampling()); + } +} diff --git a/src/ai/handlers.rs b/src/ai/handlers.rs index 0e46057..a7a3720 100644 --- a/src/ai/handlers.rs +++ b/src/ai/handlers.rs @@ -470,7 +470,16 @@ pub async fn generate_agentic_insight_handler( } } -/// GET /insights/models - List available models from both servers with capabilities +/// GET /insights/models - Local-backend models with capabilities. Returns +/// Ollama servers when `LLM_BACKEND=ollama` (default), or llama-swap slots +/// when `LLM_BACKEND=llamacpp`. Same envelope shape either way so the +/// client picker doesn't have to branch on backend kind. +/// +/// For llama-swap: `models` comes verbatim from `LLAMA_SWAP_ALLOWED_MODELS` +/// (no live `/v1/models` probe), `has_vision` is true only for the +/// configured `LLAMA_SWAP_VISION_MODEL` slot id, and `has_tool_calling` is +/// reported as true for every slot (llama-server is launched with `--jinja` +/// by convention — a misconfigured slot surfaces as a chat-call error). #[get("/insights/models")] pub async fn get_available_models_handler( _claims: Claims, @@ -478,6 +487,29 @@ pub async fn get_available_models_handler( ) -> impl Responder { log::debug!("Fetching available models with capabilities"); + if crate::ai::local_backend_is_llamacpp() + && let Some(lc) = app_state.llamacpp.as_ref() + { + let models: Vec = app_state + .llamacpp_allowed_models + .iter() + .map(|name| ModelCapabilities { + name: name.clone(), + has_vision: true, + has_tool_calling: true, + }) + .collect(); + let primary = ServerModels { + url: lc.base_url.clone(), + models, + default_model: lc.primary_model.clone(), + }; + return HttpResponse::Ok().json(AvailableModelsResponse { + primary, + fallback: None, + }); + } + let ollama_client = &app_state.ollama; // Fetch models with capabilities from primary server diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index b2a7af8..9837733 100644 --- a/src/ai/insight_chat.rs +++ b/src/ai/insight_chat.rs @@ -6,10 +6,9 @@ use std::collections::HashMap; use std::sync::{Arc, Mutex}; use tokio::sync::Mutex as TokioMutex; +use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides}; use crate::ai::insight_generator::InsightGenerator; -use crate::ai::llm_client::{ChatMessage, LlmClient, LlmStreamEvent, Tool}; -use crate::ai::ollama::OllamaClient; -use crate::ai::openrouter::OpenRouterClient; +use crate::ai::llm_client::{ChatMessage, LlmStreamEvent, Tool}; use crate::database::InsightDao; use crate::database::models::InsertPhotoInsight; use crate::otel::global_tracer; @@ -91,8 +90,6 @@ pub struct ChatTurnResult { #[derive(Clone)] pub struct InsightChatService { generator: Arc, - ollama: OllamaClient, - openrouter: Option>, insight_dao: Arc>>, chat_locks: ChatLockMap, } @@ -100,15 +97,11 @@ pub struct InsightChatService { impl InsightChatService { pub fn new( generator: Arc, - ollama: OllamaClient, - openrouter: Option>, insight_dao: Arc>>, chat_locks: ChatLockMap, ) -> Self { Self { generator, - ollama, - openrouter, insight_dao, chat_locks, } @@ -303,24 +296,10 @@ impl InsightChatService { .map(|s| s.trim().to_lowercase()) .filter(|s| !s.is_empty()) .unwrap_or_else(|| stored_backend.clone()); - if !matches!(effective_backend.as_str(), "local" | "hybrid") { - bail!( - "unknown backend '{}'; expected 'local' or 'hybrid'", - effective_backend - ); - } - if stored_backend == "local" && effective_backend == "hybrid" { - bail!( - "switching from local to hybrid mid-chat isn't supported yet; \ - regenerate the insight in hybrid mode if you want OpenRouter chat" - ); - } - let is_hybrid = effective_backend == "hybrid"; - span.set_attribute(KeyValue::new("backend", effective_backend.clone())); + validate_cross_replay(&stored_backend, &effective_backend)?; + let kind = BackendKind::parse(&effective_backend)?; + span.set_attribute(KeyValue::new("backend", kind.as_str())); - // 4. Build the chat backend client. Ollama in local mode, a freshly - // cloned OpenRouter client in hybrid mode (clone so per-request - // sampling/model overrides don't leak into shared state). let max_iterations = req .max_iterations .unwrap_or(DEFAULT_MAX_ITERATIONS) @@ -328,91 +307,38 @@ impl InsightChatService { span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64)); let stored_model = insight.model_version.clone(); - let custom_model = req - .model - .clone() - .or_else(|| Some(stored_model.clone())) - .filter(|m| !m.is_empty()); - - let mut ollama_client = self.ollama.clone(); - let mut openrouter_client: Option = None; - - if is_hybrid { - let arc = self.openrouter.as_ref().ok_or_else(|| { - anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured") - })?; - let mut c: OpenRouterClient = (**arc).clone(); - if let Some(ref m) = custom_model { - c.primary_model = m.clone(); - } - if req.temperature.is_some() - || req.top_p.is_some() - || req.top_k.is_some() - || req.min_p.is_some() - { - c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); - } - if let Some(ctx) = req.num_ctx { - c.set_num_ctx(Some(ctx)); - } - openrouter_client = Some(c); - } else { - // Local-mode model swap. Build a new client when the chat model - // differs from the configured one (mirrors the agentic pattern). - if let Some(ref m) = custom_model - && m != &self.ollama.primary_model - { - ollama_client = OllamaClient::new( - self.ollama.primary_url.clone(), - self.ollama.fallback_url.clone(), - m.clone(), - Some(m.clone()), - ); - } - if req.temperature.is_some() - || req.top_p.is_some() - || req.top_k.is_some() - || req.min_p.is_some() - { - ollama_client.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); - } - if let Some(ctx) = req.num_ctx { - ollama_client.set_num_ctx(Some(ctx)); - } - } - - let chat_backend: &dyn LlmClient = if let Some(ref c) = openrouter_client { - c - } else { - &ollama_client + let overrides = SamplingOverrides { + model: req + .model + .clone() + .or_else(|| Some(stored_model.clone())) + .filter(|m| !m.is_empty()), + num_ctx: req.num_ctx, + temperature: req.temperature, + top_p: req.top_p, + top_k: req.top_k, + min_p: req.min_p, }; - let model_used = chat_backend.primary_model().to_string(); + let backend = self.generator.resolve_backend(kind, &overrides).await?; + let model_used = backend.model().to_string(); span.set_attribute(KeyValue::new("model", model_used.clone())); - // 5. Decide vision + tool set. In hybrid we always omit - // `describe_photo` (matches the original generation flow). In - // local we trust the stored history's first-user shape: if it - // carries `images`, the original model was vision-capable, and - // we keep `describe_photo` available. + // 5. Decide vision + tool set. In hybrid (describe-then-inline) mode + // we omit `describe_photo`. Otherwise trust the stored history: + // if the first user message carries images, describe_photo stays. let local_first_user_has_image = messages .iter() .find(|m| m.role == "user") .and_then(|m| m.images.as_ref()) .map(|imgs| !imgs.is_empty()) .unwrap_or(false); - let offer_describe_tool = !is_hybrid && local_first_user_has_image; - // current_gate_opts(has_vision) sets gate_opts.has_vision = has_vision - // and probes the per-table presence flags. Pass `offer_describe_tool` - // directly — the `!is_hybrid && local_first_user_has_image` decision - // is the chat-path's vision predicate. + let offer_describe_tool = backend.images_inline && local_first_user_has_image; let gate_opts = self.generator.current_gate_opts_for_persona( offer_describe_tool, Some((req.user_id, &active_persona)), ); let tools = InsightGenerator::build_tool_definitions(gate_opts); - // Image base64 only needed when describe_photo is on the menu. Load - // lazily to avoid disk IO when the loop never invokes it. let image_base64: Option = if offer_describe_tool { self.generator.load_image_as_base64(&normalized).ok() } else { @@ -461,13 +387,13 @@ impl InsightChatService { iterations_used = iteration + 1; log::info!("Chat iteration {}/{}", iterations_used, max_iterations); - let (response, prompt_tokens, eval_tokens) = chat_backend + let (response, prompt_tokens, eval_tokens) = backend + .chat() .chat_with_tools(messages.clone(), tools.clone()) .await?; last_prompt_eval_count = prompt_tokens; last_eval_count = eval_tokens; - // Ollama rejects non-object tool-call arguments on replay. let mut response = response; if let Some(ref mut tcs) = response.tool_calls { for tc in tcs.iter_mut() { @@ -495,13 +421,11 @@ impl InsightChatService { .execute_tool( &tool_call.function.name, &tool_call.function.arguments, - &ollama_client, + &backend, &image_base64, &normalized, req.user_id, &active_persona, - &model_used, - &effective_backend, &loop_cx, ) .await; @@ -515,8 +439,6 @@ impl InsightChatService { } if final_content.is_empty() { - // The model never produced a final answer; ask once more without - // tools to force a textual reply. log::info!( "Chat loop exhausted after {} iterations, requesting final answer", iterations_used @@ -524,7 +446,8 @@ impl InsightChatService { messages.push(ChatMessage::user( "Please write your final answer now without calling any more tools.", )); - let (final_response, prompt_tokens, eval_tokens) = chat_backend + let (final_response, prompt_tokens, eval_tokens) = backend + .chat() .chat_with_tools(messages.clone(), vec![]) .await?; last_prompt_eval_count = prompt_tokens; @@ -560,7 +483,8 @@ impl InsightChatService { Capture the key moment or theme. Return ONLY the title, nothing else.", final_content ); - let title_raw = chat_backend + let title_raw = backend + .chat() .generate( &title_prompt, Some( @@ -585,7 +509,7 @@ impl InsightChatService { model_version: model_used.clone(), is_current: true, training_messages: Some(json), - backend: effective_backend.clone(), + backend: kind.as_str().to_string(), fewshot_source_ids: None, content_hash: None, }; @@ -610,7 +534,7 @@ impl InsightChatService { prompt_eval_count: last_prompt_eval_count, eval_count: last_eval_count, amended_insight_id, - backend_used: effective_backend, + backend_used: kind.as_str().to_string(), model_used, }) } @@ -799,19 +723,8 @@ impl InsightChatService { .map(|s| s.trim().to_lowercase()) .filter(|s| !s.is_empty()) .unwrap_or_else(|| stored_backend.clone()); - if !matches!(effective_backend.as_str(), "local" | "hybrid") { - bail!( - "unknown backend '{}'; expected 'local' or 'hybrid'", - effective_backend - ); - } - if stored_backend == "local" && effective_backend == "hybrid" { - bail!( - "switching from local to hybrid mid-chat isn't supported yet; \ - regenerate the insight in hybrid mode if you want OpenRouter chat" - ); - } - let is_hybrid = effective_backend == "hybrid"; + let kind = BackendKind::parse(&effective_backend)?; + validate_cross_replay(&stored_backend, kind.as_str())?; let max_iterations = req .max_iterations @@ -819,27 +732,31 @@ impl InsightChatService { .clamp(1, env_max_iterations()); let stored_model = insight.model_version.clone(); - let custom_model = req - .model - .clone() - .or_else(|| Some(stored_model.clone())) - .filter(|m| !m.is_empty()); + let overrides = SamplingOverrides { + model: req + .model + .clone() + .or_else(|| Some(stored_model.clone())) + .filter(|m| !m.is_empty()), + num_ctx: req.num_ctx, + temperature: req.temperature, + top_p: req.top_p, + top_k: req.top_k, + min_p: req.min_p, + }; + let backend = self.generator.resolve_backend(kind, &overrides).await?; + let model_used = backend.model().to_string(); - let (chat_backend_holder, ollama_client) = - self.build_chat_clients(is_hybrid, custom_model.as_deref(), &req)?; - let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref(); - let model_used = chat_backend.primary_model().to_string(); - - // Tool set — local mode + first user turn carries an image → - // offer describe_photo. Hybrid: visual description was inlined - // when the insight was bootstrapped, no describe tool needed. + // Tool set — images_inline mode + first user turn carries an image → + // offer describe_photo. Describe-then-inline mode (hybrid only): + // visual description was inlined at bootstrap, no describe tool needed. let local_first_user_has_image = messages .iter() .find(|m| m.role == "user") .and_then(|m| m.images.as_ref()) .map(|imgs| !imgs.is_empty()) .unwrap_or(false); - let offer_describe_tool = !is_hybrid && local_first_user_has_image; + let offer_describe_tool = backend.images_inline && local_first_user_has_image; let gate_opts = self.generator.current_gate_opts_for_persona( offer_describe_tool, Some((req.user_id, &active_persona)), @@ -870,16 +787,13 @@ impl InsightChatService { let outcome = self .run_streaming_agentic_loop( - chat_backend, - &ollama_client, + &backend, &mut messages, tools, &image_base64, &normalized, req.user_id, &active_persona, - &model_used, - &effective_backend, max_iterations, &tx, ) @@ -907,7 +821,8 @@ impl InsightChatService { let mut amended_insight_id: Option = None; if req.amend { - let title = self.generate_title(chat_backend, &final_content).await?; + let (title, body) = crate::ai::insight_generator::parse_title_body(&final_content); + let final_content = body; // Amended rows intentionally do not inherit the parent's // `fewshot_source_ids`. The parent's few-shot influence is still @@ -923,7 +838,7 @@ impl InsightChatService { model_version: model_used.clone(), is_current: true, training_messages: Some(json), - backend: effective_backend.clone(), + backend: kind.as_str().to_string(), fewshot_source_ids: None, content_hash: None, }; @@ -949,7 +864,7 @@ impl InsightChatService { eval_tokens: last_eval_count, num_ctx: req.num_ctx, amended_insight_id, - backend_used: effective_backend, + backend_used: kind.as_str().to_string(), model_used, }) .await; @@ -975,18 +890,23 @@ impl InsightChatService { .filter(|s| !s.trim().is_empty()) .unwrap_or_else(|| "default".to_string()); let effective_backend = resolve_bootstrap_backend(req.backend.as_deref())?; - let is_hybrid = effective_backend == "hybrid"; + let kind = BackendKind::parse(&effective_backend)?; let max_iterations = req .max_iterations .unwrap_or(DEFAULT_MAX_ITERATIONS) .clamp(1, env_max_iterations()); - let custom_model = req.model.clone().filter(|m| !m.is_empty()); - let (chat_backend_holder, ollama_client) = - self.build_chat_clients(is_hybrid, custom_model.as_deref(), &req)?; - let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref(); - let model_used = chat_backend.primary_model().to_string(); + let overrides = SamplingOverrides { + model: req.model.clone().filter(|m| !m.is_empty()), + num_ctx: req.num_ctx, + temperature: req.temperature, + top_p: req.top_p, + top_k: req.top_k, + min_p: req.min_p, + }; + let backend = self.generator.resolve_backend(kind, &overrides).await?; + let model_used = backend.model().to_string(); // Load image bytes once. RAW preview fallback is handled inside // load_image_as_base64. Errors degrade silently — a chat that @@ -1007,18 +927,17 @@ impl InsightChatService { _ => None, }); - // Hybrid backend: pre-describe the image via local Ollama vision - // so OpenRouter chat models (which can't see images directly) get - // the visual description as text. Mirrors the same pre-describe - // pass that `generate_agentic_insight_for_photo` does for hybrid. - let visual_block = if is_hybrid { + // Describe-then-inline (hybrid only): pre-describe the image so a + // text-only chat model gets the visual description inline. + // images_inline backends send images directly to the chat model. + let visual_block = if !backend.images_inline { match image_base64.as_deref() { - Some(b64) => match self.ollama.describe_image(b64).await { + Some(b64) => match backend.local().describe_image(b64).await { Ok(desc) => { format!("Visual description (from local vision model):\n{}\n", desc) } Err(e) => { - log::warn!("hybrid bootstrap: local describe_image failed: {}", e); + log::warn!("{} bootstrap: describe_image failed: {}", kind.as_str(), e); String::new() } }, @@ -1028,10 +947,10 @@ impl InsightChatService { String::new() }; - // Tool gates. Local + image present → expose describe_photo so - // the chat model can re-look at the photo on demand. Hybrid: + // Tool gates. images_inline + image present → expose describe_photo so + // the chat model can re-look at the photo on demand. Non-inline: // already inlined, no tool needed. - let offer_describe_tool = !is_hybrid && image_base64.is_some(); + let offer_describe_tool = backend.images_inline && image_base64.is_some(); let gate_opts = self.generator.current_gate_opts_for_persona( offer_describe_tool, Some((req.user_id, &active_persona)), @@ -1057,23 +976,22 @@ impl InsightChatService { ); let system_msg = ChatMessage::system(system_content); let mut user_msg = ChatMessage::user(req.user_message.clone()); - if !is_hybrid && let Some(ref img) = image_base64 { - user_msg.images = Some(vec![img.clone()]); + if backend.images_inline { + if let Some(ref img) = image_base64 { + user_msg.images = Some(vec![img.clone()]); + } } let mut messages = vec![system_msg, user_msg]; let outcome = self .run_streaming_agentic_loop( - chat_backend, - &ollama_client, + &backend, &mut messages, tools, &image_base64, &normalized, req.user_id, &active_persona, - &model_used, - &effective_backend, max_iterations, &tx, ) @@ -1086,7 +1004,7 @@ impl InsightChatService { final_content, } = outcome; - let title = self.generate_title(chat_backend, &final_content).await?; + let (title, body) = crate::ai::insight_generator::parse_title_body(&final_content); let json = serde_json::to_string(&messages) .map_err(|e| anyhow!("failed to serialize chat history: {}", e))?; @@ -1094,12 +1012,12 @@ impl InsightChatService { library_id: req.library_id, file_path: normalized.clone(), title, - summary: final_content, + summary: body, generated_at: Utc::now().timestamp(), model_version: model_used.clone(), is_current: true, training_messages: Some(json), - backend: effective_backend.clone(), + backend: kind.as_str().to_string(), fewshot_source_ids: None, content_hash: None, }; @@ -1122,7 +1040,7 @@ impl InsightChatService { eval_tokens: last_eval_count, num_ctx: req.num_ctx, amended_insight_id: Some(stored.id), - backend_used: effective_backend, + backend_used: kind.as_str().to_string(), model_used, }) .await; @@ -1130,105 +1048,19 @@ impl InsightChatService { Ok(()) } - /// Set up chat clients (Ollama + optional OpenRouter) shared by - /// bootstrap and continuation. Returns the chat-side backend client - /// (boxed because hybrid and local return different concrete types) - /// and the Ollama client used for describe-image / local tool calls. - fn build_chat_clients( - &self, - is_hybrid: bool, - custom_model: Option<&str>, - req: &ChatTurnRequest, - ) -> Result<(Box, OllamaClient)> { - let mut ollama_client = self.ollama.clone(); - - if is_hybrid { - let arc = self.openrouter.as_ref().ok_or_else(|| { - anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured") - })?; - let mut c: OpenRouterClient = (**arc).clone(); - if let Some(m) = custom_model { - c.primary_model = m.to_string(); - } - if req.temperature.is_some() - || req.top_p.is_some() - || req.top_k.is_some() - || req.min_p.is_some() - { - c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); - } - if let Some(ctx) = req.num_ctx { - c.set_num_ctx(Some(ctx)); - } - return Ok((Box::new(c), ollama_client)); - } - - if let Some(m) = custom_model - && m != self.ollama.primary_model - { - ollama_client = OllamaClient::new( - self.ollama.primary_url.clone(), - self.ollama.fallback_url.clone(), - m.to_string(), - Some(m.to_string()), - ); - } - if req.temperature.is_some() - || req.top_p.is_some() - || req.top_k.is_some() - || req.min_p.is_some() - { - ollama_client.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); - } - if let Some(ctx) = req.num_ctx { - ollama_client.set_num_ctx(Some(ctx)); - } - Ok((Box::new(ollama_client.clone()), ollama_client)) - } - - /// Generate a short title via the same chat backend so voice stays - /// consistent with the body. Mirrors generate_agentic_insight_for_photo's - /// titling pass. - async fn generate_title( - &self, - chat_backend: &dyn LlmClient, - final_content: &str, - ) -> Result { - let title_prompt = format!( - "Create a short title (maximum 8 words) for the following journal entry:\n\n{}\n\n\ - Capture the key moment or theme. Return ONLY the title, nothing else.", - final_content - ); - let title_raw = chat_backend - .generate( - &title_prompt, - Some( - "You are my long term memory assistant. Use only the information provided. Do not invent details.", - ), - None, - ) - .await?; - Ok(title_raw.trim().trim_matches('"').to_string()) - } - /// Drive the agentic loop with streaming SSE events. Shared between /// bootstrap and continuation. Mutates `messages` in place (response /// turns + tool results are appended) and returns counters + the /// final assistant content. async fn run_streaming_agentic_loop( &self, - chat_backend: &dyn LlmClient, - ollama_client: &OllamaClient, + backend: &ResolvedBackend, messages: &mut Vec, tools: Vec, image_base64: &Option, normalized: &str, user_id: i32, active_persona: &str, - // Provenance — stamped onto any store_fact tool call made - // during this loop. Mirrors the non-streaming chat path. - model_used: &str, - effective_backend: &str, max_iterations: usize, tx: &tokio::sync::mpsc::Sender, ) -> Result { @@ -1247,7 +1079,8 @@ impl InsightChatService { }) .await; - let mut stream = chat_backend + let mut stream = backend + .chat() .chat_with_tools_stream(messages.clone(), tools.clone()) .await?; @@ -1304,13 +1137,11 @@ impl InsightChatService { .execute_tool( &tool_call.function.name, &tool_call.function.arguments, - ollama_client, + backend, image_base64, normalized, user_id, active_persona, - model_used, - effective_backend, &cx, ) .await; @@ -1345,7 +1176,8 @@ impl InsightChatService { messages.push(ChatMessage::user( "Please write your final answer now without calling any more tools.", )); - let mut stream = chat_backend + let mut stream = backend + .chat() .chat_with_tools_stream(messages.clone(), vec![]) .await?; let mut final_message: Option = None; @@ -1459,6 +1291,34 @@ fn resolve_date_taken_for_context( .map(|dt| dt.format("%Y-%m-%d").to_string()) } +/// Validate a stored→effective backend transition for a chat continuation. +/// Continuation runs against a transcript that was generated with a specific +/// backend; the only blocked transition is `local → hybrid`, because the +/// stored transcript has images embedded in the first user message and the +/// hybrid path (OpenRouter chat with describe-then-inline) can't replay +/// raw image bytes through OpenRouter consistently across providers. +/// `hybrid → local` is allowed (the inlined description replays verbatim +/// as text). +/// +/// Whether "local" routes through Ollama or llama-swap is decided at +/// startup by `LLM_BACKEND`; both share the same transcript shape from +/// the chat-replay perspective. +fn validate_cross_replay(stored: &str, effective: &str) -> Result<()> { + if !matches!(effective, "local" | "hybrid") { + bail!( + "unknown backend '{}'; expected 'local' or 'hybrid'", + effective + ); + } + if stored == "local" && effective == "hybrid" { + bail!( + "switching from local to hybrid mid-chat isn't supported; \ + regenerate the insight in hybrid mode if you want OpenRouter chat" + ); + } + Ok(()) +} + /// Pick the backend label for bootstrap. Bootstrap has no stored insight /// to defer to (that's continuation's behaviour), so the default is /// `"local"`. Returns an error if the supplied label is non-empty but @@ -2082,10 +1942,40 @@ mod tests { #[test] fn bootstrap_backend_rejects_unknown_label() { - let err = resolve_bootstrap_backend(Some("openrouter")).unwrap_err(); - let msg = format!("{}", err); - assert!(msg.contains("unknown backend")); - assert!(msg.contains("openrouter")); + // `llamacpp` is no longer a per-request backend value — it's chosen + // at deploy time via `LLM_BACKEND`. + for label in &["openrouter", "llamacpp", "ollama"] { + let err = resolve_bootstrap_backend(Some(label)).unwrap_err(); + let msg = format!("{}", err); + assert!(msg.contains("unknown backend"), "label={}", label); + } + } + + #[test] + fn cross_replay_rejects_local_to_hybrid() { + let err = validate_cross_replay("local", "hybrid").unwrap_err(); + assert!(format!("{}", err).contains("local to hybrid")); + } + + #[test] + fn cross_replay_allows_supported_transitions() { + assert!(validate_cross_replay("local", "local").is_ok()); + assert!(validate_cross_replay("hybrid", "hybrid").is_ok()); + // Hybrid → local replays the inlined description as plain text. + assert!(validate_cross_replay("hybrid", "local").is_ok()); + } + + #[test] + fn cross_replay_rejects_unknown_effective() { + // Both "openrouter" and the former "llamacpp" value are unknown now. + for label in &["openrouter", "llamacpp"] { + let err = validate_cross_replay("local", label).unwrap_err(); + assert!( + format!("{}", err).contains("unknown backend"), + "label={}", + label + ); + } } #[test] diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index 2e2da33..8e39c59 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -10,6 +10,8 @@ use std::io::Cursor; use std::sync::{Arc, Mutex}; use crate::ai::apollo_client::{ApolloClient, ApolloPlace}; +use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides}; +use crate::ai::llamacpp::LlamaCppClient; use crate::ai::llm_client::LlmClient; use crate::ai::ollama::{ChatMessage, OllamaClient, Tool}; use crate::ai::openrouter::OpenRouterClient; @@ -26,6 +28,42 @@ use crate::otel::global_tracer; use crate::tags::TagDao; use crate::utils::{earliest_fs_time, normalize_path}; +/// Parse a "Title: ...\n\n" response into (title, body). +/// Falls back to the first sentence as the title if the model didn't +/// follow the format. +pub(crate) fn parse_title_body(raw: &str) -> (String, String) { + let trimmed = raw.trim(); + + // Try "Title: \n\n<body>" or "Title: <title>\n<body>" + if let Some(rest) = trimmed + .strip_prefix("Title:") + .or_else(|| trimmed.strip_prefix("title:")) + { + let rest = rest.trim_start(); + if let Some(split_pos) = rest.find("\n\n").or_else(|| rest.find('\n')) { + let title = rest[..split_pos].trim(); + let body = rest[split_pos..].trim(); + if !title.is_empty() && !body.is_empty() { + return (title.to_string(), body.to_string()); + } + } + } + + // Fallback: first sentence (up to first `. ` or `.\n`) becomes the title + if let Some(pos) = trimmed.find(". ").or_else(|| trimmed.find(".\n")) { + let title = &trimmed[..pos]; + let body = trimmed[pos + 1..].trim(); + if title.len() <= 100 && !body.is_empty() { + return (title.to_string(), body.to_string()); + } + } + + // Last resort: truncate to 60 chars for title, full text as body + let title: String = trimmed.chars().take(60).collect(); + let title = title.trim_end().to_string(); + (title, trimmed.to_string()) +} + /// Combine an optional personal Apollo Place with an optional Nominatim /// reverse-geocoded city, falling back to bare coordinates when neither /// resolves. Free function so we can test it cheaply without spinning up @@ -68,6 +106,9 @@ pub struct InsightGenerator { /// Optional OpenRouter client, used when `backend=hybrid` is requested. /// `None` when `OPENROUTER_API_KEY` is not configured. openrouter: Option<Arc<OpenRouterClient>>, + /// Optional llama-swap client, used when `backend=llamacpp` is requested. + /// `None` when `LLAMA_SWAP_URL` is not configured. + llamacpp: Option<Arc<LlamaCppClient>>, sms_client: SmsApiClient, /// Optional integration with Apollo's user-defined Places. When the /// integration is disabled (`APOLLO_API_BASE_URL` unset), every @@ -120,6 +161,7 @@ impl InsightGenerator { pub fn new( ollama: OllamaClient, openrouter: Option<Arc<OpenRouterClient>>, + llamacpp: Option<Arc<LlamaCppClient>>, sms_client: SmsApiClient, apollo_client: ApolloClient, insight_dao: Arc<Mutex<Box<dyn InsightDao>>>, @@ -137,6 +179,7 @@ impl InsightGenerator { Self { ollama, openrouter, + llamacpp, sms_client, apollo_client, insight_dao, @@ -465,8 +508,11 @@ impl InsightGenerator { log::info!("RAG QUERY: {}", query); log::info!("========================================"); - // Generate embedding for the query - let query_embedding = self.ollama.generate_embedding(&query).await?; + // Generate embedding for the query via the configured local backend + // (`LLM_BACKEND` switch). Must match the backend that populated the + // daily-summary embeddings or similarity search will be garbage. + let query_embedding = + crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &query).await?; // Search for similar daily summaries with time-based weighting // This prioritizes summaries temporally close to the query date @@ -557,7 +603,7 @@ impl InsightGenerator { let calendar_cx = parent_cx.with_span(span); let query_embedding = if let Some(loc) = location { - match self.ollama.generate_embedding(loc).await { + match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), loc).await { Ok(emb) => Some(emb), Err(e) => { log::warn!("Failed to generate embedding for location '{}': {}", loc, e); @@ -728,16 +774,17 @@ impl InsightGenerator { ) }; - let query_embedding = match self.ollama.generate_embedding(&query_text).await { - Ok(emb) => emb, - Err(e) => { - log::warn!("Failed to generate search embedding: {}", e); - search_cx.span().set_status(Status::Error { - description: e.to_string().into(), - }); - return Ok(None); - } - }; + let query_embedding = + match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &query_text).await { + Ok(emb) => emb, + Err(e) => { + log::warn!("Failed to generate search embedding: {}", e); + search_cx.span().set_status(Status::Error { + description: e.to_string().into(), + }); + return Ok(None); + } + }; let searches = { let mut dao = self @@ -1583,29 +1630,27 @@ Return ONLY the summary, nothing else."#, &self, tool_name: &str, arguments: &serde_json::Value, - ollama: &OllamaClient, + backend: &ResolvedBackend, image_base64: &Option<String>, file_path: &str, user_id: i32, persona_id: &str, - // Provenance — written into entity_facts.created_by_* when - // the loop calls store_fact. The caller knows the actual - // chat-runtime model and backend (which may differ from - // ollama.primary_model in hybrid mode where chat lives on - // OpenRouter while Ollama still handles vision). - model: &str, - backend: &str, cx: &opentelemetry::Context, ) -> String { + let model = backend.model(); + let backend_label = backend.kind.as_str(); let result = match tool_name { - "search_rag" => self.tool_search_rag(arguments, ollama, cx).await, + "search_rag" => self.tool_search_rag(arguments, backend.local(), cx).await, "search_messages" => self.tool_search_messages(arguments, cx).await, "get_sms_messages" => self.tool_get_sms_messages(arguments, cx).await, "get_calendar_events" => self.tool_get_calendar_events(arguments, cx).await, "get_location_history" => self.tool_get_location_history(arguments, cx).await, "get_file_tags" => self.tool_get_file_tags(arguments, cx).await, "get_faces_in_photo" => self.tool_get_faces_in_photo(arguments, cx).await, - "describe_photo" => self.tool_describe_photo(ollama, image_base64).await, + "describe_photo" => { + self.tool_describe_photo(backend.local(), image_base64) + .await + } "reverse_geocode" => self.tool_reverse_geocode(arguments).await, "get_personal_place_at" => self.tool_get_personal_place_at(arguments).await, "recall_entities" => self.tool_recall_entities(arguments, cx).await, @@ -1613,19 +1658,25 @@ Return ONLY the summary, nothing else."#, self.tool_recall_facts_for_photo(arguments, user_id, persona_id, cx) .await } - "store_entity" => self.tool_store_entity(arguments, ollama, cx).await, + "store_entity" => self.tool_store_entity(arguments, cx).await, "store_fact" => { self.tool_store_fact( - arguments, file_path, user_id, persona_id, model, backend, cx, + arguments, + file_path, + user_id, + persona_id, + model, + backend_label, + cx, ) .await } "update_fact" => { - self.tool_update_fact(arguments, user_id, persona_id, model, backend, cx) + self.tool_update_fact(arguments, user_id, persona_id, model, backend_label, cx) .await } "supersede_fact" => { - self.tool_supersede_fact(arguments, user_id, persona_id, model, backend, cx) + self.tool_supersede_fact(arguments, user_id, persona_id, model, backend_label, cx) .await } "get_current_datetime" => Self::tool_get_current_datetime(), @@ -1643,7 +1694,7 @@ Return ONLY the summary, nothing else."#, async fn tool_search_rag( &self, args: &serde_json::Value, - ollama: &OllamaClient, + local: &dyn LlmClient, _cx: &opentelemetry::Context, ) -> String { let query = match args.get("query").and_then(|v| v.as_str()) { @@ -1707,7 +1758,7 @@ Return ONLY the summary, nothing else."#, }; let final_results = if rerank_enabled && results.len() > limit { - match self.rerank_with_llm(&query, &results, limit, ollama).await { + match self.rerank_with_llm(&query, &results, limit, local).await { Ok(reordered) => reordered, Err(e) => { log::warn!("rerank failed, using vector order: {}", e); @@ -1733,7 +1784,7 @@ Return ONLY the summary, nothing else."#, query: &str, candidates: &[String], limit: usize, - ollama: &OllamaClient, + local: &dyn LlmClient, ) -> Result<Vec<String>> { let query_preview: String = query.chars().take(60).collect(); log::info!( @@ -1771,14 +1822,9 @@ Return ONLY the summary, nothing else."#, ); let started = std::time::Instant::now(); - let response = ollama - .generate_no_think( - &prompt, - Some( - "You are a terse relevance ranker. You output only numbers separated by commas.", - ), - ) - .await?; + let system = + Some("You are a terse relevance ranker. You output only numbers separated by commas."); + let response = local.generate(&prompt, system, None).await?; log::info!( "rerank: finished in {} ms (prompt={} chars)", started.elapsed().as_millis(), @@ -1925,6 +1971,12 @@ Return ONLY the summary, nothing else."#, .unwrap_or(20) .clamp(1, 50) as usize; let contact_id = args.get("contact_id").and_then(|v| v.as_i64()); + let contact = args + .get("contact") + .and_then(|v| v.as_str()) + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .filter(|_| contact_id.is_none()); let start_ts = args.get("start_ts").and_then(|v| v.as_i64()); let end_ts = args.get("end_ts").and_then(|v| v.as_i64()); let is_mms = args.get("is_mms").and_then(|v| v.as_bool()); @@ -1932,10 +1984,11 @@ Return ONLY the summary, nothing else."#, let has_date_filter = start_ts.is_some() || end_ts.is_some(); log::info!( - "tool_search_messages: query='{}', mode={}, contact_id={:?}, range=[{:?}, {:?}], is_mms={:?}, has_media={:?}, limit={}", + "tool_search_messages: query='{}', mode={}, contact_id={:?}, contact={:?}, range=[{:?}, {:?}], is_mms={:?}, has_media={:?}, limit={}", query, mode, contact_id, + contact, start_ts, end_ts, is_mms, @@ -1950,6 +2003,7 @@ Return ONLY the summary, nothing else."#, mode: mode.as_str(), limit: user_limit, contact_id, + contact, date_from: start_ts, date_to: end_ts, is_mms, @@ -2350,16 +2404,14 @@ Return ONLY the summary, nothing else."#, out } - /// Tool: describe_photo — generate a visual description of the photo async fn tool_describe_photo( &self, - ollama: &OllamaClient, + local: &dyn LlmClient, image_base64: &Option<String>, ) -> String { log::info!("tool_describe_photo: generating visual description"); - match image_base64 { - Some(img) => match ollama.generate_photo_description(img).await { + Some(img) => match local.describe_image(img).await { Ok(desc) => desc, Err(e) => format!("Error describing photo: {}", e), }, @@ -2602,11 +2654,12 @@ Return ONLY the summary, nothing else."#, } } - /// Tool: store_entity — upsert an entity into the knowledge memory + /// Tool: store_entity — upsert an entity into the knowledge memory. + /// Embeddings go through the configured local backend (`LLM_BACKEND`), + /// independent of the per-request chat backend in the caller. async fn tool_store_entity( &self, args: &serde_json::Value, - ollama: &OllamaClient, cx: &opentelemetry::Context, ) -> String { use crate::database::models::InsertEntity; @@ -2666,18 +2719,20 @@ Return ONLY the summary, nothing else."#, .collect() }; - // Generate embedding for name + description (best-effort) + // Generate embedding for name + description (best-effort) via the + // configured local backend. let embed_text = format!("{} {}", name, description); - let embedding: Option<Vec<u8>> = match ollama.generate_embedding(&embed_text).await { - Ok(vec) => { - let bytes: Vec<u8> = vec.iter().flat_map(|f| f.to_le_bytes()).collect(); - Some(bytes) - } - Err(e) => { - log::warn!("Embedding generation failed for entity '{}': {}", name, e); - None - } - }; + let embedding: Option<Vec<u8>> = + match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &embed_text).await { + Ok(vec) => { + let bytes: Vec<u8> = vec.iter().flat_map(|f| f.to_le_bytes()).collect(); + Some(bytes) + } + Err(e) => { + log::warn!("Embedding generation failed for entity '{}': {}", name, e); + None + } + }; let now = chrono::Utc::now().timestamp(); let insert = InsertEntity { @@ -3025,28 +3080,31 @@ Return ONLY the summary, nothing else."#, } tools.push(Tool::function( - "search_messages", +"search_messages", "Search SMS/MMS messages — bodies and (for MMS) attachment text + filenames. \ - Modes: `fts5` (keyword + phrase + prefix + AND/OR/NOT + NEAR proximity), \ - `semantic` (embedding similarity, requires generated embeddings), `hybrid` (RRF merge, recommended; \ - degrades to fts5 when embeddings absent). Optional filters: `start_ts` / `end_ts` (real-UTC unix \ - seconds), `contact_id`, `is_mms` (true = MMS only, false = SMS only), `has_media` (true = messages \ - with image/video/audio attachments only). For pure date / contact browsing without keywords, prefer \ - `get_sms_messages`. \ - \n\nFTS5 query syntax (works in fts5 + hybrid modes):\n\ - - Phrase: `\"trader joe's\"` — exact word sequence (use double quotes).\n\ - - Prefix: `restaur*` — matches restaurant, restaurants, restauranteur, ….\n\ - - Boolean: `dinner AND tahoe`, `wedding OR reception OR ceremony`, `vacation NOT work` (operators must be UPPERCASE).\n\ - - Proximity: `NEAR(meeting work, 5)` — both terms within 5 tokens of each other.\n\ - - Combine: `(reception OR ceremony) AND tahoe*` — group with parens.\n\ - Unquoted multi-word queries are treated as implicit AND. Apostrophes / hyphens / colons are safe — they no longer downgrade to a slow LIKE scan. Use `mode: \"fts5\"` when you want the operators above to be authoritative; `hybrid` still respects them but may surface semantically-similar non-keyword hits alongside.\n\n\ - Examples:\n\ - - `{query: \"trader joe's\"}` — phrase across all time.\n\ - - `{query: \"dinner\", contact_id: 42, start_ts: 1700000000, end_ts: 1700604800}` — keyword within a contact and a week.\n\ - - `{query: \"vacation\", has_media: true}` — only matches that include photos / videos.\n\ - - `{query: \"wedding OR reception OR ceremony\", mode: \"fts5\"}` — any of several synonyms.\n\ - - `{query: \"restaur*\", mode: \"fts5\"}` — prefix expansion for varying word forms.\n\ - - `{query: \"NEAR(birthday cake, 5)\", mode: \"fts5\"}` — terms close together but in any order.", + Modes: `fts5` (keyword + phrase + prefix + AND/OR/NOT + NEAR proximity), \ + `semantic` (embedding similarity, requires generated embeddings), `hybrid` (RRF merge, recommended; \ + degrades to fts5 when embeddings absent). Optional filters: `start_ts` / `end_ts` (real-UTC unix \ + seconds), `contact` (contact name, case-insensitive), `contact_id` (numeric), `is_mms` \ + (true = MMS only, false = SMS only), `has_media` (true = messages with image/video/audio \ + attachments only). Prefer `contact` over `contact_id` — the name is resolved server-side. \ + If both are provided, `contact_id` takes precedence. \ + For pure date / contact browsing without keywords, prefer `get_sms_messages`. \ + \n\nFTS5 query syntax (works in fts5 + hybrid modes):\n\ + - Phrase: `\"trader joe's\"` — exact word sequence (use double quotes).\n\ + - Prefix: `restaur*` — matches restaurant, restaurants, restauranteur, ….\n\ + - Boolean: `dinner AND tahoe`, `wedding OR reception OR ceremony`, `vacation NOT work` (operators must be UPPERCASE).\n\ + - Proximity: `NEAR(meeting work, 5)` — both terms within 5 tokens of each other.\n\ + - Combine: `(reception OR ceremony) AND tahoe*` — group with parens.\n\ + Unquoted multi-word queries are treated as implicit AND. Apostrophes / hyphens / colons are safe — they no longer downgrade to a slow LIKE scan. Use `mode: \"fts5\"` when you want the operators above to be authoritative; `hybrid` still respects them but may surface semantically-similar non-keyword hits alongside.\n\n\ + Examples:\n\ + - `{query: \"trader joe's\"}` — phrase across all time.\n\ + - `{query: \"dinner\", contact: \"Mom\"}` — keyword scoped to Mom's messages.\n\ + - `{query: \"dinner\", contact_id: 42, start_ts: 1700000000, end_ts: 1700604800}` — keyword within a contact and a week.\n\ + - `{query: \"vacation\", has_media: true}` — only matches that include photos / videos.\n\ + - `{query: \"wedding OR reception OR ceremony\", mode: \"fts5\"}` — any of several synonyms.\n\ + - `{query: \"restaur*\", mode: \"fts5\"}` — prefix expansion for varying word forms.\n\ + - `{query: \"NEAR(birthday cake, 5)\", mode: \"fts5\"}` — terms close together but in any order.", serde_json::json!({ "type": "object", "required": ["query"], @@ -3055,6 +3113,7 @@ Return ONLY the summary, nothing else."#, "mode": { "type": "string", "enum": ["fts5", "semantic", "hybrid"], "description": "Search strategy. Default: hybrid." }, "limit": { "type": "integer", "description": "Max results (default 20, max 50)." }, "contact_id": { "type": "integer", "description": "Optional numeric contact id to scope the search." }, + "contact": { "type": "string", "description": "Optional contact name (case-insensitive). Resolved to contact_id server-side. Use this when you know the name but not the ID." }, "start_ts": { "type": "integer", "description": "Optional inclusive lower bound, real-UTC unix seconds." }, "end_ts": { "type": "integer", "description": "Optional inclusive upper bound, real-UTC unix seconds." }, "is_mms": { "type": "boolean", "description": "Optional: true to restrict to MMS, false to restrict to SMS." }, @@ -3526,7 +3585,8 @@ Return ONLY the summary, nothing else."#, - When you identify people / places / events / things, use store_entity + store_fact to grow the persistent memory.\n\ - Before store_entity, call recall_entities to check whether a similar name already exists; reuse the existing entity_id rather than creating a near-duplicate (e.g. \"Sara\" vs \"Sarah J.\"). The DAO will collapse obvious cosine matches, but choosing the existing id keeps facts and photo links consolidated.\n\ - Predicates should be relationship-shaped verbs that encode a queryable claim — `lives_in`, `works_at`, `attended`, `is_friend_of`, `is_parent_of`, `interested_in`, `married_to`, `owns`. DO NOT use vague speech-act predicates like `expressed`, `said`, `mentioned`, `stated`, `quoted`, `noted`, `discussed`, `thought`, `wondered`. DO NOT store quotations or sentence fragments as `object_value` — paraphrase into a structured claim. Bad: `(Cameron, expressed, \"I'm tempted to get a part-time job there\")`. Good: `(Cameron, considered_employment_at, <Place>)` or `(Cameron, interested_in, \"part-time work\")`.\n\ - - A tool returning no results is informative; continue with the others.", + - A tool returning no results is informative; continue with the others.\n\ + - When writing your final answer, start with \"Title: <short title>\" (max 8 words) on the first line, then a blank line, then the body.", ); let mut out = identity; @@ -3541,6 +3601,184 @@ Return ONLY the summary, nothing else."#, out } + /// Consolidate client construction for the agentic insight loop. + /// + /// Returns a [`ResolvedBackend`] containing the **chat** client (the model + /// that drives the agent loop), the **local** client (always the configured + /// local backend — Ollama or llama-swap — for utility calls like + /// describe_image, rerank, embeddings), the backend kind, and whether the + /// chat model receives images inline. + pub async fn resolve_backend( + &self, + kind: BackendKind, + overrides: &SamplingOverrides, + ) -> Result<ResolvedBackend> { + let local_via_llamacpp = crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); + let is_hybrid = kind == BackendKind::Hybrid; + + // ── chat client ──────────────────────────────────────────────── + let chat: Box<dyn LlmClient> = if is_hybrid { + // Hybrid: chat through OpenRouter. + let arc = self.openrouter.as_ref().ok_or_else(|| { + anyhow::anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured") + })?; + let mut c: OpenRouterClient = (**arc).clone(); + if let Some(ref m) = overrides.model { + c.primary_model = m.clone(); + } + if overrides.has_sampling() { + c.set_sampling_params( + overrides.temperature, + overrides.top_p, + overrides.top_k, + overrides.min_p, + ); + } + if let Some(ctx) = overrides.num_ctx { + c.set_num_ctx(Some(ctx)); + } + Box::new(c) + } else if local_via_llamacpp { + // Local via llama-swap. + let arc = self.llamacpp.as_ref().ok_or_else(|| { + anyhow::anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured") + })?; + let mut c: LlamaCppClient = (**arc).clone(); + if let Some(ref m) = overrides.model { + c.primary_model = m.clone(); + } + if overrides.has_sampling() { + c.set_sampling_params( + overrides.temperature, + overrides.top_p, + overrides.top_k, + overrides.min_p, + ); + } + if let Some(ctx) = overrides.num_ctx { + c.set_num_ctx(Some(ctx)); + } + Box::new(c) + } else { + // Pure Ollama local. + let mut ollama_client = if let Some(ref model) = overrides.model { + OllamaClient::new( + self.ollama.primary_url.clone(), + self.ollama.fallback_url.clone(), + model.clone(), + Some(model.clone()), + ) + } else { + self.ollama.clone() + }; + if overrides.has_sampling() { + ollama_client.set_sampling_params( + overrides.temperature, + overrides.top_p, + overrides.top_k, + overrides.min_p, + ); + } + if let Some(ctx) = overrides.num_ctx { + ollama_client.set_num_ctx(Some(ctx)); + } + Box::new(ollama_client) + }; + + // ── local client (utility calls: rerank, describe_image, etc.) ─ + // For llamacpp in local mode: mirror the chat model selection so + // rerank / describe_image hit the same model that's already + // loaded — avoids a mid-turn model swap in llama-swap exclusive + // mode. In hybrid mode the override is an OpenRouter model id + // (e.g. "google/gemini-3-flash-preview") which llama-swap can't + // serve — keep the configured local slots. + let local: Box<dyn LlmClient> = if local_via_llamacpp { + let mut lc = self.llamacpp.as_ref().unwrap().as_ref().clone(); + if !is_hybrid && let Some(ref m) = overrides.model { + lc.primary_model = m.clone(); + lc.set_vision_model(m.clone()); + } + Box::new(lc) + } else { + Box::new(self.ollama.clone()) + }; + + // ── images_inline ────────────────────────────────────────────── + let images_inline = if is_hybrid { + // Hybrid: chat model never sees images — describe-then-inject. + false + } else if local_via_llamacpp { + // llama-swap models receive images directly via OpenAI content + // parts. Capability probing isn't available (no `/api/show`), + // so assume vision support; a misconfigured model surfaces as + // a chat-call error. + true + } else { + // Pure Ollama: probe model capabilities. + let ollama_for_caps = if let Some(ref model) = overrides.model { + // Verify custom model is available on at least one server. + let available_on_primary = + OllamaClient::is_model_available(&self.ollama.primary_url, model) + .await + .unwrap_or(false); + + let available_on_fallback = if let Some(ref fallback_url) = self.ollama.fallback_url + { + OllamaClient::is_model_available(fallback_url, model) + .await + .unwrap_or(false) + } else { + false + }; + + if !available_on_primary && !available_on_fallback { + anyhow::bail!( + "model not available: '{}' not found on any configured server", + model + ); + } + model.as_str() + } else { + self.ollama.primary_model.as_str() + }; + + let capabilities = match OllamaClient::check_model_capabilities( + &self.ollama.primary_url, + ollama_for_caps, + ) + .await + { + Ok(caps) => caps, + Err(_) => { + let fallback_url = + self.ollama.fallback_url.as_deref().ok_or_else(|| { + anyhow::anyhow!( + "Failed to check model capabilities for '{}': model not found on primary server and no fallback configured", + ollama_for_caps + ) + })?; + OllamaClient::check_model_capabilities(fallback_url, ollama_for_caps) + .await + .map_err(|e| { + anyhow::anyhow!( + "Failed to check model capabilities for '{}': {}", + ollama_for_caps, + e + ) + })? + } + }; + + if !capabilities.has_tool_calling { + anyhow::bail!("tool calling not supported by model '{}'", ollama_for_caps); + } + + capabilities.has_vision + }; + + Ok(ResolvedBackend::new(chat, local, kind, images_inline)) + } + pub async fn generate_agentic_insight_for_photo( &self, file_path: &str, @@ -3568,192 +3806,23 @@ Return ONLY the summary, nothing else."#, span.set_attribute(KeyValue::new("file_path", file_path.clone())); span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64)); - // 1a. Resolve backend label (defaults to "local"). - let backend_label = backend - .as_deref() - .map(|s| s.trim().to_lowercase()) - .filter(|s| !s.is_empty()) - .unwrap_or_else(|| "local".to_string()); - if !matches!(backend_label.as_str(), "local" | "hybrid") { - return Err(anyhow::anyhow!( - "unknown backend '{}'; expected 'local' or 'hybrid'", - backend_label - )); - } - span.set_attribute(KeyValue::new("backend", backend_label.clone())); - let is_hybrid = backend_label == "hybrid"; - - // 1b. Always build an Ollama client. In local mode it owns the chat - // loop; in hybrid mode it still handles describe_image + any - // tool-local calls (e.g. if a future tool needs embeddings). - // Sampling overrides only apply in local mode — in hybrid the - // user's params belong to the OpenRouter chat client. - let apply_sampling_to_ollama = !is_hybrid; - let mut ollama_client = if let Some(ref model) = custom_model - && !is_hybrid - { - log::info!("Using custom model for agentic: {}", model); - span.set_attribute(KeyValue::new("custom_model", model.clone())); - OllamaClient::new( - self.ollama.primary_url.clone(), - self.ollama.fallback_url.clone(), - model.clone(), - Some(model.clone()), - ) - } else { - if !is_hybrid { - span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone())); - } - self.ollama.clone() - }; - - if apply_sampling_to_ollama { - if let Some(ctx) = num_ctx { - log::info!("Using custom context size: {}", ctx); - span.set_attribute(KeyValue::new("num_ctx", ctx as i64)); - ollama_client.set_num_ctx(Some(ctx)); - } - - if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() { - log::info!( - "Using sampling params — temperature: {:?}, top_p: {:?}, top_k: {:?}, min_p: {:?}", - temperature, - top_p, - top_k, - min_p - ); - if let Some(t) = temperature { - span.set_attribute(KeyValue::new("temperature", t as f64)); - } - if let Some(p) = top_p { - span.set_attribute(KeyValue::new("top_p", p as f64)); - } - if let Some(k) = top_k { - span.set_attribute(KeyValue::new("top_k", k as i64)); - } - if let Some(m) = min_p { - span.set_attribute(KeyValue::new("min_p", m as f64)); - } - ollama_client.set_sampling_params(temperature, top_p, top_k, min_p); - } - } - - // 1c. In hybrid mode, clone the configured OpenRouter client and - // apply per-request overrides. - let openrouter_client: Option<OpenRouterClient> = if is_hybrid { - let arc = self.openrouter.as_ref().ok_or_else(|| { - anyhow::anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured") - })?; - let mut c: OpenRouterClient = (**arc).clone(); - if let Some(ref m) = custom_model { - c.primary_model = m.clone(); - span.set_attribute(KeyValue::new("custom_model", m.clone())); - } - span.set_attribute(KeyValue::new("openrouter_model", c.primary_model.clone())); - if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() { - if let Some(t) = temperature { - span.set_attribute(KeyValue::new("temperature", t as f64)); - } - if let Some(p) = top_p { - span.set_attribute(KeyValue::new("top_p", p as f64)); - } - if let Some(k) = top_k { - span.set_attribute(KeyValue::new("top_k", k as i64)); - } - if let Some(m) = min_p { - span.set_attribute(KeyValue::new("min_p", m as f64)); - } - c.set_sampling_params(temperature, top_p, top_k, min_p); - } - if let Some(ctx) = num_ctx { - span.set_attribute(KeyValue::new("num_ctx", ctx as i64)); - c.set_num_ctx(Some(ctx)); - } - Some(c) - } else { - None + // 1. Resolve backend + build clients. + let kind = BackendKind::parse(backend.as_deref().unwrap_or("local"))?; + span.set_attribute(KeyValue::new("backend", kind.as_str())); + let overrides = SamplingOverrides { + model: custom_model, + num_ctx, + temperature, + top_p, + top_k, + min_p, }; + let backend = self.resolve_backend(kind, &overrides).await?; + span.set_attribute(KeyValue::new("model", backend.model().to_string())); + span.set_attribute(KeyValue::new("images_inline", backend.images_inline)); let insight_cx = current_cx.with_span(span); - // 2. Verify chat model supports tool calling. - // - local: existing Ollama model availability + capability check. - // - hybrid: trust the operator's curated allowlist - // (OPENROUTER_ALLOWED_MODELS) — no live precheck. A bad model id - // surfaces as a chat-call error on the next step. - let has_vision = if is_hybrid { - // In hybrid mode the chat model never sees images directly — we - // describe-then-inject, so `has_vision` drives only whether we - // bother loading the image to describe it, which we always do. - true - } else { - if let Some(ref model_name) = custom_model { - let available_on_primary = - OllamaClient::is_model_available(&ollama_client.primary_url, model_name) - .await - .unwrap_or(false); - - let available_on_fallback = - if let Some(ref fallback_url) = ollama_client.fallback_url { - OllamaClient::is_model_available(fallback_url, model_name) - .await - .unwrap_or(false) - } else { - false - }; - - if !available_on_primary && !available_on_fallback { - anyhow::bail!( - "model not available: '{}' not found on any configured server", - model_name - ); - } - } - - let model_name_for_caps = &ollama_client.primary_model; - let capabilities = match OllamaClient::check_model_capabilities( - &ollama_client.primary_url, - model_name_for_caps, - ) - .await - { - Ok(caps) => caps, - Err(_) => { - let fallback_url = ollama_client.fallback_url.as_deref().ok_or_else(|| { - anyhow::anyhow!( - "Failed to check model capabilities for '{}': model not found on primary server and no fallback configured", - model_name_for_caps - ) - })?; - OllamaClient::check_model_capabilities(fallback_url, model_name_for_caps) - .await - .map_err(|e| { - anyhow::anyhow!( - "Failed to check model capabilities for '{}': {}", - model_name_for_caps, - e - ) - })? - } - }; - - if !capabilities.has_tool_calling { - return Err(anyhow::anyhow!( - "tool calling not supported by model '{}'", - ollama_client.primary_model - )); - } - - insight_cx - .span() - .set_attribute(KeyValue::new("model_has_vision", capabilities.has_vision)); - insight_cx - .span() - .set_attribute(KeyValue::new("model_has_tool_calling", true)); - - capabilities.has_vision - }; - // 3. Fetch EXIF let exif = { let mut exif_dao = self.exif_dao.lock().expect("Unable to lock ExifDao"); @@ -3845,38 +3914,32 @@ Return ONLY the summary, nothing else."#, } }; - // 7. Load image if vision capable. - // In hybrid mode we ALSO describe it locally now so the - // description can be inlined as text — the OpenRouter chat model - // never receives the base64 image directly. - let image_base64 = if has_vision { - match self.load_image_as_base64(&file_path) { - Ok(b64) => { - log::info!("Loaded image for vision-capable agentic model"); - Some(b64) - } - Err(e) => { - log::warn!("Failed to load image for agentic vision: {}", e); - None - } + // 7. Load image. Always attempted — vision-capable models get the + // base64 inline; hybrid mode describes it locally and injects text. + let image_base64 = match self.load_image_as_base64(&file_path) { + Ok(b64) => { + log::info!("Loaded image for agentic model"); + Some(b64) + } + Err(e) => { + log::warn!("Failed to load image for agentic: {}", e); + None } - } else { - None }; - let hybrid_visual_description: Option<String> = if is_hybrid { + // Describe-then-inline (hybrid only). Vision describe routes through + // the local backend so non-text work stays off OpenRouter. + let inlined_visual_description: Option<String> = if !backend.images_inline { match image_base64.as_deref() { - Some(b64) => match self.ollama.describe_image(b64).await { + Some(b64) => match backend.local().describe_image(b64).await { Ok(desc) => { - log::info!( - "Hybrid: local vision describe succeeded ({} chars)", - desc.len() - ); + log::info!("{}: vision describe succeeded ({} chars)", kind, desc.len()); Some(desc) } Err(e) => { log::warn!( - "Hybrid: local vision describe failed, continuing without: {}", + "{}: vision describe failed, continuing without: {}", + kind, e ); None @@ -3934,7 +3997,7 @@ Return ONLY the summary, nothing else."#, .map(|c| format!("Contact/Person: {}", c)) .unwrap_or_else(|| "Contact/Person: unknown".to_string()); - let visual_block = hybrid_visual_description + let visual_block = inlined_visual_description .as_deref() .map(|d| format!("Visual description (from local vision model):\n{}\n\n", d)) .unwrap_or_default(); @@ -3953,31 +4016,24 @@ Return ONLY the summary, nothing else."#, date = date_taken.format("%B %d, %Y"), ); - // 10. Define tools. Gate flags computed from current data presence; - // hybrid mode omits describe_photo since the chat model receives - // the visual description inline (so we pass `false` for has_vision - // in hybrid mode regardless of the model's actual capability). - let gate_opts = self.current_gate_opts(has_vision && !is_hybrid); + // 10. Define tools. describe_photo offered only when the chat model + // sees images directly (images_inline); in hybrid mode the visual + // description is already inlined as text. + let gate_opts = self.current_gate_opts(backend.images_inline); let tools = Self::build_tool_definitions(gate_opts); - // 11. Build initial messages. In hybrid mode images are never - // attached to the wire message — the description is part of - // `user_content`. + // 11. Build initial messages. images_inline → attach base64 to the + // user message; describe-then-inline → text was already injected. let system_msg = ChatMessage::system(system_content); let mut user_msg = ChatMessage::user(user_content); - if !is_hybrid && let Some(ref img) = image_base64 { - user_msg.images = Some(vec![img.clone()]); + if backend.images_inline { + if let Some(ref img) = image_base64 { + user_msg.images = Some(vec![img.clone()]); + } } let mut messages = vec![system_msg, user_msg]; - // 12. Agentic loop — dispatch through the selected backend. - let chat_backend: &dyn LlmClient = if let Some(ref or_c) = openrouter_client { - or_c - } else { - &ollama_client - }; - let loop_span = tracer.start_with_context("ai.agentic.loop", &insight_cx); let loop_cx = insight_cx.with_span(loop_span); @@ -3990,7 +4046,8 @@ Return ONLY the summary, nothing else."#, iterations_used = iteration + 1; log::info!("Agentic iteration {}/{}", iteration + 1, max_iterations); - let (response, prompt_tokens, eval_tokens) = chat_backend + let (response, prompt_tokens, eval_tokens) = backend + .chat() .chat_with_tools(messages.clone(), tools.clone()) .await?; @@ -4030,13 +4087,11 @@ Return ONLY the summary, nothing else."#, .execute_tool( &tool_call.function.name, &tool_call.function.arguments, - &ollama_client, + &backend, &image_base64, &file_path, user_id, &persona_id, - chat_backend.primary_model(), - &backend_label, &loop_cx, ) .await; @@ -4057,10 +4112,11 @@ Return ONLY the summary, nothing else."#, iterations_used ); messages.push(ChatMessage::user(format!( - "Based on the context gathered, please write the final photo insight: a title and a detailed personal summary. Write in first person as {}.", + "Based on the context gathered, please write the final photo insight. Start with \"Title: <short title>\" on the first line (max 8 words), then a blank line, then the detailed personal summary. Write in first person as {}.", user_display_name() ))); - let (final_response, prompt_tokens, eval_tokens) = chat_backend + let (final_response, prompt_tokens, eval_tokens) = backend + .chat() .chat_with_tools(messages.clone(), vec![]) .await?; last_prompt_eval_count = prompt_tokens; @@ -4074,20 +4130,11 @@ Return ONLY the summary, nothing else."#, .set_attribute(KeyValue::new("iterations_used", iterations_used as i64)); loop_cx.span().set_status(Status::Ok); - // 13. Generate title via the same backend so voice stays consistent. - let title_prompt = format!( - "Create a short title (maximum 8 words) for the following journal entry:\n\n{}\n\nCapture the key moment or theme. Return ONLY the title, nothing else.", - final_content - ); - let title_system = custom_system_prompt.as_deref().unwrap_or( - "You are my long term memory assistant. Use only the information provided. Do not invent details.", - ); - let title_raw = chat_backend - .generate(&title_prompt, Some(title_system), None) - .await?; - let title = title_raw.trim().trim_matches('"').to_string(); + // 13. Parse title from the model's inline response. + let (title, body) = parse_title_body(&final_content); + final_content = body; - log::info!("Agentic generated title: {}", title); + log::info!("Agentic parsed title: {}", title); let summary_preview: String = final_content.chars().take(200).collect(); log::info!( "Agentic generated summary ({} chars): {}", @@ -4105,7 +4152,7 @@ Return ONLY the summary, nothing else."#, }; // 15. Store insight (returns the persisted row including its new id) - let model_version = chat_backend.primary_model().to_string(); + let model_version = backend.model().to_string(); let fewshot_source_ids_json = if fewshot_source_ids.is_empty() { None } else { @@ -4120,7 +4167,7 @@ Return ONLY the summary, nothing else."#, model_version, is_current: true, training_messages, - backend: backend_label.clone(), + backend: kind.as_str().to_string(), fewshot_source_ids: fewshot_source_ids_json, content_hash: None, }; @@ -4738,4 +4785,78 @@ mod tests { assert!(out.contains("-> empty (pivoted)")); assert!(out.contains("Final insight: Final title")); } + + #[test] + fn parse_title_body_standard_format() { + let (t, b) = parse_title_body("Title: Summer at the Lake\n\nWe spent the afternoon..."); + assert_eq!(t, "Summer at the Lake"); + assert_eq!(b, "We spent the afternoon..."); + } + + #[test] + fn parse_title_body_single_newline() { + let (t, b) = parse_title_body("Title: Morning Walk\nThe sun was rising..."); + assert_eq!(t, "Morning Walk"); + assert_eq!(b, "The sun was rising..."); + } + + #[test] + fn parse_title_body_lowercase_prefix() { + let (t, b) = parse_title_body("title: Garden Party\n\nEveryone gathered..."); + assert_eq!(t, "Garden Party"); + assert_eq!(b, "Everyone gathered..."); + } + + #[test] + fn parse_title_body_fallback_first_sentence() { + let (t, b) = parse_title_body("A warm summer day. We gathered at the park for a picnic."); + assert_eq!(t, "A warm summer day"); + assert_eq!(b, "We gathered at the park for a picnic."); + } + + #[test] + fn parse_title_body_fallback_truncate() { + let input = "A single long paragraph with no periods or title prefix that just keeps going on and on"; + let (t, b) = parse_title_body(input); + assert!(t.len() <= 60); + assert_eq!(b, input); + } + + /// Regression: hybrid mode was leaking the OpenRouter model override + /// into the local llamacpp client, causing describe_image to send + /// e.g. "google/gemini-3-flash-preview" to llama-swap (which 400s). + #[test] + fn resolve_backend_hybrid_does_not_leak_model_to_local_llamacpp() { + use crate::ai::llamacpp::LlamaCppClient; + + let mut base = + LlamaCppClient::new(Some("http://localhost:9292/v1".into()), Some("chat".into())); + base.set_vision_model("vision".into()); + base.set_embedding_model("embed".into()); + + let openrouter_model = "google/gemini-3-flash-preview"; + let overrides_model: Option<String> = Some(openrouter_model.into()); + let is_hybrid = true; + + // Replicate the resolve_backend local-client construction + // (lines ~3686-3695 of this file). + let mut lc = base.clone(); + if let Some(ref m) = overrides_model { + if !is_hybrid { + lc.primary_model = m.clone(); + lc.set_vision_model(m.clone()); + } + } + + // In hybrid mode the local client must keep its configured slots. + assert_eq!( + lc.vision_model, "vision", + "hybrid mode must not override vision_model with OpenRouter model" + ); + assert_eq!( + lc.primary_model, "chat", + "hybrid mode must not override primary_model with OpenRouter model" + ); + assert_eq!(lc.embedding_model, "embed"); + } } diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs new file mode 100644 index 0000000..e2ba00d --- /dev/null +++ b/src/ai/llamacpp.rs @@ -0,0 +1,1143 @@ +// LlamaCppClient — talks to a llama-swap proxy that fronts one or more +// llama-server processes. llama-swap exposes an OpenAI-compatible HTTP +// surface (`/v1/chat/completions`, `/v1/embeddings`, `/v1/models`), so the +// wire translation mirrors `OpenRouterClient` almost exactly. +// +// Differences from OpenRouter: +// - No bearer auth or attribution headers; llama-swap is LAN-only. +// - Three model slots (`primary_model` = chat, `vision_model`, `embedding_model`) +// each map to a model id in the llama-swap config. `describe_image` and +// `generate_embeddings` issue requests with the appropriate slot id in the +// `model` field, which is how llama-swap selects which backend process to +// run. +// - `/v1/models` returns only the configured slot ids — capabilities aren't +// reported by the API, so we infer `has_vision` from a single config field +// (`vision_model`, defaulting to `"vision"`) and assume `has_tool_calling` +// is true for every slot, since llama-swap entries default to launching +// llama-server with `--jinja`. +// +// First consumer lands alongside the three-way backend dispatch in +// insight_generator / insight_chat. +#![allow(dead_code)] + +use anyhow::{Context, Result, anyhow, bail}; +use async_trait::async_trait; +use reqwest::Client; +use serde::Deserialize; +use serde_json::{Value, json}; +use std::time::Duration; + +use crate::ai::llm_client::{ + ChatMessage, LlmClient, LlmStreamEvent, ModelCapabilities, Tool, ToolCall, ToolCallFunction, +}; +use futures::stream::{BoxStream, StreamExt}; + +const DEFAULT_BASE_URL: &str = "http://localhost:9292/v1"; +const DEFAULT_PRIMARY_MODEL: &str = "chat"; +const DEFAULT_VISION_MODEL: &str = "vision"; +const DEFAULT_EMBEDDING_MODEL: &str = "embed"; +const DEFAULT_REQUEST_TIMEOUT_SECS: u64 = 180; + +/// OpenAI-compatible client targeting a llama-swap proxy in front of one or +/// more llama-server processes. See the module doc-comment for the slot model. +#[derive(Clone)] +pub struct LlamaCppClient { + client: Client, + pub base_url: String, + /// Chat model slot id (e.g. `"chat"`). Used for `generate` / + /// `chat_with_tools` / `chat_with_tools_stream`. + pub primary_model: String, + /// Embedding model slot id (e.g. `"embed"`). Used for + /// `generate_embeddings`. + pub embedding_model: String, + /// Vision model slot id. Used for `describe_image` routing. Defaults + /// to `primary_model` so describe_image works out of the box; override + /// via `LLAMA_SWAP_VISION_MODEL` for a dedicated vision slot. + pub vision_model: String, + num_ctx: Option<i32>, + temperature: Option<f32>, + top_p: Option<f32>, + top_k: Option<i32>, + min_p: Option<f32>, +} + +impl LlamaCppClient { + pub fn new(base_url: Option<String>, primary_model: Option<String>) -> Self { + let timeout_secs = std::env::var("LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS") + .ok() + .and_then(|v| v.parse::<u64>().ok()) + .unwrap_or(DEFAULT_REQUEST_TIMEOUT_SECS); + let pm = primary_model.unwrap_or_else(|| DEFAULT_PRIMARY_MODEL.to_string()); + Self { + client: Client::builder() + .connect_timeout(Duration::from_secs(10)) + .timeout(Duration::from_secs(timeout_secs)) + .build() + .unwrap_or_else(|_| Client::new()), + base_url: base_url.unwrap_or_else(|| DEFAULT_BASE_URL.to_string()), + primary_model: pm.clone(), + embedding_model: DEFAULT_EMBEDDING_MODEL.to_string(), + vision_model: pm, + num_ctx: None, + temperature: None, + top_p: None, + top_k: None, + min_p: None, + } + } + + pub fn set_embedding_model(&mut self, model: String) { + self.embedding_model = model; + } + + pub fn set_vision_model(&mut self, model: String) { + self.vision_model = model; + } + + pub fn set_num_ctx(&mut self, num_ctx: Option<i32>) { + self.num_ctx = num_ctx; + } + + pub fn set_sampling_params( + &mut self, + temperature: Option<f32>, + top_p: Option<f32>, + top_k: Option<i32>, + min_p: Option<f32>, + ) { + self.temperature = temperature; + self.top_p = top_p; + self.top_k = top_k; + self.min_p = min_p; + } + + /// Translate canonical messages to the OpenAI-compatible wire shape. + /// Behaviorally identical to `OpenRouterClient::messages_to_openai` — + /// stringify tool-call arguments, rewrite images into content-parts, attach + /// `tool_call_id` to `role=tool` messages based on the preceding assistant + /// turn's tool calls. + fn messages_to_openai(messages: &[ChatMessage]) -> Vec<Value> { + let mut out = Vec::with_capacity(messages.len()); + let mut last_tool_call_ids: Vec<String> = Vec::new(); + let mut next_tool_result_idx: usize = 0; + + for msg in messages { + let mut obj = serde_json::Map::new(); + obj.insert("role".into(), Value::String(msg.role.clone())); + + // Assistant messages with tool_calls must emit `content: null` + // (not `""`) — some Jinja templates (Mistral-family) treat + // empty-string content as a regular message rather than a + // tool-calling turn, breaking role-alternation validation. + let has_tool_calls = msg.role == "assistant" + && msg.tool_calls.as_ref().is_some_and(|tcs| !tcs.is_empty()); + + match &msg.images { + Some(images) if !images.is_empty() => { + let mut parts: Vec<Value> = Vec::new(); + if !msg.content.is_empty() { + parts.push(json!({"type": "text", "text": msg.content})); + } + for img in images { + let url = image_to_data_url(img); + parts.push(json!({ + "type": "image_url", + "image_url": { "url": url } + })); + } + obj.insert("content".into(), Value::Array(parts)); + } + _ if has_tool_calls && msg.content.is_empty() => { + obj.insert("content".into(), Value::Null); + } + _ => { + obj.insert("content".into(), Value::String(msg.content.clone())); + } + } + + if let Some(tcs) = &msg.tool_calls + && msg.role == "assistant" + { + let converted: Vec<Value> = tcs + .iter() + .enumerate() + .map(|(i, call)| { + let id = call.id.clone().unwrap_or_else(|| format!("call_{}", i)); + let args_str = serde_json::to_string(&call.function.arguments) + .unwrap_or_else(|_| "{}".to_string()); + json!({ + "id": id, + "type": "function", + "function": { + "name": call.function.name, + "arguments": args_str, + } + }) + }) + .collect(); + last_tool_call_ids = converted + .iter() + .filter_map(|v| v.get("id").and_then(|x| x.as_str()).map(String::from)) + .collect(); + next_tool_result_idx = 0; + obj.insert("tool_calls".into(), Value::Array(converted)); + } + + if msg.role == "tool" { + let id = last_tool_call_ids + .get(next_tool_result_idx) + .cloned() + .unwrap_or_else(|| "call_0".to_string()); + obj.insert("tool_call_id".into(), Value::String(id)); + next_tool_result_idx += 1; + } + + out.push(Value::Object(obj)); + } + + out + } + + /// Parse an OpenAI-compatible assistant message back into canonical shape. + /// llama.cpp emits `reasoning_content` on thinking models; we drop it for + /// parity with OpenRouter (which also strips upstream reasoning fields). + fn openai_message_to_chat(msg: &Value) -> Result<ChatMessage> { + let obj = msg + .as_object() + .ok_or_else(|| anyhow!("response message is not an object"))?; + let role = obj + .get("role") + .and_then(|v| v.as_str()) + .unwrap_or("assistant") + .to_string(); + let content = obj + .get("content") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + + let tool_calls = if let Some(tcs) = obj.get("tool_calls").and_then(|v| v.as_array()) { + let mut parsed = Vec::with_capacity(tcs.len()); + for tc in tcs { + let id = tc.get("id").and_then(|v| v.as_str()).map(String::from); + let function = tc + .get("function") + .ok_or_else(|| anyhow!("tool_call missing function field"))?; + let name = function + .get("name") + .and_then(|v| v.as_str()) + .unwrap_or_default() + .to_string(); + let args_value = match function.get("arguments") { + Some(Value::String(s)) => { + serde_json::from_str::<Value>(s).unwrap_or_else(|_| json!({})) + } + Some(v @ Value::Object(_)) => v.clone(), + _ => json!({}), + }; + parsed.push(ToolCall { + id, + function: ToolCallFunction { + name, + arguments: args_value, + }, + }); + } + Some(parsed) + } else { + None + }; + + Ok(ChatMessage { + role, + content, + tool_calls, + images: None, + }) + } + + fn build_options(&self) -> Vec<(&'static str, Value)> { + let mut v = Vec::new(); + if let Some(t) = self.temperature { + v.push(("temperature", json!(t))); + } + if let Some(p) = self.top_p { + v.push(("top_p", json!(p))); + } + if let Some(k) = self.top_k { + v.push(("top_k", json!(k))); + } + if let Some(m) = self.min_p { + v.push(("min_p", json!(m))); + } + // num_ctx isn't an OpenAI param; llama-server bakes ctx in at launch + // via -c, so we silently drop the override here. The config.yaml + // entry is the source of truth for context size. + let _ = self.num_ctx; + v + } + + /// Issue a chat request with an explicit model id override. Used by + /// `describe_image` to route through the vision slot without mutating + /// `self.primary_model`. + async fn chat_completion_with_model( + &self, + model: &str, + messages: Vec<ChatMessage>, + tools: Vec<Tool>, + ) -> Result<(ChatMessage, Option<i32>, Option<i32>)> { + let url = format!("{}/chat/completions", self.base_url); + let roles: Vec<&str> = messages.iter().map(|m| m.role.as_str()).collect(); + log::debug!( + "llama-swap chat_completion: model={} roles={:?} tools={}", + model, + roles, + tools.len() + ); + let mut body = serde_json::Map::new(); + body.insert("model".into(), Value::String(model.to_string())); + body.insert( + "messages".into(), + Value::Array(Self::messages_to_openai(&messages)), + ); + body.insert("stream".into(), Value::Bool(false)); + if !tools.is_empty() { + body.insert( + "tools".into(), + serde_json::to_value(&tools).context("serializing tools")?, + ); + } + for (k, v) in self.build_options() { + body.insert(k.into(), v); + } + + let resp = self + .client + .post(&url) + .json(&Value::Object(body)) + .send() + .await + .with_context(|| format!("POST {} failed", url))?; + + if !resp.status().is_success() { + let status = resp.status(); + let body = resp.text().await.unwrap_or_default(); + bail!("llama-swap chat request failed: {} — {}", status, body); + } + + let parsed: Value = resp.json().await.context("parsing chat response")?; + let choice = parsed + .get("choices") + .and_then(|v| v.as_array()) + .and_then(|a| a.first()) + .ok_or_else(|| { + anyhow!( + "response missing choices[0]: {}", + extract_error_detail(&parsed) + ) + })?; + let msg = choice.get("message").ok_or_else(|| { + anyhow!( + "choices[0] missing message: {}", + extract_error_detail(&parsed) + ) + })?; + let chat_msg = Self::openai_message_to_chat(msg)?; + + let usage = parsed.get("usage"); + let prompt_tokens = usage + .and_then(|u| u.get("prompt_tokens")) + .and_then(|v| v.as_i64()) + .map(|n| n as i32); + let completion_tokens = usage + .and_then(|u| u.get("completion_tokens")) + .and_then(|v| v.as_i64()) + .map(|n| n as i32); + + log_timings(&parsed, prompt_tokens, completion_tokens); + + Ok((chat_msg, prompt_tokens, completion_tokens)) + } +} + +#[async_trait] +impl LlmClient for LlamaCppClient { + async fn generate( + &self, + prompt: &str, + system: Option<&str>, + images: Option<Vec<String>>, + ) -> Result<String> { + let mut messages: Vec<ChatMessage> = Vec::new(); + if let Some(sys) = system { + messages.push(ChatMessage::system(sys)); + } + let mut user = ChatMessage::user(prompt); + user.images = images; + messages.push(user); + + let (reply, _, _) = self.chat_with_tools(messages, Vec::new()).await?; + Ok(reply.content) + } + + async fn chat_with_tools( + &self, + messages: Vec<ChatMessage>, + tools: Vec<Tool>, + ) -> Result<(ChatMessage, Option<i32>, Option<i32>)> { + log::info!( + "llama-swap chat_with_tools: model={} messages={} tools={}", + self.primary_model, + messages.len(), + tools.len() + ); + self.chat_completion_with_model(&self.primary_model.clone(), messages, tools) + .await + } + + async fn chat_with_tools_stream( + &self, + messages: Vec<ChatMessage>, + tools: Vec<Tool>, + ) -> Result<BoxStream<'static, Result<LlmStreamEvent>>> { + let url = format!("{}/chat/completions", self.base_url); + let roles: Vec<&str> = messages.iter().map(|m| m.role.as_str()).collect(); + log::debug!( + "llama-swap stream: model={} roles={:?} tools={}", + self.primary_model, + roles, + tools.len() + ); + let mut body = serde_json::Map::new(); + body.insert("model".into(), Value::String(self.primary_model.clone())); + body.insert( + "messages".into(), + Value::Array(Self::messages_to_openai(&messages)), + ); + body.insert("stream".into(), Value::Bool(true)); + body.insert( + "stream_options".into(), + serde_json::json!({ "include_usage": true }), + ); + if !tools.is_empty() { + body.insert( + "tools".into(), + serde_json::to_value(&tools).context("serializing tools")?, + ); + } + for (k, v) in self.build_options() { + body.insert(k.into(), v); + } + + let resp = self + .client + .post(&url) + .json(&Value::Object(body)) + .send() + .await + .with_context(|| format!("POST {} failed", url))?; + + if !resp.status().is_success() { + let status = resp.status(); + let body = resp.text().await.unwrap_or_default(); + bail!("llama-swap stream request failed: {} — {}", status, body); + } + + let byte_stream = resp.bytes_stream(); + let stream = async_stream::stream! { + let mut byte_stream = byte_stream; + let mut buf: Vec<u8> = Vec::new(); + let mut accumulated_content = String::new(); + let mut tool_state: std::collections::BTreeMap< + usize, + (Option<String>, Option<String>, String), + > = std::collections::BTreeMap::new(); + let mut role = "assistant".to_string(); + let mut prompt_tokens: Option<i32> = None; + let mut completion_tokens: Option<i32> = None; + let mut last_frame: Option<Value> = None; + let mut done_seen = false; + + while let Some(chunk) = byte_stream.next().await { + let chunk = match chunk { + Ok(b) => b, + Err(e) => { + yield Err(anyhow!("stream read failed: {}", e)); + return; + } + }; + buf.extend_from_slice(&chunk); + + while let Some(sep) = find_double_newline(&buf) { + let frame = buf.drain(..sep + 2).collect::<Vec<_>>(); + let frame_str = match std::str::from_utf8(&frame) { + Ok(s) => s, + Err(_) => continue, + }; + for line in frame_str.lines() { + let line = line.trim_end_matches('\r'); + let payload = match line.strip_prefix("data: ") { + Some(p) => p, + None => continue, + }; + if payload == "[DONE]" { + done_seen = true; + break; + } + let v: Value = match serde_json::from_str(payload) { + Ok(v) => v, + Err(e) => { + log::warn!( + "malformed llama-swap SSE frame: {} ({})", + payload, + e + ); + continue; + } + }; + + if let Some(usage) = v.get("usage") { + prompt_tokens = usage + .get("prompt_tokens") + .and_then(|n| n.as_i64()) + .map(|n| n as i32); + completion_tokens = usage + .get("completion_tokens") + .and_then(|n| n.as_i64()) + .map(|n| n as i32); + last_frame = Some(v.clone()); + } + + let Some(choices) = v.get("choices").and_then(|c| c.as_array()) + else { + continue; + }; + let Some(choice) = choices.first() else { continue }; + let delta = match choice.get("delta") { + Some(d) => d, + None => continue, + }; + if let Some(r) = delta.get("role").and_then(|v| v.as_str()) { + role = r.to_string(); + } + if let Some(content) = + delta.get("content").and_then(|v| v.as_str()) + && !content.is_empty() + { + accumulated_content.push_str(content); + yield Ok(LlmStreamEvent::TextDelta(content.to_string())); + } + if let Some(tcs) = delta.get("tool_calls").and_then(|v| v.as_array()) { + for tc_delta in tcs { + let idx = tc_delta + .get("index") + .and_then(|n| n.as_u64()) + .unwrap_or(0) as usize; + let entry = tool_state + .entry(idx) + .or_insert((None, None, String::new())); + if let Some(id) = + tc_delta.get("id").and_then(|v| v.as_str()) + { + entry.0 = Some(id.to_string()); + } + if let Some(func) = tc_delta.get("function") { + if let Some(name) = + func.get("name").and_then(|v| v.as_str()) + { + entry.1 = Some(name.to_string()); + } + if let Some(args) = + func.get("arguments").and_then(|v| v.as_str()) + { + entry.2.push_str(args); + } + } + } + } + } + if done_seen { + break; + } + } + if done_seen { + break; + } + } + + let tool_calls: Option<Vec<ToolCall>> = if tool_state.is_empty() { + None + } else { + let mut v = Vec::with_capacity(tool_state.len()); + for (_idx, (id, name, args)) in tool_state { + let arguments: Value = if args.trim().is_empty() { + Value::Object(Default::default()) + } else { + serde_json::from_str(&args).unwrap_or_else(|_| { + Value::Object(Default::default()) + }) + }; + v.push(ToolCall { + id, + function: ToolCallFunction { + name: name.unwrap_or_default(), + arguments, + }, + }); + } + Some(v) + }; + + if let Some(ref frame) = last_frame { + log_timings(frame, prompt_tokens, completion_tokens); + } + + let message = ChatMessage { + role, + content: accumulated_content, + tool_calls, + images: None, + }; + yield Ok(LlmStreamEvent::Done { + message, + prompt_eval_count: prompt_tokens, + eval_count: completion_tokens, + }); + }; + + Ok(Box::pin(stream)) + } + + async fn generate_embeddings(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>> { + let url = format!("{}/embeddings", self.base_url); + let body = json!({ + "model": self.embedding_model, + "input": texts, + }); + + let resp = self + .client + .post(&url) + .json(&body) + .send() + .await + .with_context(|| format!("POST {} failed", url))?; + + if !resp.status().is_success() { + let status = resp.status(); + let body = resp.text().await.unwrap_or_default(); + bail!("llama-swap embedding request failed: {} — {}", status, body); + } + + #[derive(Deserialize)] + struct EmbedResponse { + data: Vec<EmbedItem>, + } + #[derive(Deserialize)] + struct EmbedItem { + embedding: Vec<f32>, + } + + let parsed: EmbedResponse = resp.json().await.context("parsing embed response")?; + Ok(parsed.data.into_iter().map(|i| i.embedding).collect()) + } + + async fn describe_image(&self, image_base64: &str) -> Result<String> { + let prompt = "Briefly describe what you see in this image in 1-2 sentences. \ + Focus on the people, location, and activity."; + let system = "You are a scene description assistant. Be concise and factual."; + + let messages = vec![ + ChatMessage::system(system), + ChatMessage { + role: "user".to_string(), + content: prompt.to_string(), + tool_calls: None, + images: Some(vec![image_base64.to_string()]), + }, + ]; + + let (reply, _, _) = self + .chat_completion_with_model(&self.vision_model.clone(), messages, Vec::new()) + .await?; + Ok(reply.content) + } + + async fn list_models(&self) -> Result<Vec<ModelCapabilities>> { + let url = format!("{}/models", self.base_url); + let resp = self + .client + .get(&url) + .send() + .await + .with_context(|| format!("GET {} failed", url))?; + + if !resp.status().is_success() { + let status = resp.status(); + let body = resp.text().await.unwrap_or_default(); + bail!("llama-swap list_models failed: {} — {}", status, body); + } + + let parsed: Value = resp.json().await.context("parsing models response")?; + let data = parsed + .get("data") + .and_then(|v| v.as_array()) + .ok_or_else(|| anyhow!("models response missing data[]"))?; + + let caps: Vec<ModelCapabilities> = data + .iter() + .map(|m| self.parse_model_capabilities(m)) + .collect(); + + Ok(caps) + } + + async fn model_capabilities(&self, model: &str) -> Result<ModelCapabilities> { + let all = self.list_models().await?; + all.into_iter() + .find(|m| m.name == model) + .ok_or_else(|| anyhow!("model '{}' not found on llama-swap", model)) + } + + fn primary_model(&self) -> &str { + &self.primary_model + } +} + +impl LlamaCppClient { + fn parse_model_capabilities(&self, m: &Value) -> ModelCapabilities { + let name = m + .get("id") + .and_then(|v| v.as_str()) + .unwrap_or_default() + .to_string(); + // llama-swap doesn't expose per-model modality flags. Assume all + // configured models support vision and tool calling; a model + // without multimodal support surfaces as a chat-call error. + ModelCapabilities { + name, + has_vision: true, + has_tool_calling: true, + } + } +} + +/// Extract a diagnostic fragment from a llama-swap / llama-server response +/// that doesn't match the expected `{choices: [...]}` shape. llama-server +/// returns errors as `{"error": {"message": "...", "code": N, "type": "..."}}`; +fn log_timings(parsed: &Value, prompt_tokens: Option<i32>, completion_tokens: Option<i32>) { + let timings = match parsed.get("timings") { + Some(t) => t, + None => return, + }; + let prompt_tps = timings.get("prompt_per_second").and_then(|v| v.as_f64()); + let gen_tps = timings.get("predicted_per_second").and_then(|v| v.as_f64()); + let prompt_ms = timings.get("prompt_ms").and_then(|v| v.as_f64()); + let gen_ms = timings.get("predicted_ms").and_then(|v| v.as_f64()); + + let mut parts: Vec<String> = Vec::new(); + if let Some(c) = prompt_tokens { + let mut s = format!("prompt={} tok", c); + if let Some(ms) = prompt_ms { + s.push_str(&format!(" ({:.0} ms", ms)); + if let Some(tps) = prompt_tps { + s.push_str(&format!(", {:.1} tok/s", tps)); + } + s.push(')'); + } + parts.push(s); + } + if let Some(c) = completion_tokens { + let mut s = format!("gen={} tok", c); + if let Some(ms) = gen_ms { + s.push_str(&format!(" ({:.0} ms", ms)); + if let Some(tps) = gen_tps { + s.push_str(&format!(", {:.1} tok/s", tps)); + } + s.push(')'); + } + parts.push(s); + } + if !parts.is_empty() { + log::info!("llama-swap chat metrics — {}", parts.join(", ")); + } +} + +/// llama-swap itself sometimes wraps subprocess failures with its own +/// `{"error": "..."}` flat shape. Surface either when present, otherwise fall +/// back to a truncated raw-JSON view. +fn extract_error_detail(parsed: &Value) -> String { + if let Some(err) = parsed.get("error") { + match err { + Value::Object(_) => { + let message = err + .get("message") + .and_then(|v| v.as_str()) + .unwrap_or("(no message)"); + let code = err + .get("code") + .map(|v| match v { + Value::String(s) => s.clone(), + other => other.to_string(), + }) + .unwrap_or_else(|| "?".to_string()); + let short_message: String = message.chars().take(240).collect(); + return format!("error code={} message=\"{}\"", code, short_message); + } + Value::String(s) => { + let short: String = s.chars().take(240).collect(); + return format!("error=\"{}\"", short); + } + _ => {} + } + } + let raw = parsed.to_string(); + raw.chars().take(300).collect() +} + +fn find_double_newline(buf: &[u8]) -> Option<usize> { + for i in 0..buf.len().saturating_sub(1) { + if buf[i] == b'\n' && buf[i + 1] == b'\n' { + return Some(i); + } + if i + 3 < buf.len() + && buf[i] == b'\r' + && buf[i + 1] == b'\n' + && buf[i + 2] == b'\r' + && buf[i + 3] == b'\n' + { + return Some(i + 1); + } + } + None +} + +fn image_to_data_url(img: &str) -> String { + if img.starts_with("data:") { + img.to_string() + } else { + format!("data:image/jpeg;base64,{}", img) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn tool_call_arguments_stringified_on_send() { + let msg = ChatMessage { + role: "assistant".into(), + content: String::new(), + tool_calls: Some(vec![ToolCall { + id: Some("call_abc".into()), + function: ToolCallFunction { + name: "search_sms".into(), + arguments: json!({"query": "hello", "limit": 5}), + }, + }]), + images: None, + }; + + let wire = LlamaCppClient::messages_to_openai(&[msg]); + let tcs = wire[0] + .get("tool_calls") + .and_then(|v| v.as_array()) + .expect("tool_calls present"); + let args = tcs[0] + .get("function") + .and_then(|f| f.get("arguments")) + .and_then(|a| a.as_str()) + .expect("arguments stringified"); + let parsed: Value = serde_json::from_str(args).unwrap(); + assert_eq!(parsed["query"], "hello"); + assert_eq!(parsed["limit"], 5); + } + + #[test] + fn tool_call_arguments_parsed_on_receive() { + let response_msg = json!({ + "role": "assistant", + "content": "", + "tool_calls": [{ + "id": "call_xyz", + "type": "function", + "function": { + "name": "get_weather", + "arguments": "{\"city\":\"Boston\",\"units\":\"celsius\"}" + } + }] + }); + let parsed = LlamaCppClient::openai_message_to_chat(&response_msg).unwrap(); + let tcs = parsed.tool_calls.unwrap(); + assert_eq!(tcs.len(), 1); + assert_eq!(tcs[0].function.name, "get_weather"); + assert_eq!(tcs[0].function.arguments["city"], "Boston"); + assert_eq!(tcs[0].function.arguments["units"], "celsius"); + assert_eq!(tcs[0].id.as_deref(), Some("call_xyz")); + } + + #[test] + fn tool_call_arguments_accept_native_json_on_receive() { + // Some llama.cpp builds emit arguments as a JSON object directly when + // jinja's tool-output strict-string rule isn't applied — accept both. + let response_msg = json!({ + "role": "assistant", + "content": "", + "tool_calls": [{ + "id": "call_1", + "type": "function", + "function": { + "name": "foo", + "arguments": {"nested": {"k": 1}} + } + }] + }); + let parsed = LlamaCppClient::openai_message_to_chat(&response_msg).unwrap(); + let tc = &parsed.tool_calls.unwrap()[0]; + assert_eq!(tc.function.arguments["nested"]["k"], 1); + } + + #[test] + fn images_become_content_parts() { + let mut msg = ChatMessage::user("What is in this photo?"); + msg.images = Some(vec!["BASE64DATA".into()]); + + let wire = LlamaCppClient::messages_to_openai(&[msg]); + let content = wire[0].get("content").and_then(|v| v.as_array()).unwrap(); + assert_eq!(content.len(), 2); + assert_eq!(content[0]["type"], "text"); + assert_eq!(content[0]["text"], "What is in this photo?"); + assert_eq!(content[1]["type"], "image_url"); + assert_eq!( + content[1]["image_url"]["url"], + "data:image/jpeg;base64,BASE64DATA" + ); + } + + #[test] + fn data_url_images_pass_through_unchanged() { + let mut msg = ChatMessage::user(""); + msg.images = Some(vec!["data:image/png;base64,ABCDEF".into()]); + let wire = LlamaCppClient::messages_to_openai(&[msg]); + let content = wire[0].get("content").and_then(|v| v.as_array()).unwrap(); + assert_eq!(content.len(), 1); + assert_eq!( + content[0]["image_url"]["url"], + "data:image/png;base64,ABCDEF" + ); + } + + #[test] + fn text_only_message_stays_string() { + let msg = ChatMessage::user("hello"); + let wire = LlamaCppClient::messages_to_openai(&[msg]); + assert_eq!(wire[0]["content"], "hello"); + assert!(wire[0]["content"].as_str().is_some()); + } + + #[test] + fn tool_result_inherits_tool_call_id_from_prior_assistant() { + let assistant = ChatMessage { + role: "assistant".into(), + content: String::new(), + tool_calls: Some(vec![ToolCall { + id: Some("call_42".into()), + function: ToolCallFunction { + name: "lookup".into(), + arguments: json!({}), + }, + }]), + images: None, + }; + let tool_result = ChatMessage::tool_result("found it"); + + let wire = LlamaCppClient::messages_to_openai(&[assistant, tool_result]); + assert_eq!(wire[1]["role"], "tool"); + assert_eq!(wire[1]["tool_call_id"], "call_42"); + } + + #[test] + fn multiple_tool_results_map_to_sequential_call_ids() { + let assistant = ChatMessage { + role: "assistant".into(), + content: String::new(), + tool_calls: Some(vec![ + ToolCall { + id: Some("call_A".into()), + function: ToolCallFunction { + name: "a".into(), + arguments: json!({}), + }, + }, + ToolCall { + id: Some("call_B".into()), + function: ToolCallFunction { + name: "b".into(), + arguments: json!({}), + }, + }, + ]), + images: None, + }; + let r1 = ChatMessage::tool_result("a result"); + let r2 = ChatMessage::tool_result("b result"); + + let wire = LlamaCppClient::messages_to_openai(&[assistant, r1, r2]); + assert_eq!(wire[1]["tool_call_id"], "call_A"); + assert_eq!(wire[2]["tool_call_id"], "call_B"); + } + + #[test] + fn missing_tool_call_id_gets_synthetic_fallback() { + let assistant = ChatMessage { + role: "assistant".into(), + content: String::new(), + tool_calls: Some(vec![ToolCall { + id: None, + function: ToolCallFunction { + name: "noid".into(), + arguments: json!({}), + }, + }]), + images: None, + }; + let wire = LlamaCppClient::messages_to_openai(&[assistant]); + let tcs = wire[0] + .get("tool_calls") + .and_then(|v| v.as_array()) + .unwrap(); + assert_eq!(tcs[0]["id"], "call_0"); + } + + #[test] + fn all_models_report_vision_capable() { + let mut c = LlamaCppClient::new(None, Some("chat".into())); + c.set_vision_model("vision".into()); + + let m_chat = json!({ "id": "chat" }); + let m_vision = json!({ "id": "vision" }); + let m_other = json!({ "id": "embed" }); + + let chat = c.parse_model_capabilities(&m_chat); + let vision = c.parse_model_capabilities(&m_vision); + let other = c.parse_model_capabilities(&m_other); + + assert!(chat.has_vision); + assert!(chat.has_tool_calling); + assert!(vision.has_vision); + assert!(other.has_vision); + } + + #[test] + fn vision_model_defaults_to_primary() { + let c = LlamaCppClient::new(None, Some("qwen3:32b".into())); + assert_eq!(c.primary_model, "qwen3:32b"); + assert_eq!(c.vision_model, "qwen3:32b"); + } + + #[test] + fn vision_model_explicit_override_diverges_from_primary() { + let mut c = LlamaCppClient::new(None, Some("qwen3:32b".into())); + c.set_vision_model("minicpm-v".into()); + assert_eq!(c.primary_model, "qwen3:32b"); + assert_eq!(c.vision_model, "minicpm-v"); + } + + #[test] + fn cloned_local_with_model_override_keeps_all_slots_consistent() { + // Simulates what resolve_backend does for the `local` client: + // clone the configured client, then override primary + vision + // to match the user-selected chat model. This prevents mid-turn + // model swaps in llama-swap exclusive mode. + let mut base = LlamaCppClient::new(None, Some("chat".into())); + base.set_vision_model("vision".into()); + base.set_embedding_model("embed".into()); + + let mut local = base.clone(); + let user_selected = "qwen3:32b"; + local.primary_model = user_selected.to_string(); + local.set_vision_model(user_selected.to_string()); + + // Chat generation (rerank) routes through primary_model. + assert_eq!(local.primary_model, user_selected); + // describe_image routes through vision_model. + assert_eq!(local.vision_model, user_selected); + // Embeddings stay on the dedicated slot — separate endpoint, + // no model swap conflict. + assert_eq!(local.embedding_model, "embed"); + } + + #[test] + fn hybrid_mode_local_client_preserves_vision_model() { + // In hybrid mode, overrides.model is an OpenRouter model id + // (e.g. "google/gemini-3-flash-preview"). The local llamacpp + // client must NOT adopt that as its vision_model — it should + // keep the configured LLAMA_SWAP_VISION_MODEL so describe_image + // hits the correct local slot instead of sending an unknown + // model name to llama-swap. + let mut base = LlamaCppClient::new(None, Some("chat".into())); + base.set_vision_model("vision".into()); + base.set_embedding_model("embed".into()); + + // Simulate what resolve_backend SHOULD do for hybrid mode: + // clone but do NOT override primary_model / vision_model. + let local = base.clone(); + + // The local client keeps its configured slots. + assert_eq!(local.primary_model, "chat"); + assert_eq!(local.vision_model, "vision"); + assert_eq!(local.embedding_model, "embed"); + } + + #[test] + fn assistant_tool_calls_emit_null_content() { + let msg = ChatMessage { + role: "assistant".into(), + content: String::new(), + tool_calls: Some(vec![ToolCall { + id: Some("call_1".into()), + function: ToolCallFunction { + name: "search".into(), + arguments: json!({}), + }, + }]), + images: None, + }; + let wire = LlamaCppClient::messages_to_openai(&[msg]); + assert!( + wire[0]["content"].is_null(), + "empty content + tool_calls should emit null" + ); + } + + #[test] + fn assistant_with_content_and_tool_calls_preserves_content() { + let msg = ChatMessage { + role: "assistant".into(), + content: "Let me search for that.".into(), + tool_calls: Some(vec![ToolCall { + id: Some("call_1".into()), + function: ToolCallFunction { + name: "search".into(), + arguments: json!({}), + }, + }]), + images: None, + }; + let wire = LlamaCppClient::messages_to_openai(&[msg]); + assert_eq!(wire[0]["content"], "Let me search for that."); + } + + #[test] + fn assistant_without_tool_calls_keeps_empty_string_content() { + let msg = ChatMessage { + role: "assistant".into(), + content: String::new(), + tool_calls: None, + images: None, + }; + let wire = LlamaCppClient::messages_to_openai(&[msg]); + assert_eq!(wire[0]["content"], ""); + } +} diff --git a/src/ai/mod.rs b/src/ai/mod.rs index 3468325..c991c71 100644 --- a/src/ai/mod.rs +++ b/src/ai/mod.rs @@ -1,10 +1,12 @@ pub mod apollo_client; +pub mod backend; pub mod clip_client; pub mod daily_summary_job; pub mod face_client; pub mod handlers; pub mod insight_chat; pub mod insight_generator; +pub mod llamacpp; pub mod llm_client; pub mod ollama; pub mod openrouter; @@ -23,6 +25,7 @@ pub use handlers::{ get_insight_handler, get_openrouter_models_handler, rate_insight_handler, }; pub use insight_generator::InsightGenerator; +pub use llamacpp::LlamaCppClient; #[allow(unused_imports)] pub use llm_client::{ ChatMessage, LlmClient, ModelCapabilities, Tool, ToolCall, ToolCallFunction, ToolFunction, @@ -38,3 +41,87 @@ pub use sms_client::{SmsApiClient, SmsMessage}; pub fn user_display_name() -> String { std::env::var("USER_NAME").unwrap_or_else(|_| "Me".to_string()) } + +/// One switch for the "local" LLM stack: when `LLM_BACKEND=llamacpp` is +/// set, chat / vision describe / embeddings all route through llama-swap +/// instead of Ollama. Any other value (including unset, the default) is +/// Ollama. This is intentionally global — embeddings must be drawn from +/// a single source or similarity search across the index breaks (mixed +/// vector spaces, possibly mixed dims). The `backend=hybrid` per-request +/// override remains orthogonal: it always sends chat to OpenRouter, and +/// uses `LLM_BACKEND` for the describe-then-inline vision pass. +pub fn local_backend_is_llamacpp() -> bool { + matches!( + std::env::var("LLM_BACKEND") + .ok() + .as_deref() + .map(|s| s.trim().to_lowercase()) + .as_deref(), + Some("llamacpp") + ) +} + +/// Embed one string via the configured local backend. Routes through +/// llama-swap when `LLM_BACKEND=llamacpp` (and a client is configured), +/// else Ollama. Returns the single embedding vector. See +/// [`local_backend_is_llamacpp`] for the rationale on consistency. +pub async fn embed_one( + ollama: &OllamaClient, + llamacpp: Option<&LlamaCppClient>, + text: &str, +) -> anyhow::Result<Vec<f32>> { + if local_backend_is_llamacpp() { + if let Some(lc) = llamacpp { + let mut vecs = <LlamaCppClient as LlmClient>::generate_embeddings(lc, &[text]).await?; + return vecs + .pop() + .ok_or_else(|| anyhow::anyhow!("llama-swap returned no embeddings")); + } + log::warn!( + "LLM_BACKEND=llamacpp but LlamaCppClient is unconfigured; falling back to Ollama embeddings" + ); + } + ollama.generate_embedding(text).await +} + +#[cfg(test)] +mod env_dispatch_tests { + use super::*; + + fn with_env<F: FnOnce()>(key: &str, val: Option<&str>, f: F) { + let prev = std::env::var(key).ok(); + match val { + Some(v) => unsafe { std::env::set_var(key, v) }, + None => unsafe { std::env::remove_var(key) }, + } + f(); + match prev { + Some(v) => unsafe { std::env::set_var(key, v) }, + None => unsafe { std::env::remove_var(key) }, + } + } + + #[test] + fn llm_backend_defaults_to_ollama() { + with_env("LLM_BACKEND", None, || { + assert!(!local_backend_is_llamacpp()); + }); + } + + #[test] + fn llm_backend_llamacpp_case_insensitive() { + with_env("LLM_BACKEND", Some("LlamaCpp"), || { + assert!(local_backend_is_llamacpp()); + }); + with_env("LLM_BACKEND", Some(" llamacpp "), || { + assert!(local_backend_is_llamacpp()); + }); + } + + #[test] + fn llm_backend_unknown_value_is_ollama() { + with_env("LLM_BACKEND", Some("vllm"), || { + assert!(!local_backend_is_llamacpp()); + }); + } +} diff --git a/src/ai/sms_client.rs b/src/ai/sms_client.rs index 6661bac..d5e175f 100644 --- a/src/ai/sms_client.rs +++ b/src/ai/sms_client.rs @@ -281,6 +281,9 @@ impl SmsApiClient { if let Some(cid) = params.contact_id { url.push_str(&format!("&contact_id={}", cid)); } + if let Some(ref c) = params.contact { + url.push_str(&format!("&contact={}", urlencoding::encode(c))); + } if let Some(off) = params.offset { url.push_str(&format!("&offset={}", off)); } @@ -413,6 +416,9 @@ pub struct SmsSearchParams<'a> { pub mode: &'a str, pub limit: usize, pub contact_id: Option<i64>, + /// Contact name (case-insensitive). Resolved to a numeric ID by the + /// SMS-API server when `contact_id` is not set. + pub contact: Option<String>, /// Unix-seconds inclusive lower bound on `date`. pub date_from: Option<i64>, /// Unix-seconds inclusive upper bound on `date`. diff --git a/src/bin/populate_knowledge.rs b/src/bin/populate_knowledge.rs index 29945d7..71f2f8a 100644 --- a/src/bin/populate_knowledge.rs +++ b/src/bin/populate_knowledge.rs @@ -195,6 +195,7 @@ async fn main() -> anyhow::Result<()> { let generator = InsightGenerator::new( ollama, None, + None, sms_client, apollo_client, insight_dao.clone(), diff --git a/src/content_hash.rs b/src/content_hash.rs index d68dbef..a2a4632 100644 --- a/src/content_hash.rs +++ b/src/content_hash.rs @@ -62,6 +62,15 @@ pub fn large_preview_path(thumbs_dir: &Path, hash: &str) -> PathBuf { .join(format!("{}.jpg", hash)) } +/// Hash-keyed xlarge-preview path: `<thumbs_dir>/_xlarge/<hash[..2]>/<hash>.jpg`. +pub fn xlarge_preview_path(thumbs_dir: &Path, hash: &str) -> PathBuf { + let shard = shard_prefix(hash); + thumbs_dir + .join("_xlarge") + .join(shard) + .join(format!("{}.jpg", hash)) +} + /// Hash-keyed HLS output directory: `<video_dir>/<hash[..2]>/<hash>/`. /// The playlist lives at `playlist.m3u8` inside this directory and its /// segments are co-located so HLS relative references Just Work. See diff --git a/src/data/mod.rs b/src/data/mod.rs index 4780d6c..931b3c6 100644 --- a/src/data/mod.rs +++ b/src/data/mod.rs @@ -194,6 +194,7 @@ pub enum MediaType { #[serde(rename_all = "lowercase")] pub enum PhotoSize { Full, + XLarge, Large, Thumb, } diff --git a/src/handlers/image.rs b/src/handlers/image.rs index ca0f598..c1dbffb 100644 --- a/src/handlers/image.rs +++ b/src/handlers/image.rs @@ -83,12 +83,14 @@ pub async fn get_image( if let Some((library, path)) = resolved { let image_size = req.size.unwrap_or(PhotoSize::Full); - // `size=large` is only meaningful for stills — there's no useful - // "2048px video preview" tier. Videos fall back to the existing - // thumb pipeline (which already handles gif/static selection). - // `mut` so the Large branch can downgrade itself to `Full` after a - // generation failure (RAW-preview branch below keys off `Full`). - let mut image_size = if image_size == PhotoSize::Large && file_types::is_video_file(&path) { + // `size=large|xlarge` is only meaningful for stills — there's no + // useful "resized video preview" tier. Videos fall back to the + // existing thumb pipeline (which already handles gif/static + // selection). `mut` so preview branches can downgrade to `Full` + // after a generation failure. + let mut image_size = if (image_size == PhotoSize::Large || image_size == PhotoSize::XLarge) + && file_types::is_video_file(&path) + { PhotoSize::Thumb } else { image_size @@ -196,6 +198,93 @@ pub async fn get_image( image_size = PhotoSize::Full; } + if image_size == PhotoSize::XLarge { + let relative_path = path + .strip_prefix(&library.root_path) + .expect("Error stripping library root prefix from xlarge preview"); + let relative_path_str = relative_path.to_string_lossy().replace('\\', "/"); + let thumbs = Path::new(&app_state.thumbnail_path); + let xlarge_dir = thumbs.join("_xlarge"); + + let hash_xlarge_path: Option<PathBuf> = { + let mut dao = exif_dao.lock().expect("Unable to lock ExifDao"); + match dao.get_exif(&context, &relative_path_str) { + Ok(Some(row)) => row + .content_hash + .as_deref() + .map(|h| content_hash::xlarge_preview_path(thumbs, h)), + _ => None, + } + }; + let scoped_legacy_xlarge_path = + content_hash::library_scoped_legacy_path(&xlarge_dir, library.id, relative_path); + + let existing = hash_xlarge_path + .as_ref() + .filter(|p| p.exists()) + .cloned() + .or_else(|| { + if scoped_legacy_xlarge_path.exists() { + Some(scoped_legacy_xlarge_path.clone()) + } else { + None + } + }); + + if let Some(found) = existing { + if let Ok(file) = NamedFile::open(&found) { + span.set_status(Status::Ok); + return file + .use_etag(true) + .use_last_modified(true) + .prefer_utf8(true) + .into_response(&request); + } + } + + let dest = hash_xlarge_path + .clone() + .unwrap_or_else(|| scoped_legacy_xlarge_path.clone()); + let src = path.clone(); + let dest_for_block = dest.clone(); + let generated = web::block(move || { + if let Some(parent) = dest_for_block.parent() { + std::fs::create_dir_all(parent)?; + } + let tmp = dest_for_block.with_extension("jpg.tmp"); + crate::thumbnails::generate_xlarge_preview(&src, &tmp)?; + std::fs::rename(&tmp, &dest_for_block)?; + Ok::<(), std::io::Error>(()) + }) + .await; + + match generated { + Ok(Ok(())) => { + if let Ok(file) = NamedFile::open(&dest) { + span.set_status(Status::Ok); + return file + .use_etag(true) + .use_last_modified(true) + .prefer_utf8(true) + .into_response(&request); + } + } + Ok(Err(e)) => { + warn!( + "XLarge preview generation failed for {:?}: {} — falling back to original", + path, e + ); + } + Err(e) => { + warn!( + "XLarge preview blocking-pool error for {:?}: {} — falling back to original", + path, e + ); + } + } + image_size = PhotoSize::Full; + } + if image_size == PhotoSize::Thumb { let relative_path = path .strip_prefix(&library.root_path) diff --git a/src/state.rs b/src/state.rs index 8f1bd4e..8cfccbb 100644 --- a/src/state.rs +++ b/src/state.rs @@ -2,6 +2,7 @@ use crate::ai::apollo_client::ApolloClient; use crate::ai::clip_client::ClipClient; use crate::ai::face_client::FaceClient; use crate::ai::insight_chat::{ChatLockMap, InsightChatService}; +use crate::ai::llamacpp::LlamaCppClient; use crate::ai::openrouter::OpenRouterClient; use crate::ai::{InsightGenerator, OllamaClient, SmsApiClient}; use crate::database::{ @@ -62,6 +63,16 @@ pub struct AppState { /// Curated list of OpenRouter model ids exposed to clients. Sourced from /// `OPENROUTER_ALLOWED_MODELS` (comma-separated). Empty when unset. pub openrouter_allowed_models: Vec<String>, + /// `None` when `LLAMA_SWAP_URL` is not configured. Consulted only when a + /// request explicitly opts into `backend=llamacpp`. Same shape as the + /// `openrouter` slot — present here so handlers can route to it without + /// threading through the generator. + #[allow(dead_code)] + pub llamacpp: Option<Arc<LlamaCppClient>>, + /// Curated list of llama-swap model ids exposed to clients. Sourced from + /// `LLAMA_SWAP_ALLOWED_MODELS` (comma-separated). Empty when unset; the + /// server then falls back to `LLAMA_SWAP_PRIMARY_MODEL`. + pub llamacpp_allowed_models: Vec<String>, pub sms_client: SmsApiClient, pub insight_generator: InsightGenerator, /// Chat continuation service. Hold an Arc so handlers can clone cheaply. @@ -105,6 +116,8 @@ impl AppState { ollama: OllamaClient, openrouter: Option<Arc<OpenRouterClient>>, openrouter_allowed_models: Vec<String>, + llamacpp: Option<Arc<LlamaCppClient>>, + llamacpp_allowed_models: Vec<String>, sms_client: SmsApiClient, insight_generator: InsightGenerator, insight_chat: Arc<InsightChatService>, @@ -145,6 +158,8 @@ impl AppState { ollama, openrouter, openrouter_allowed_models, + llamacpp, + llamacpp_allowed_models, sms_client, insight_generator, insight_chat, @@ -186,6 +201,9 @@ impl Default for AppState { let openrouter = build_openrouter_from_env(); let openrouter_allowed_models = parse_openrouter_allowed_models(); + let llamacpp = build_llamacpp_from_env(); + let llamacpp_allowed_models = parse_llamacpp_allowed_models(); + let sms_api_url = env::var("SMS_API_URL").unwrap_or_else(|_| "http://localhost:8000".to_string()); let sms_api_token = env::var("SMS_API_TOKEN").ok(); @@ -250,6 +268,7 @@ impl Default for AppState { let insight_generator = InsightGenerator::new( ollama.clone(), openrouter.clone(), + llamacpp.clone(), sms_client.clone(), apollo_client.clone(), insight_dao.clone(), @@ -271,8 +290,6 @@ impl Default for AppState { Arc::new(tokio::sync::Mutex::new(std::collections::HashMap::new())); let insight_chat = Arc::new(InsightChatService::new( Arc::new(insight_generator.clone()), - ollama.clone(), - openrouter.clone(), insight_dao.clone(), chat_locks, )); @@ -294,6 +311,8 @@ impl Default for AppState { ollama, openrouter, openrouter_allowed_models, + llamacpp, + llamacpp_allowed_models, sms_client, insight_generator, insight_chat, @@ -335,6 +354,37 @@ fn parse_openrouter_allowed_models() -> Vec<String> { .collect() } +/// Build a `LlamaCppClient` from environment variables. Returns `None` when +/// `LLAMA_SWAP_URL` is unset. The client is constructed unconditionally +/// when the URL is set (so it's available even under `LLM_BACKEND=ollama` +/// for ad-hoc tooling), but the agentic / chat paths only route through it +/// when `LLM_BACKEND=llamacpp`. Slot ids default to the names the bundled +/// `llama-swap/config.yaml` uses — `chat` / `vision` / `embed`. +fn build_llamacpp_from_env() -> Option<Arc<LlamaCppClient>> { + let base_url = env::var("LLAMA_SWAP_URL").ok()?; + let primary_model = env::var("LLAMA_SWAP_PRIMARY_MODEL").ok(); + let mut client = LlamaCppClient::new(Some(base_url), primary_model); + if let Ok(model) = env::var("LLAMA_SWAP_EMBEDDING_MODEL") { + client.set_embedding_model(model); + } + if let Ok(model) = env::var("LLAMA_SWAP_VISION_MODEL") { + client.set_vision_model(model); + } + Some(Arc::new(client)) +} + +/// Parse `LLAMA_SWAP_ALLOWED_MODELS` (comma-separated) into a vec. Used to +/// populate the model picker when `LLM_BACKEND=llamacpp` — `/insights/models` +/// surfaces these slots with capabilities. Empty when unset. +fn parse_llamacpp_allowed_models() -> Vec<String> { + env::var("LLAMA_SWAP_ALLOWED_MODELS") + .unwrap_or_default() + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect() +} + #[cfg(test)] impl AppState { /// Creates an AppState instance for testing with temporary directories @@ -397,6 +447,7 @@ impl AppState { let insight_generator = InsightGenerator::new( ollama.clone(), None, + None, sms_client.clone(), apollo_client.clone(), insight_dao.clone(), @@ -416,8 +467,6 @@ impl AppState { Arc::new(tokio::sync::Mutex::new(std::collections::HashMap::new())); let insight_chat = Arc::new(InsightChatService::new( Arc::new(insight_generator.clone()), - ollama.clone(), - None, insight_dao.clone(), chat_locks, )); @@ -445,6 +494,8 @@ impl AppState { ollama, None, Vec::new(), + None, + Vec::new(), sms_client, insight_generator, insight_chat, diff --git a/src/thumbnails.rs b/src/thumbnails.rs index 51b6200..a7334b0 100644 --- a/src/thumbnails.rs +++ b/src/thumbnails.rs @@ -36,12 +36,19 @@ use crate::video::actors::{generate_image_thumbnail_ffmpeg, generate_video_thumb /// `size=full` and the handler streams the original bytes. pub const LARGE_PREVIEW_MAX_DIM: u32 = 2048; -/// JPEG quality for the large preview tier. 85 is the conventional -/// "indistinguishable from source at viewing size" point — well above the -/// `image` crate's default ~75, but well below quality-90+ territory where -/// file size doubles for no perceptible win. +/// JPEG quality for the large and xlarge preview tiers. 85 is the +/// conventional "indistinguishable from source at viewing size" point — +/// well above the `image` crate's default ~75, but well below quality-90+ +/// territory where file size doubles for no perceptible win. const LARGE_PREVIEW_JPEG_QUALITY: u8 = 85; +/// Maximum long-edge size (px) for the xlarge preview tier. Bridges the +/// gap between `large` (2048px, ~16MB decoded) and the original bytes +/// (potentially 48+ MP / ~192MB decoded). At 4096px the decoded bitmap is +/// ~64MB — enough for 2-3× pinch-zoom on any phone before the viewer +/// needs to stream the true original. +pub const XLARGE_PREVIEW_MAX_DIM: u32 = 4096; + lazy_static! { pub static ref IMAGE_GAUGE: IntGauge = IntGauge::new( "imageserver_image_total", @@ -205,6 +212,86 @@ fn generate_large_preview_ffmpeg(src: &Path, dest: &Path) -> std::io::Result<()> Ok(()) } +/// Generate the on-demand xlarge-preview tier (≈4096 long edge JPEG). +/// +/// Same waterfall as [`generate_large_preview`] but targeting +/// [`XLARGE_PREVIEW_MAX_DIM`]. Sources whose long edge is already below +/// the cap are encoded at native size (no upscale). +pub fn generate_xlarge_preview(src: &Path, dest: &Path) -> std::io::Result<()> { + let orientation = exif::read_orientation(src).unwrap_or(1); + + if let Some(preview) = exif::extract_embedded_jpeg_preview(src) { + let img = image::load_from_memory(&preview).map_err(|e| { + std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("decode embedded preview {:?}: {}", src, e), + ) + })?; + let img = exif::apply_orientation(img, orientation); + return encode_xlarge_jpeg(img, dest); + } + + if file_types::needs_ffmpeg_thumbnail(src) { + return generate_xlarge_preview_ffmpeg(src, dest); + } + + let img = image::open(src).map_err(|e| { + std::io::Error::new(std::io::ErrorKind::InvalidData, format!("{:?}: {}", src, e)) + })?; + let img = exif::apply_orientation(img, orientation); + encode_xlarge_jpeg(img, dest) +} + +fn encode_xlarge_jpeg(img: image::DynamicImage, dest: &Path) -> std::io::Result<()> { + let (w, h) = img.dimensions(); + let max_dim = w.max(h); + let scaled = if max_dim > XLARGE_PREVIEW_MAX_DIM { + img.thumbnail(XLARGE_PREVIEW_MAX_DIM, XLARGE_PREVIEW_MAX_DIM) + } else { + img + }; + let file = std::fs::File::create(dest) + .map_err(|e| std::io::Error::other(format!("create {:?}: {}", dest, e)))?; + let mut writer = std::io::BufWriter::new(file); + let mut encoder = JpegEncoder::new_with_quality(&mut writer, LARGE_PREVIEW_JPEG_QUALITY); + encoder + .encode_image(&scaled) + .map_err(|e| std::io::Error::other(format!("encode {:?}: {}", dest, e)))?; + Ok(()) +} + +fn generate_xlarge_preview_ffmpeg(src: &Path, dest: &Path) -> std::io::Result<()> { + let vf = format!( + "scale='if(gt(iw,ih),min(iw,{cap}),-1)':'if(gt(iw,ih),-1,min(ih,{cap}))'", + cap = XLARGE_PREVIEW_MAX_DIM + ); + let output = Command::new("ffmpeg") + .arg("-y") + .arg("-i") + .arg(src) + .arg("-vframes") + .arg("1") + .arg("-vf") + .arg(&vf) + .arg("-q:v") + .arg("5") + .arg("-f") + .arg("image2") + .arg("-c:v") + .arg("mjpeg") + .arg(dest) + .output()?; + + if !output.status.success() { + return Err(std::io::Error::other(format!( + "ffmpeg failed ({}): {}", + output.status, + String::from_utf8_lossy(&output.stderr).trim() + ))); + } + Ok(()) +} + pub fn create_thumbnails(libs: &[libraries::Library], excluded_dirs: &[String]) { let tracer = global_tracer(); let span = tracer.start("creating thumbnails");