From f0927f535510ccfecf91c1896041b93c2c156603 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Wed, 20 May 2026 17:52:33 -0400 Subject: [PATCH 01/11] ai: add llamacpp backend (llama-swap) as third LLM client Wires a new LlamaCppClient (OpenAI-compatible /v1 wire format) alongside OllamaClient and OpenRouterClient. Per-slot routing for chat/vision/embed via env (LLAMA_SWAP_URL + *_MODEL vars); capability inference uses an env allowlist since /v1/models doesn't report modality. InsightGenerator + InsightChatService gain three-way dispatch on chat_backend = "local" | "hybrid" | "llamacpp". Hybrid and llamacpp share the describe-then-inline path (text-only chat after a separate vision describe). HYBRID_VISION_BACKEND=llamacpp lets hybrid route its describe pass through llama-swap's vision slot while chat still goes to OpenRouter. Cross-replay matrix added (validate_cross_replay): local<->llamacpp and hybrid<->llamacpp allowed; local->hybrid and llamacpp->hybrid rejected. New /insights/llamacpp/models handler mirrors the OpenRouter shape. --- CLAUDE.md | 49 +- src/ai/handlers.rs | 30 ++ src/ai/insight_chat.rs | 266 ++++++--- src/ai/insight_generator.rs | 171 ++++-- src/ai/llamacpp.rs | 978 ++++++++++++++++++++++++++++++++++ src/ai/mod.rs | 4 +- src/bin/populate_knowledge.rs | 1 + src/main.rs | 1 + src/state.rs | 70 +++ 9 files changed, 1468 insertions(+), 102 deletions(-) create mode 100644 src/ai/llamacpp.rs diff --git a/CLAUDE.md b/CLAUDE.md index 7e605cc..d3419e6 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -475,6 +475,7 @@ POST /insights/generate/agentic (tool-calling loop; body: { file_path, back GET /insights?path=...&library=... GET /insights/models (local Ollama models + capabilities) GET /insights/openrouter/models (curated OpenRouter allowlist) +GET /insights/llamacpp/models (curated llama-swap slot allowlist) POST /insights/rate (thumbs up/down for training data) // Insight Chat Continuation @@ -631,6 +632,23 @@ OPENROUTER_EMBEDDING_MODEL=openai/text-embedding-3-small # Optional, embeddings OPENROUTER_HTTP_REFERER=https://your-site.example # Optional attribution header OPENROUTER_APP_TITLE=ImageApi # Optional attribution header +# llama.cpp / llama-swap (Llamacpp Backend) - sibling to Ollama; OpenAI-compatible +# proxy hosting one or more llama-server processes (chat / vision / embed slots). +LLAMA_SWAP_URL=http://localhost:9292/v1 # Required to enable llamacpp backend +LLAMA_SWAP_PRIMARY_MODEL=chat # Chat slot id (matches config.yaml) +LLAMA_SWAP_VISION_MODEL=vision # Vision slot id; describe_image routes here +LLAMA_SWAP_EMBEDDING_MODEL=embed # Embedding slot id (when local embeddings via llamacpp) +LLAMA_SWAP_VISION_MODELS=qwen-vl,llava # Comma-separated slot ids known to have vision. + # Drives `has_vision` in /insights/llamacpp/models. + # `LLAMA_SWAP_VISION_MODEL` is auto-included. +LLAMA_SWAP_ALLOWED_MODELS=chat,coder # Curated allowlist exposed to clients via + # GET /insights/llamacpp/models. Empty = no picker. +LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180 # Per-request timeout; bump for slow CPU offload +HYBRID_VISION_BACKEND=llamacpp # Optional override for hybrid mode's describe_image: + # `ollama` (default) or `llamacpp`. When `llamacpp`, + # hybrid still routes chat to OpenRouter but uses + # llama-swap's vision slot to describe images. + # Insight Chat Continuation AGENTIC_CHAT_MAX_ITERATIONS=6 # Cap on tool-calling iterations per chat turn (default 6) ``` @@ -652,8 +670,11 @@ This allows runtime verification of model availability before generating insight **Hybrid Backend (OpenRouter):** - Per-request opt-in via `backend=hybrid` on `POST /insights/generate/agentic`. -- Local Ollama still describes the image (vision); the description is inlined - into the chat prompt and the agentic loop runs on OpenRouter. +- Vision describe happens before the agentic loop; the description is inlined + into the chat prompt and the agentic loop runs on OpenRouter. By default + vision uses local Ollama, but `HYBRID_VISION_BACKEND=llamacpp` flips it to + llama-swap's vision slot (useful when you want chat on a frontier model and + vision on a local-but-not-Ollama path). - `request.model` (if provided) overrides `OPENROUTER_DEFAULT_MODEL` for that call. The mobile picker reads from `OPENROUTER_ALLOWED_MODELS`. - No live capability precheck — the operator-curated allowlist is trusted. @@ -661,6 +682,30 @@ This allows runtime verification of model availability before generating insight - `GET /insights/openrouter/models` returns `{ models, default_model, configured }` for client picker UIs. +**Llamacpp Backend (llama-swap):** +- Per-request opt-in via `backend=llamacpp` on `POST /insights/generate/agentic`. +- Sibling to Ollama: a local OpenAI-compatible proxy (mostlygeek/llama-swap) + fronting one or more `llama-server` processes. The chat slot is text-only + by default; vision and embeddings have their own slots (`LLAMA_SWAP_VISION_MODEL`, + `LLAMA_SWAP_EMBEDDING_MODEL`) that llama-swap routes to by model id. The + bundled `docker-compose.yml` + `llama-swap/config.yaml` in the opencode root + is the reference deploy. +- Operates in the same describe-then-inline shape as hybrid: the chat model + never sees raw images. Vision describe routes through llama-swap's vision + slot (`describe_image` on `LlamaCppClient`). +- `request.model` (if provided) overrides `LLAMA_SWAP_PRIMARY_MODEL` for that + call (must match a slot id in llama-swap's `config.yaml`). The mobile picker + reads from `LLAMA_SWAP_ALLOWED_MODELS`. +- No live capability precheck — slot ids are trusted. Tool calling is assumed + for every slot (llama-swap entries typically launch with `--jinja`). +- `GET /insights/llamacpp/models` returns `{ models, default_model, configured }`. +- Cross-replay matrix (chat continuation): `local ↔ llamacpp` allowed (the + LlamaCppClient passes images through to the chat slot — you're responsible + for a vision-capable slot if the stored transcript carries images); + `hybrid ↔ llamacpp` allowed (both transcripts are text-only); `local → + hybrid` and `llamacpp → hybrid` rejected (mid-conversation description + source change isn't supported). + **Insight Chat Continuation:** After an agentic insight is generated, the full `Vec` transcript is diff --git a/src/ai/handlers.rs b/src/ai/handlers.rs index 0e46057..4809b25 100644 --- a/src/ai/handlers.rs +++ b/src/ai/handlers.rs @@ -549,6 +549,36 @@ pub async fn get_openrouter_models_handler( HttpResponse::Ok().json(response) } +#[derive(serde::Serialize)] +pub struct LlamaCppModelsResponse { + pub models: Vec, + pub default_model: Option, + pub configured: bool, +} + +/// GET /insights/llamacpp/models - Curated llama-swap model ids exposed +/// to clients for the llamacpp backend. Returned verbatim from +/// `LLAMA_SWAP_ALLOWED_MODELS`; no live call to llama-swap. Use +/// `LLAMA_SWAP_URL` plus `LLAMA_SWAP_PRIMARY_MODEL` on the server side to +/// pick the actual chat slot. +#[get("/insights/llamacpp/models")] +pub async fn get_llamacpp_models_handler( + _claims: Claims, + app_state: web::Data, +) -> impl Responder { + let configured = app_state.llamacpp.is_some(); + let default_model = app_state + .llamacpp + .as_ref() + .map(|c| c.primary_model.clone()); + let response = LlamaCppModelsResponse { + models: app_state.llamacpp_allowed_models.clone(), + default_model, + configured, + }; + HttpResponse::Ok().json(response) +} + /// POST /insights/rate - Rate an insight (thumbs up/down for training data) #[post("/insights/rate")] pub async fn rate_insight_handler( diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index b2a7af8..4e87d52 100644 --- a/src/ai/insight_chat.rs +++ b/src/ai/insight_chat.rs @@ -9,6 +9,7 @@ use tokio::sync::Mutex as TokioMutex; use crate::ai::insight_generator::InsightGenerator; use crate::ai::llm_client::{ChatMessage, LlmClient, LlmStreamEvent, Tool}; use crate::ai::ollama::OllamaClient; +use crate::ai::llamacpp::LlamaCppClient; use crate::ai::openrouter::OpenRouterClient; use crate::database::InsightDao; use crate::database::models::InsertPhotoInsight; @@ -93,6 +94,7 @@ pub struct InsightChatService { generator: Arc, ollama: OllamaClient, openrouter: Option>, + llamacpp: Option>, insight_dao: Arc>>, chat_locks: ChatLockMap, } @@ -102,6 +104,7 @@ impl InsightChatService { generator: Arc, ollama: OllamaClient, openrouter: Option>, + llamacpp: Option>, insight_dao: Arc>>, chat_locks: ChatLockMap, ) -> Self { @@ -109,6 +112,7 @@ impl InsightChatService { generator, ollama, openrouter, + llamacpp, insight_dao, chat_locks, } @@ -303,23 +307,15 @@ impl InsightChatService { .map(|s| s.trim().to_lowercase()) .filter(|s| !s.is_empty()) .unwrap_or_else(|| stored_backend.clone()); - if !matches!(effective_backend.as_str(), "local" | "hybrid") { - bail!( - "unknown backend '{}'; expected 'local' or 'hybrid'", - effective_backend - ); - } - if stored_backend == "local" && effective_backend == "hybrid" { - bail!( - "switching from local to hybrid mid-chat isn't supported yet; \ - regenerate the insight in hybrid mode if you want OpenRouter chat" - ); - } + validate_cross_replay(&stored_backend, &effective_backend)?; let is_hybrid = effective_backend == "hybrid"; + let is_llamacpp = effective_backend == "llamacpp"; + let describes_then_inlines = is_hybrid || is_llamacpp; span.set_attribute(KeyValue::new("backend", effective_backend.clone())); // 4. Build the chat backend client. Ollama in local mode, a freshly - // cloned OpenRouter client in hybrid mode (clone so per-request + // cloned OpenRouter client in hybrid mode, a freshly cloned + // LlamaCppClient in llamacpp mode (clone so per-request // sampling/model overrides don't leak into shared state). let max_iterations = req .max_iterations @@ -336,6 +332,7 @@ impl InsightChatService { let mut ollama_client = self.ollama.clone(); let mut openrouter_client: Option = None; + let mut llamacpp_client: Option = None; if is_hybrid { let arc = self.openrouter.as_ref().ok_or_else(|| { @@ -356,6 +353,25 @@ impl InsightChatService { c.set_num_ctx(Some(ctx)); } openrouter_client = Some(c); + } else if is_llamacpp { + let arc = self.llamacpp.as_ref().ok_or_else(|| { + anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured") + })?; + let mut c: LlamaCppClient = (**arc).clone(); + if let Some(ref m) = custom_model { + c.primary_model = m.clone(); + } + if req.temperature.is_some() + || req.top_p.is_some() + || req.top_k.is_some() + || req.min_p.is_some() + { + c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); + } + if let Some(ctx) = req.num_ctx { + c.set_num_ctx(Some(ctx)); + } + llamacpp_client = Some(c); } else { // Local-mode model swap. Build a new client when the chat model // differs from the configured one (mirrors the agentic pattern). @@ -381,7 +397,9 @@ impl InsightChatService { } } - let chat_backend: &dyn LlmClient = if let Some(ref c) = openrouter_client { + let chat_backend: &dyn LlmClient = if let Some(ref c) = llamacpp_client { + c + } else if let Some(ref c) = openrouter_client { c } else { &ollama_client @@ -389,18 +407,19 @@ impl InsightChatService { let model_used = chat_backend.primary_model().to_string(); span.set_attribute(KeyValue::new("model", model_used.clone())); - // 5. Decide vision + tool set. In hybrid we always omit - // `describe_photo` (matches the original generation flow). In - // local we trust the stored history's first-user shape: if it - // carries `images`, the original model was vision-capable, and - // we keep `describe_photo` available. + // 5. Decide vision + tool set. In describe-then-inline modes + // (hybrid, llamacpp) we always omit `describe_photo` (matches the + // original generation flow). In local we trust the stored + // history's first-user shape: if it carries `images`, the + // original model was vision-capable, and we keep `describe_photo` + // available. let local_first_user_has_image = messages .iter() .find(|m| m.role == "user") .and_then(|m| m.images.as_ref()) .map(|imgs| !imgs.is_empty()) .unwrap_or(false); - let offer_describe_tool = !is_hybrid && local_first_user_has_image; + let offer_describe_tool = !describes_then_inlines && local_first_user_has_image; // current_gate_opts(has_vision) sets gate_opts.has_vision = has_vision // and probes the per-table presence flags. Pass `offer_describe_tool` // directly — the `!is_hybrid && local_first_user_has_image` decision @@ -799,19 +818,10 @@ impl InsightChatService { .map(|s| s.trim().to_lowercase()) .filter(|s| !s.is_empty()) .unwrap_or_else(|| stored_backend.clone()); - if !matches!(effective_backend.as_str(), "local" | "hybrid") { - bail!( - "unknown backend '{}'; expected 'local' or 'hybrid'", - effective_backend - ); - } - if stored_backend == "local" && effective_backend == "hybrid" { - bail!( - "switching from local to hybrid mid-chat isn't supported yet; \ - regenerate the insight in hybrid mode if you want OpenRouter chat" - ); - } + validate_cross_replay(&stored_backend, &effective_backend)?; let is_hybrid = effective_backend == "hybrid"; + let is_llamacpp = effective_backend == "llamacpp"; + let describes_then_inlines = is_hybrid || is_llamacpp; let max_iterations = req .max_iterations @@ -826,20 +836,21 @@ impl InsightChatService { .filter(|m| !m.is_empty()); let (chat_backend_holder, ollama_client) = - self.build_chat_clients(is_hybrid, custom_model.as_deref(), &req)?; + self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?; let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref(); let model_used = chat_backend.primary_model().to_string(); // Tool set — local mode + first user turn carries an image → - // offer describe_photo. Hybrid: visual description was inlined - // when the insight was bootstrapped, no describe tool needed. + // offer describe_photo. Describe-then-inline modes (hybrid / + // llamacpp): visual description was inlined when the insight was + // bootstrapped, no describe tool needed. let local_first_user_has_image = messages .iter() .find(|m| m.role == "user") .and_then(|m| m.images.as_ref()) .map(|imgs| !imgs.is_empty()) .unwrap_or(false); - let offer_describe_tool = !is_hybrid && local_first_user_has_image; + let offer_describe_tool = !describes_then_inlines && local_first_user_has_image; let gate_opts = self.generator.current_gate_opts_for_persona( offer_describe_tool, Some((req.user_id, &active_persona)), @@ -976,6 +987,8 @@ impl InsightChatService { .unwrap_or_else(|| "default".to_string()); let effective_backend = resolve_bootstrap_backend(req.backend.as_deref())?; let is_hybrid = effective_backend == "hybrid"; + let is_llamacpp = effective_backend == "llamacpp"; + let describes_then_inlines = is_hybrid || is_llamacpp; let max_iterations = req .max_iterations @@ -984,7 +997,7 @@ impl InsightChatService { let custom_model = req.model.clone().filter(|m| !m.is_empty()); let (chat_backend_holder, ollama_client) = - self.build_chat_clients(is_hybrid, custom_model.as_deref(), &req)?; + self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?; let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref(); let model_used = chat_backend.primary_model().to_string(); @@ -1007,21 +1020,48 @@ impl InsightChatService { _ => None, }); - // Hybrid backend: pre-describe the image via local Ollama vision - // so OpenRouter chat models (which can't see images directly) get - // the visual description as text. Mirrors the same pre-describe - // pass that `generate_agentic_insight_for_photo` does for hybrid. - let visual_block = if is_hybrid { + // Describe-then-inline backends (hybrid, llamacpp): pre-describe the + // image so a text-only chat model gets the visual description inline. + // Vision source: llamacpp's vision slot in llamacpp mode; in hybrid + // mode Ollama by default, llamacpp via `HYBRID_VISION_BACKEND=llamacpp`. + let visual_block = if describes_then_inlines { match image_base64.as_deref() { - Some(b64) => match self.ollama.describe_image(b64).await { - Ok(desc) => { - format!("Visual description (from local vision model):\n{}\n", desc) + Some(b64) => { + let use_llamacpp_vision = if is_llamacpp { + true + } else { + matches!( + std::env::var("HYBRID_VISION_BACKEND") + .ok() + .as_deref() + .map(|s| s.trim().to_lowercase()) + .as_deref(), + Some("llamacpp") + ) + }; + let described = if use_llamacpp_vision { + match self.llamacpp.as_ref() { + Some(c) => c.describe_image(b64).await, + None => { + log::warn!( + "bootstrap: requested llamacpp vision but LLAMA_SWAP_URL unset; falling back to Ollama" + ); + self.ollama.describe_image(b64).await + } + } + } else { + self.ollama.describe_image(b64).await + }; + match described { + Ok(desc) => { + format!("Visual description (from local vision model):\n{}\n", desc) + } + Err(e) => { + log::warn!("{} bootstrap: describe_image failed: {}", effective_backend, e); + String::new() + } } - Err(e) => { - log::warn!("hybrid bootstrap: local describe_image failed: {}", e); - String::new() - } - }, + } None => String::new(), } } else { @@ -1031,7 +1071,7 @@ impl InsightChatService { // Tool gates. Local + image present → expose describe_photo so // the chat model can re-look at the photo on demand. Hybrid: // already inlined, no tool needed. - let offer_describe_tool = !is_hybrid && image_base64.is_some(); + let offer_describe_tool = !describes_then_inlines && image_base64.is_some(); let gate_opts = self.generator.current_gate_opts_for_persona( offer_describe_tool, Some((req.user_id, &active_persona)), @@ -1057,7 +1097,7 @@ impl InsightChatService { ); let system_msg = ChatMessage::system(system_content); let mut user_msg = ChatMessage::user(req.user_message.clone()); - if !is_hybrid && let Some(ref img) = image_base64 { + if !describes_then_inlines && let Some(ref img) = image_base64 { user_msg.images = Some(vec![img.clone()]); } let mut messages = vec![system_msg, user_msg]; @@ -1130,19 +1170,22 @@ impl InsightChatService { Ok(()) } - /// Set up chat clients (Ollama + optional OpenRouter) shared by - /// bootstrap and continuation. Returns the chat-side backend client - /// (boxed because hybrid and local return different concrete types) - /// and the Ollama client used for describe-image / local tool calls. + /// Set up chat clients (Ollama + optional OpenRouter / LlamaCpp) shared + /// by bootstrap and continuation. Returns the chat-side backend client + /// (boxed because each backend has a different concrete type) and the + /// Ollama client used for describe-image / local tool calls. + /// + /// `effective_backend` must be one of `"local"`, `"hybrid"`, `"llamacpp"` + /// (validated upstream). fn build_chat_clients( &self, - is_hybrid: bool, + effective_backend: &str, custom_model: Option<&str>, req: &ChatTurnRequest, ) -> Result<(Box, OllamaClient)> { let mut ollama_client = self.ollama.clone(); - if is_hybrid { + if effective_backend == "hybrid" { let arc = self.openrouter.as_ref().ok_or_else(|| { anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured") })?; @@ -1163,6 +1206,27 @@ impl InsightChatService { return Ok((Box::new(c), ollama_client)); } + if effective_backend == "llamacpp" { + let arc = self.llamacpp.as_ref().ok_or_else(|| { + anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured") + })?; + let mut c: LlamaCppClient = (**arc).clone(); + if let Some(m) = custom_model { + c.primary_model = m.to_string(); + } + if req.temperature.is_some() + || req.top_p.is_some() + || req.top_k.is_some() + || req.min_p.is_some() + { + c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); + } + if let Some(ctx) = req.num_ctx { + c.set_num_ctx(Some(ctx)); + } + return Ok((Box::new(c), ollama_client)); + } + if let Some(m) = custom_model && m != self.ollama.primary_model { @@ -1459,6 +1523,49 @@ fn resolve_date_taken_for_context( .map(|dt| dt.format("%Y-%m-%d").to_string()) } +/// Validate a stored→effective backend transition for a chat continuation. +/// Continuation runs against a transcript that was generated with a specific +/// backend; some transitions break the conversation shape: +/// +/// - `local → hybrid` — the stored transcript has images embedded in the +/// first user message; the openrouter chat client surfaces them through +/// the wire, but vision-only models routed via the hybrid path may not +/// accept that shape consistently across providers. Reject to keep the +/// `regenerate-in-hybrid-mode` workflow as the supported answer. +/// - `llamacpp → hybrid` — the stored transcript already has an inlined +/// visual description produced by llama-swap's vision slot. Switching +/// to hybrid mid-conversation would mix description sources across +/// subsequent turns (any new image in the chat continuation would be +/// described by ollama-vision while the original was described by +/// llama-vision). Reject for consistency. +/// +/// All other transitions are allowed. `local ↔ llamacpp` works because +/// LlamaCppClient passes image content-parts through to the chat slot — +/// the user is responsible for picking a vision-capable chat model in +/// that case. `hybrid ↔ llamacpp` works because both transcripts are +/// text-only (visual description inlined at bootstrap). +fn validate_cross_replay(stored: &str, effective: &str) -> Result<()> { + if !matches!(effective, "local" | "hybrid" | "llamacpp") { + bail!( + "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'", + effective + ); + } + if stored == "local" && effective == "hybrid" { + bail!( + "switching from local to hybrid mid-chat isn't supported yet; \ + regenerate the insight in hybrid mode if you want OpenRouter chat" + ); + } + if stored == "llamacpp" && effective == "hybrid" { + bail!( + "switching from llamacpp to hybrid mid-chat isn't supported yet; \ + regenerate the insight in hybrid mode if you want OpenRouter chat" + ); + } + Ok(()) +} + /// Pick the backend label for bootstrap. Bootstrap has no stored insight /// to defer to (that's continuation's behaviour), so the default is /// `"local"`. Returns an error if the supplied label is non-empty but @@ -1469,8 +1576,11 @@ fn resolve_bootstrap_backend(supplied: Option<&str>) -> Result { .map(|s| s.trim().to_lowercase()) .filter(|s| !s.is_empty()) .unwrap_or_else(|| "local".to_string()); - if !matches!(lower.as_str(), "local" | "hybrid") { - bail!("unknown backend '{}'; expected 'local' or 'hybrid'", lower); + if !matches!(lower.as_str(), "local" | "hybrid" | "llamacpp") { + bail!( + "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'", + lower + ); } Ok(lower) } @@ -2074,6 +2184,10 @@ mod tests { fn bootstrap_backend_accepts_local_and_hybrid_case_insensitively() { assert_eq!(resolve_bootstrap_backend(Some("LOCAL")).unwrap(), "local"); assert_eq!(resolve_bootstrap_backend(Some("Hybrid")).unwrap(), "hybrid"); + assert_eq!( + resolve_bootstrap_backend(Some("Llamacpp")).unwrap(), + "llamacpp" + ); assert_eq!( resolve_bootstrap_backend(Some(" local ")).unwrap(), "local" @@ -2088,6 +2202,38 @@ mod tests { assert!(msg.contains("openrouter")); } + #[test] + fn cross_replay_rejects_local_to_hybrid() { + let err = validate_cross_replay("local", "hybrid").unwrap_err(); + assert!(format!("{}", err).contains("local to hybrid")); + } + + #[test] + fn cross_replay_rejects_llamacpp_to_hybrid() { + let err = validate_cross_replay("llamacpp", "hybrid").unwrap_err(); + assert!(format!("{}", err).contains("llamacpp to hybrid")); + } + + #[test] + fn cross_replay_allows_local_llamacpp_and_hybrid_llamacpp_transitions() { + // Local ↔ llamacpp: user is responsible for picking a vision-capable + // chat slot when the transcript has images. + assert!(validate_cross_replay("local", "llamacpp").is_ok()); + assert!(validate_cross_replay("llamacpp", "local").is_ok()); + // Hybrid ↔ llamacpp: both transcripts are text-only. + assert!(validate_cross_replay("hybrid", "llamacpp").is_ok()); + // Same-backend replays are always fine. + assert!(validate_cross_replay("local", "local").is_ok()); + assert!(validate_cross_replay("hybrid", "hybrid").is_ok()); + assert!(validate_cross_replay("llamacpp", "llamacpp").is_ok()); + } + + #[test] + fn cross_replay_rejects_unknown_effective() { + let err = validate_cross_replay("local", "openrouter").unwrap_err(); + assert!(format!("{}", err).contains("unknown backend")); + } + #[test] fn bootstrap_system_message_includes_path_and_persona() { let out = build_bootstrap_system_message("you are helpful", "pics/IMG.jpg", None, None, ""); diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index 2e2da33..2a11e29 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -12,6 +12,7 @@ use std::sync::{Arc, Mutex}; use crate::ai::apollo_client::{ApolloClient, ApolloPlace}; use crate::ai::llm_client::LlmClient; use crate::ai::ollama::{ChatMessage, OllamaClient, Tool}; +use crate::ai::llamacpp::LlamaCppClient; use crate::ai::openrouter::OpenRouterClient; use crate::ai::sms_client::{SmsApiClient, SmsSearchHit, SmsSearchParams}; use crate::ai::user_display_name; @@ -68,6 +69,9 @@ pub struct InsightGenerator { /// Optional OpenRouter client, used when `backend=hybrid` is requested. /// `None` when `OPENROUTER_API_KEY` is not configured. openrouter: Option>, + /// Optional llama-swap client, used when `backend=llamacpp` is requested. + /// `None` when `LLAMA_SWAP_URL` is not configured. + llamacpp: Option>, sms_client: SmsApiClient, /// Optional integration with Apollo's user-defined Places. When the /// integration is disabled (`APOLLO_API_BASE_URL` unset), every @@ -120,6 +124,7 @@ impl InsightGenerator { pub fn new( ollama: OllamaClient, openrouter: Option>, + llamacpp: Option>, sms_client: SmsApiClient, apollo_client: ApolloClient, insight_dao: Arc>>, @@ -137,6 +142,7 @@ impl InsightGenerator { Self { ollama, openrouter, + llamacpp, sms_client, apollo_client, insight_dao, @@ -3574,23 +3580,31 @@ Return ONLY the summary, nothing else."#, .map(|s| s.trim().to_lowercase()) .filter(|s| !s.is_empty()) .unwrap_or_else(|| "local".to_string()); - if !matches!(backend_label.as_str(), "local" | "hybrid") { + if !matches!(backend_label.as_str(), "local" | "hybrid" | "llamacpp") { return Err(anyhow::anyhow!( - "unknown backend '{}'; expected 'local' or 'hybrid'", + "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'", backend_label )); } span.set_attribute(KeyValue::new("backend", backend_label.clone())); let is_hybrid = backend_label == "hybrid"; + let is_llamacpp = backend_label == "llamacpp"; + // In hybrid + llamacpp modes the chat model never sees the image + // directly; we describe-then-inline locally before the agentic loop + // starts. Tracked as a single flag so vision/tool-gate logic doesn't + // have to branch twice. + let describes_then_inlines = is_hybrid || is_llamacpp; // 1b. Always build an Ollama client. In local mode it owns the chat - // loop; in hybrid mode it still handles describe_image + any - // tool-local calls (e.g. if a future tool needs embeddings). - // Sampling overrides only apply in local mode — in hybrid the - // user's params belong to the OpenRouter chat client. - let apply_sampling_to_ollama = !is_hybrid; + // loop; in hybrid/llamacpp mode it still handles tool-local calls + // (e.g. future embedding-backed tools). The chat backend is + // selected separately below. + // Sampling overrides only apply in local mode — in + // hybrid/llamacpp the user's params belong to the alternate chat + // client. + let apply_sampling_to_ollama = !describes_then_inlines; let mut ollama_client = if let Some(ref model) = custom_model - && !is_hybrid + && !describes_then_inlines { log::info!("Using custom model for agentic: {}", model); span.set_attribute(KeyValue::new("custom_model", model.clone())); @@ -3601,7 +3615,7 @@ Return ONLY the summary, nothing else."#, Some(model.clone()), ) } else { - if !is_hybrid { + if !describes_then_inlines { span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone())); } self.ollama.clone() @@ -3674,6 +3688,44 @@ Return ONLY the summary, nothing else."#, None }; + // 1d. In llamacpp mode, clone the configured LlamaCpp client and + // apply per-request overrides. Same shape as the openrouter + // branch above; describe_image will route through the vision + // slot configured on the client. + let llamacpp_client: Option = if is_llamacpp { + let arc = self.llamacpp.as_ref().ok_or_else(|| { + anyhow::anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured") + })?; + let mut c: LlamaCppClient = (**arc).clone(); + if let Some(ref m) = custom_model { + c.primary_model = m.clone(); + span.set_attribute(KeyValue::new("custom_model", m.clone())); + } + span.set_attribute(KeyValue::new("llamacpp_model", c.primary_model.clone())); + if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() { + if let Some(t) = temperature { + span.set_attribute(KeyValue::new("temperature", t as f64)); + } + if let Some(p) = top_p { + span.set_attribute(KeyValue::new("top_p", p as f64)); + } + if let Some(k) = top_k { + span.set_attribute(KeyValue::new("top_k", k as i64)); + } + if let Some(m) = min_p { + span.set_attribute(KeyValue::new("min_p", m as f64)); + } + c.set_sampling_params(temperature, top_p, top_k, min_p); + } + if let Some(ctx) = num_ctx { + span.set_attribute(KeyValue::new("num_ctx", ctx as i64)); + c.set_num_ctx(Some(ctx)); + } + Some(c) + } else { + None + }; + let insight_cx = current_cx.with_span(span); // 2. Verify chat model supports tool calling. @@ -3681,10 +3733,11 @@ Return ONLY the summary, nothing else."#, // - hybrid: trust the operator's curated allowlist // (OPENROUTER_ALLOWED_MODELS) — no live precheck. A bad model id // surfaces as a chat-call error on the next step. - let has_vision = if is_hybrid { - // In hybrid mode the chat model never sees images directly — we - // describe-then-inject, so `has_vision` drives only whether we - // bother loading the image to describe it, which we always do. + let has_vision = if describes_then_inlines { + // In hybrid + llamacpp modes the chat model never sees images + // directly — we describe-then-inject, so `has_vision` drives only + // whether we bother loading the image to describe it, which we + // always do. true } else { if let Some(ref model_name) = custom_model { @@ -3864,24 +3917,61 @@ Return ONLY the summary, nothing else."#, None }; - let hybrid_visual_description: Option = if is_hybrid { + // describe-then-inline path. In hybrid mode the vision backend + // defaults to Ollama but can be flipped to llamacpp via + // `HYBRID_VISION_BACKEND=llamacpp` (so chat goes to OpenRouter while + // vision/audio routes through llama-swap). In llamacpp mode we always + // use the llamacpp client's configured vision slot. + let inlined_visual_description: Option = if describes_then_inlines { match image_base64.as_deref() { - Some(b64) => match self.ollama.describe_image(b64).await { - Ok(desc) => { - log::info!( - "Hybrid: local vision describe succeeded ({} chars)", - desc.len() - ); - Some(desc) + Some(b64) => { + let use_llamacpp_vision = if is_llamacpp { + true + } else { + // is_hybrid branch — consult env switch + matches!( + std::env::var("HYBRID_VISION_BACKEND") + .ok() + .as_deref() + .map(|s| s.trim().to_lowercase()) + .as_deref(), + Some("llamacpp") + ) + }; + + let described = if use_llamacpp_vision { + match self.llamacpp.as_ref() { + Some(c) => c.describe_image(b64).await, + None => { + log::warn!( + "describe-then-inline: requested llamacpp vision but LLAMA_SWAP_URL is unset, falling back to Ollama" + ); + self.ollama.describe_image(b64).await + } + } + } else { + self.ollama.describe_image(b64).await + }; + + match described { + Ok(desc) => { + log::info!( + "{}: vision describe succeeded ({} chars)", + backend_label, + desc.len() + ); + Some(desc) + } + Err(e) => { + log::warn!( + "{}: vision describe failed, continuing without: {}", + backend_label, + e + ); + None + } } - Err(e) => { - log::warn!( - "Hybrid: local vision describe failed, continuing without: {}", - e - ); - None - } - }, + } None => None, } } else { @@ -3934,7 +4024,7 @@ Return ONLY the summary, nothing else."#, .map(|c| format!("Contact/Person: {}", c)) .unwrap_or_else(|| "Contact/Person: unknown".to_string()); - let visual_block = hybrid_visual_description + let visual_block = inlined_visual_description .as_deref() .map(|d| format!("Visual description (from local vision model):\n{}\n\n", d)) .unwrap_or_default(); @@ -3954,25 +4044,28 @@ Return ONLY the summary, nothing else."#, ); // 10. Define tools. Gate flags computed from current data presence; - // hybrid mode omits describe_photo since the chat model receives - // the visual description inline (so we pass `false` for has_vision - // in hybrid mode regardless of the model's actual capability). - let gate_opts = self.current_gate_opts(has_vision && !is_hybrid); + // describe-then-inline modes (hybrid, llamacpp) omit describe_photo + // since the chat model receives the visual description inline (so + // we pass `false` for has_vision in those modes regardless of the + // model's actual capability). + let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines); let tools = Self::build_tool_definitions(gate_opts); - // 11. Build initial messages. In hybrid mode images are never - // attached to the wire message — the description is part of - // `user_content`. + // 11. Build initial messages. In describe-then-inline modes images + // are never attached to the wire message — the description is part + // of `user_content`. let system_msg = ChatMessage::system(system_content); let mut user_msg = ChatMessage::user(user_content); - if !is_hybrid && let Some(ref img) = image_base64 { + if !describes_then_inlines && let Some(ref img) = image_base64 { user_msg.images = Some(vec![img.clone()]); } let mut messages = vec![system_msg, user_msg]; // 12. Agentic loop — dispatch through the selected backend. - let chat_backend: &dyn LlmClient = if let Some(ref or_c) = openrouter_client { + let chat_backend: &dyn LlmClient = if let Some(ref lc_c) = llamacpp_client { + lc_c + } else if let Some(ref or_c) = openrouter_client { or_c } else { &ollama_client diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs new file mode 100644 index 0000000..100020c --- /dev/null +++ b/src/ai/llamacpp.rs @@ -0,0 +1,978 @@ +// LlamaCppClient — talks to a llama-swap proxy that fronts one or more +// llama-server processes. llama-swap exposes an OpenAI-compatible HTTP +// surface (`/v1/chat/completions`, `/v1/embeddings`, `/v1/models`), so the +// wire translation mirrors `OpenRouterClient` almost exactly. +// +// Differences from OpenRouter: +// - No bearer auth or attribution headers; llama-swap is LAN-only. +// - Three model slots (`primary_model` = chat, `vision_model`, `embedding_model`) +// each map to a model id in the llama-swap config. `describe_image` and +// `generate_embeddings` issue requests with the appropriate slot id in the +// `model` field, which is how llama-swap selects which backend process to +// run. +// - `/v1/models` returns only the configured slot ids — capabilities aren't +// reported by the API, so `vision_models` is a config-time allowlist (env +// `LLAMA_SWAP_VISION_MODELS`) used to set `has_vision` on responses. +// `has_tool_calling` is assumed true for every slot, since llama-swap entries +// default to launching llama-server with `--jinja`. +// +// First consumer lands alongside the three-way backend dispatch in +// insight_generator / insight_chat. +#![allow(dead_code)] + +use anyhow::{Context, Result, anyhow, bail}; +use async_trait::async_trait; +use reqwest::Client; +use serde::Deserialize; +use serde_json::{Value, json}; +use std::time::Duration; + +use crate::ai::llm_client::{ + ChatMessage, LlmClient, LlmStreamEvent, ModelCapabilities, Tool, ToolCall, ToolCallFunction, +}; +use futures::stream::{BoxStream, StreamExt}; + +const DEFAULT_BASE_URL: &str = "http://localhost:9292/v1"; +const DEFAULT_PRIMARY_MODEL: &str = "chat"; +const DEFAULT_VISION_MODEL: &str = "vision"; +const DEFAULT_EMBEDDING_MODEL: &str = "embed"; +const DEFAULT_REQUEST_TIMEOUT_SECS: u64 = 180; + +/// OpenAI-compatible client targeting a llama-swap proxy in front of one or +/// more llama-server processes. See the module doc-comment for the slot model. +#[derive(Clone)] +pub struct LlamaCppClient { + client: Client, + pub base_url: String, + /// Chat model slot id (e.g. `"chat"`). Used for `generate` / + /// `chat_with_tools` / `chat_with_tools_stream`. + pub primary_model: String, + /// Embedding model slot id (e.g. `"embed"`). Used for + /// `generate_embeddings`. + pub embedding_model: String, + /// Vision model slot id (e.g. `"vision"`). Used for `describe_image` and + /// included in `vision_models` automatically so capability lookups for + /// the default vision slot report `has_vision = true` even when the env + /// allowlist is empty. + pub vision_model: String, + /// Operator-curated set of slot ids known to be multimodal. Drives the + /// `has_vision` field in `list_models` / `model_capabilities`, since + /// llama-swap's `/v1/models` doesn't report modality. Empty allowlist + /// still marks `vision_model` as vision-capable. + pub vision_models: Vec, + num_ctx: Option, + temperature: Option, + top_p: Option, + top_k: Option, + min_p: Option, +} + +impl LlamaCppClient { + pub fn new(base_url: Option, primary_model: Option) -> Self { + let timeout_secs = std::env::var("LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS") + .ok() + .and_then(|v| v.parse::().ok()) + .unwrap_or(DEFAULT_REQUEST_TIMEOUT_SECS); + Self { + client: Client::builder() + .connect_timeout(Duration::from_secs(10)) + .timeout(Duration::from_secs(timeout_secs)) + .build() + .unwrap_or_else(|_| Client::new()), + base_url: base_url.unwrap_or_else(|| DEFAULT_BASE_URL.to_string()), + primary_model: primary_model.unwrap_or_else(|| DEFAULT_PRIMARY_MODEL.to_string()), + embedding_model: DEFAULT_EMBEDDING_MODEL.to_string(), + vision_model: DEFAULT_VISION_MODEL.to_string(), + vision_models: Vec::new(), + num_ctx: None, + temperature: None, + top_p: None, + top_k: None, + min_p: None, + } + } + + pub fn set_embedding_model(&mut self, model: String) { + self.embedding_model = model; + } + + pub fn set_vision_model(&mut self, model: String) { + self.vision_model = model; + } + + pub fn set_vision_models(&mut self, models: Vec) { + self.vision_models = models; + } + + pub fn set_num_ctx(&mut self, num_ctx: Option) { + self.num_ctx = num_ctx; + } + + pub fn set_sampling_params( + &mut self, + temperature: Option, + top_p: Option, + top_k: Option, + min_p: Option, + ) { + self.temperature = temperature; + self.top_p = top_p; + self.top_k = top_k; + self.min_p = min_p; + } + + /// Translate canonical messages to the OpenAI-compatible wire shape. + /// Behaviorally identical to `OpenRouterClient::messages_to_openai` — + /// stringify tool-call arguments, rewrite images into content-parts, attach + /// `tool_call_id` to `role=tool` messages based on the preceding assistant + /// turn's tool calls. + fn messages_to_openai(messages: &[ChatMessage]) -> Vec { + let mut out = Vec::with_capacity(messages.len()); + let mut last_tool_call_ids: Vec = Vec::new(); + let mut next_tool_result_idx: usize = 0; + + for msg in messages { + let mut obj = serde_json::Map::new(); + obj.insert("role".into(), Value::String(msg.role.clone())); + + match &msg.images { + Some(images) if !images.is_empty() => { + let mut parts: Vec = Vec::new(); + if !msg.content.is_empty() { + parts.push(json!({"type": "text", "text": msg.content})); + } + for img in images { + let url = image_to_data_url(img); + parts.push(json!({ + "type": "image_url", + "image_url": { "url": url } + })); + } + obj.insert("content".into(), Value::Array(parts)); + } + _ => { + obj.insert("content".into(), Value::String(msg.content.clone())); + } + } + + if let Some(tcs) = &msg.tool_calls + && msg.role == "assistant" + { + let converted: Vec = tcs + .iter() + .enumerate() + .map(|(i, call)| { + let id = call.id.clone().unwrap_or_else(|| format!("call_{}", i)); + let args_str = serde_json::to_string(&call.function.arguments) + .unwrap_or_else(|_| "{}".to_string()); + json!({ + "id": id, + "type": "function", + "function": { + "name": call.function.name, + "arguments": args_str, + } + }) + }) + .collect(); + last_tool_call_ids = converted + .iter() + .filter_map(|v| v.get("id").and_then(|x| x.as_str()).map(String::from)) + .collect(); + next_tool_result_idx = 0; + obj.insert("tool_calls".into(), Value::Array(converted)); + } + + if msg.role == "tool" { + let id = last_tool_call_ids + .get(next_tool_result_idx) + .cloned() + .unwrap_or_else(|| "call_0".to_string()); + obj.insert("tool_call_id".into(), Value::String(id)); + next_tool_result_idx += 1; + } + + out.push(Value::Object(obj)); + } + + out + } + + /// Parse an OpenAI-compatible assistant message back into canonical shape. + /// llama.cpp emits `reasoning_content` on thinking models; we drop it for + /// parity with OpenRouter (which also strips upstream reasoning fields). + fn openai_message_to_chat(msg: &Value) -> Result { + let obj = msg + .as_object() + .ok_or_else(|| anyhow!("response message is not an object"))?; + let role = obj + .get("role") + .and_then(|v| v.as_str()) + .unwrap_or("assistant") + .to_string(); + let content = obj + .get("content") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + + let tool_calls = if let Some(tcs) = obj.get("tool_calls").and_then(|v| v.as_array()) { + let mut parsed = Vec::with_capacity(tcs.len()); + for tc in tcs { + let id = tc.get("id").and_then(|v| v.as_str()).map(String::from); + let function = tc + .get("function") + .ok_or_else(|| anyhow!("tool_call missing function field"))?; + let name = function + .get("name") + .and_then(|v| v.as_str()) + .unwrap_or_default() + .to_string(); + let args_value = match function.get("arguments") { + Some(Value::String(s)) => { + serde_json::from_str::(s).unwrap_or_else(|_| json!({})) + } + Some(v @ Value::Object(_)) => v.clone(), + _ => json!({}), + }; + parsed.push(ToolCall { + id, + function: ToolCallFunction { + name, + arguments: args_value, + }, + }); + } + Some(parsed) + } else { + None + }; + + Ok(ChatMessage { + role, + content, + tool_calls, + images: None, + }) + } + + fn build_options(&self) -> Vec<(&'static str, Value)> { + let mut v = Vec::new(); + if let Some(t) = self.temperature { + v.push(("temperature", json!(t))); + } + if let Some(p) = self.top_p { + v.push(("top_p", json!(p))); + } + if let Some(k) = self.top_k { + v.push(("top_k", json!(k))); + } + if let Some(m) = self.min_p { + v.push(("min_p", json!(m))); + } + // num_ctx isn't an OpenAI param; llama-server bakes ctx in at launch + // via -c, so we silently drop the override here. The config.yaml + // entry is the source of truth for context size. + let _ = self.num_ctx; + v + } + + /// Issue a chat request with an explicit model id override. Used by + /// `describe_image` to route through the vision slot without mutating + /// `self.primary_model`. + async fn chat_completion_with_model( + &self, + model: &str, + messages: Vec, + tools: Vec, + ) -> Result<(ChatMessage, Option, Option)> { + let url = format!("{}/chat/completions", self.base_url); + let mut body = serde_json::Map::new(); + body.insert("model".into(), Value::String(model.to_string())); + body.insert( + "messages".into(), + Value::Array(Self::messages_to_openai(&messages)), + ); + body.insert("stream".into(), Value::Bool(false)); + if !tools.is_empty() { + body.insert( + "tools".into(), + serde_json::to_value(&tools).context("serializing tools")?, + ); + } + for (k, v) in self.build_options() { + body.insert(k.into(), v); + } + + let resp = self + .client + .post(&url) + .json(&Value::Object(body)) + .send() + .await + .with_context(|| format!("POST {} failed", url))?; + + if !resp.status().is_success() { + let status = resp.status(); + let body = resp.text().await.unwrap_or_default(); + bail!("llama-swap chat request failed: {} — {}", status, body); + } + + let parsed: Value = resp.json().await.context("parsing chat response")?; + let choice = parsed + .get("choices") + .and_then(|v| v.as_array()) + .and_then(|a| a.first()) + .ok_or_else(|| { + anyhow!( + "response missing choices[0]: {}", + extract_error_detail(&parsed) + ) + })?; + let msg = choice.get("message").ok_or_else(|| { + anyhow!( + "choices[0] missing message: {}", + extract_error_detail(&parsed) + ) + })?; + let chat_msg = Self::openai_message_to_chat(msg)?; + + let usage = parsed.get("usage"); + let prompt_tokens = usage + .and_then(|u| u.get("prompt_tokens")) + .and_then(|v| v.as_i64()) + .map(|n| n as i32); + let completion_tokens = usage + .and_then(|u| u.get("completion_tokens")) + .and_then(|v| v.as_i64()) + .map(|n| n as i32); + + Ok((chat_msg, prompt_tokens, completion_tokens)) + } +} + +#[async_trait] +impl LlmClient for LlamaCppClient { + async fn generate( + &self, + prompt: &str, + system: Option<&str>, + images: Option>, + ) -> Result { + let mut messages: Vec = Vec::new(); + if let Some(sys) = system { + messages.push(ChatMessage::system(sys)); + } + let mut user = ChatMessage::user(prompt); + user.images = images; + messages.push(user); + + let (reply, _, _) = self.chat_with_tools(messages, Vec::new()).await?; + Ok(reply.content) + } + + async fn chat_with_tools( + &self, + messages: Vec, + tools: Vec, + ) -> Result<(ChatMessage, Option, Option)> { + log::info!( + "llama-swap chat_with_tools: model={} messages={} tools={}", + self.primary_model, + messages.len(), + tools.len() + ); + self.chat_completion_with_model(&self.primary_model.clone(), messages, tools) + .await + } + + async fn chat_with_tools_stream( + &self, + messages: Vec, + tools: Vec, + ) -> Result>> { + let url = format!("{}/chat/completions", self.base_url); + let mut body = serde_json::Map::new(); + body.insert( + "model".into(), + Value::String(self.primary_model.clone()), + ); + body.insert( + "messages".into(), + Value::Array(Self::messages_to_openai(&messages)), + ); + body.insert("stream".into(), Value::Bool(true)); + body.insert( + "stream_options".into(), + serde_json::json!({ "include_usage": true }), + ); + if !tools.is_empty() { + body.insert( + "tools".into(), + serde_json::to_value(&tools).context("serializing tools")?, + ); + } + for (k, v) in self.build_options() { + body.insert(k.into(), v); + } + + let resp = self + .client + .post(&url) + .json(&Value::Object(body)) + .send() + .await + .with_context(|| format!("POST {} failed", url))?; + + if !resp.status().is_success() { + let status = resp.status(); + let body = resp.text().await.unwrap_or_default(); + bail!("llama-swap stream request failed: {} — {}", status, body); + } + + let byte_stream = resp.bytes_stream(); + let stream = async_stream::stream! { + let mut byte_stream = byte_stream; + let mut buf: Vec = Vec::new(); + let mut accumulated_content = String::new(); + let mut tool_state: std::collections::BTreeMap< + usize, + (Option, Option, String), + > = std::collections::BTreeMap::new(); + let mut role = "assistant".to_string(); + let mut prompt_tokens: Option = None; + let mut completion_tokens: Option = None; + let mut done_seen = false; + + while let Some(chunk) = byte_stream.next().await { + let chunk = match chunk { + Ok(b) => b, + Err(e) => { + yield Err(anyhow!("stream read failed: {}", e)); + return; + } + }; + buf.extend_from_slice(&chunk); + + while let Some(sep) = find_double_newline(&buf) { + let frame = buf.drain(..sep + 2).collect::>(); + let frame_str = match std::str::from_utf8(&frame) { + Ok(s) => s, + Err(_) => continue, + }; + for line in frame_str.lines() { + let line = line.trim_end_matches('\r'); + let payload = match line.strip_prefix("data: ") { + Some(p) => p, + None => continue, + }; + if payload == "[DONE]" { + done_seen = true; + break; + } + let v: Value = match serde_json::from_str(payload) { + Ok(v) => v, + Err(e) => { + log::warn!( + "malformed llama-swap SSE frame: {} ({})", + payload, + e + ); + continue; + } + }; + + if let Some(usage) = v.get("usage") { + prompt_tokens = usage + .get("prompt_tokens") + .and_then(|n| n.as_i64()) + .map(|n| n as i32); + completion_tokens = usage + .get("completion_tokens") + .and_then(|n| n.as_i64()) + .map(|n| n as i32); + } + + let Some(choices) = v.get("choices").and_then(|c| c.as_array()) + else { + continue; + }; + let Some(choice) = choices.first() else { continue }; + let delta = match choice.get("delta") { + Some(d) => d, + None => continue, + }; + if let Some(r) = delta.get("role").and_then(|v| v.as_str()) { + role = r.to_string(); + } + if let Some(content) = + delta.get("content").and_then(|v| v.as_str()) + && !content.is_empty() + { + accumulated_content.push_str(content); + yield Ok(LlmStreamEvent::TextDelta(content.to_string())); + } + if let Some(tcs) = delta.get("tool_calls").and_then(|v| v.as_array()) { + for tc_delta in tcs { + let idx = tc_delta + .get("index") + .and_then(|n| n.as_u64()) + .unwrap_or(0) as usize; + let entry = tool_state + .entry(idx) + .or_insert((None, None, String::new())); + if let Some(id) = + tc_delta.get("id").and_then(|v| v.as_str()) + { + entry.0 = Some(id.to_string()); + } + if let Some(func) = tc_delta.get("function") { + if let Some(name) = + func.get("name").and_then(|v| v.as_str()) + { + entry.1 = Some(name.to_string()); + } + if let Some(args) = + func.get("arguments").and_then(|v| v.as_str()) + { + entry.2.push_str(args); + } + } + } + } + } + if done_seen { + break; + } + } + if done_seen { + break; + } + } + + let tool_calls: Option> = if tool_state.is_empty() { + None + } else { + let mut v = Vec::with_capacity(tool_state.len()); + for (_idx, (id, name, args)) in tool_state { + let arguments: Value = if args.trim().is_empty() { + Value::Object(Default::default()) + } else { + serde_json::from_str(&args).unwrap_or_else(|_| { + Value::Object(Default::default()) + }) + }; + v.push(ToolCall { + id, + function: ToolCallFunction { + name: name.unwrap_or_default(), + arguments, + }, + }); + } + Some(v) + }; + + let message = ChatMessage { + role, + content: accumulated_content, + tool_calls, + images: None, + }; + yield Ok(LlmStreamEvent::Done { + message, + prompt_eval_count: prompt_tokens, + eval_count: completion_tokens, + }); + }; + + Ok(Box::pin(stream)) + } + + async fn generate_embeddings(&self, texts: &[&str]) -> Result>> { + let url = format!("{}/embeddings", self.base_url); + let body = json!({ + "model": self.embedding_model, + "input": texts, + }); + + let resp = self + .client + .post(&url) + .json(&body) + .send() + .await + .with_context(|| format!("POST {} failed", url))?; + + if !resp.status().is_success() { + let status = resp.status(); + let body = resp.text().await.unwrap_or_default(); + bail!("llama-swap embedding request failed: {} — {}", status, body); + } + + #[derive(Deserialize)] + struct EmbedResponse { + data: Vec, + } + #[derive(Deserialize)] + struct EmbedItem { + embedding: Vec, + } + + let parsed: EmbedResponse = resp.json().await.context("parsing embed response")?; + Ok(parsed.data.into_iter().map(|i| i.embedding).collect()) + } + + async fn describe_image(&self, image_base64: &str) -> Result { + let prompt = "Briefly describe what you see in this image in 1-2 sentences. \ + Focus on the people, location, and activity."; + let system = "You are a scene description assistant. Be concise and factual."; + + let messages = vec![ + ChatMessage::system(system), + ChatMessage { + role: "user".to_string(), + content: prompt.to_string(), + tool_calls: None, + images: Some(vec![image_base64.to_string()]), + }, + ]; + + let (reply, _, _) = self + .chat_completion_with_model(&self.vision_model.clone(), messages, Vec::new()) + .await?; + Ok(reply.content) + } + + async fn list_models(&self) -> Result> { + let url = format!("{}/models", self.base_url); + let resp = self + .client + .get(&url) + .send() + .await + .with_context(|| format!("GET {} failed", url))?; + + if !resp.status().is_success() { + let status = resp.status(); + let body = resp.text().await.unwrap_or_default(); + bail!("llama-swap list_models failed: {} — {}", status, body); + } + + let parsed: Value = resp.json().await.context("parsing models response")?; + let data = parsed + .get("data") + .and_then(|v| v.as_array()) + .ok_or_else(|| anyhow!("models response missing data[]"))?; + + let caps: Vec = data + .iter() + .map(|m| self.parse_model_capabilities(m)) + .collect(); + + Ok(caps) + } + + async fn model_capabilities(&self, model: &str) -> Result { + let all = self.list_models().await?; + all.into_iter() + .find(|m| m.name == model) + .ok_or_else(|| anyhow!("model '{}' not found on llama-swap", model)) + } + + fn primary_model(&self) -> &str { + &self.primary_model + } +} + +impl LlamaCppClient { + fn parse_model_capabilities(&self, m: &Value) -> ModelCapabilities { + let name = m + .get("id") + .and_then(|v| v.as_str()) + .unwrap_or_default() + .to_string(); + let has_vision = name == self.vision_model || self.vision_models.iter().any(|v| v == &name); + // Tool calling is the default for llama-swap entries we configure + // (--jinja flag); no negative-list mechanism yet, so report true. + ModelCapabilities { + name, + has_vision, + has_tool_calling: true, + } + } +} + +/// Extract a diagnostic fragment from a llama-swap / llama-server response +/// that doesn't match the expected `{choices: [...]}` shape. llama-server +/// returns errors as `{"error": {"message": "...", "code": N, "type": "..."}}`; +/// llama-swap itself sometimes wraps subprocess failures with its own +/// `{"error": "..."}` flat shape. Surface either when present, otherwise fall +/// back to a truncated raw-JSON view. +fn extract_error_detail(parsed: &Value) -> String { + if let Some(err) = parsed.get("error") { + match err { + Value::Object(_) => { + let message = err + .get("message") + .and_then(|v| v.as_str()) + .unwrap_or("(no message)"); + let code = err + .get("code") + .map(|v| match v { + Value::String(s) => s.clone(), + other => other.to_string(), + }) + .unwrap_or_else(|| "?".to_string()); + let short_message: String = message.chars().take(240).collect(); + return format!("error code={} message=\"{}\"", code, short_message); + } + Value::String(s) => { + let short: String = s.chars().take(240).collect(); + return format!("error=\"{}\"", short); + } + _ => {} + } + } + let raw = parsed.to_string(); + raw.chars().take(300).collect() +} + +fn find_double_newline(buf: &[u8]) -> Option { + for i in 0..buf.len().saturating_sub(1) { + if buf[i] == b'\n' && buf[i + 1] == b'\n' { + return Some(i); + } + if i + 3 < buf.len() + && buf[i] == b'\r' + && buf[i + 1] == b'\n' + && buf[i + 2] == b'\r' + && buf[i + 3] == b'\n' + { + return Some(i + 1); + } + } + None +} + +fn image_to_data_url(img: &str) -> String { + if img.starts_with("data:") { + img.to_string() + } else { + format!("data:image/jpeg;base64,{}", img) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn tool_call_arguments_stringified_on_send() { + let msg = ChatMessage { + role: "assistant".into(), + content: String::new(), + tool_calls: Some(vec![ToolCall { + id: Some("call_abc".into()), + function: ToolCallFunction { + name: "search_sms".into(), + arguments: json!({"query": "hello", "limit": 5}), + }, + }]), + images: None, + }; + + let wire = LlamaCppClient::messages_to_openai(&[msg]); + let tcs = wire[0] + .get("tool_calls") + .and_then(|v| v.as_array()) + .expect("tool_calls present"); + let args = tcs[0] + .get("function") + .and_then(|f| f.get("arguments")) + .and_then(|a| a.as_str()) + .expect("arguments stringified"); + let parsed: Value = serde_json::from_str(args).unwrap(); + assert_eq!(parsed["query"], "hello"); + assert_eq!(parsed["limit"], 5); + } + + #[test] + fn tool_call_arguments_parsed_on_receive() { + let response_msg = json!({ + "role": "assistant", + "content": "", + "tool_calls": [{ + "id": "call_xyz", + "type": "function", + "function": { + "name": "get_weather", + "arguments": "{\"city\":\"Boston\",\"units\":\"celsius\"}" + } + }] + }); + let parsed = LlamaCppClient::openai_message_to_chat(&response_msg).unwrap(); + let tcs = parsed.tool_calls.unwrap(); + assert_eq!(tcs.len(), 1); + assert_eq!(tcs[0].function.name, "get_weather"); + assert_eq!(tcs[0].function.arguments["city"], "Boston"); + assert_eq!(tcs[0].function.arguments["units"], "celsius"); + assert_eq!(tcs[0].id.as_deref(), Some("call_xyz")); + } + + #[test] + fn tool_call_arguments_accept_native_json_on_receive() { + // Some llama.cpp builds emit arguments as a JSON object directly when + // jinja's tool-output strict-string rule isn't applied — accept both. + let response_msg = json!({ + "role": "assistant", + "content": "", + "tool_calls": [{ + "id": "call_1", + "type": "function", + "function": { + "name": "foo", + "arguments": {"nested": {"k": 1}} + } + }] + }); + let parsed = LlamaCppClient::openai_message_to_chat(&response_msg).unwrap(); + let tc = &parsed.tool_calls.unwrap()[0]; + assert_eq!(tc.function.arguments["nested"]["k"], 1); + } + + #[test] + fn images_become_content_parts() { + let mut msg = ChatMessage::user("What is in this photo?"); + msg.images = Some(vec!["BASE64DATA".into()]); + + let wire = LlamaCppClient::messages_to_openai(&[msg]); + let content = wire[0].get("content").and_then(|v| v.as_array()).unwrap(); + assert_eq!(content.len(), 2); + assert_eq!(content[0]["type"], "text"); + assert_eq!(content[0]["text"], "What is in this photo?"); + assert_eq!(content[1]["type"], "image_url"); + assert_eq!( + content[1]["image_url"]["url"], + "data:image/jpeg;base64,BASE64DATA" + ); + } + + #[test] + fn data_url_images_pass_through_unchanged() { + let mut msg = ChatMessage::user(""); + msg.images = Some(vec!["data:image/png;base64,ABCDEF".into()]); + let wire = LlamaCppClient::messages_to_openai(&[msg]); + let content = wire[0].get("content").and_then(|v| v.as_array()).unwrap(); + assert_eq!(content.len(), 1); + assert_eq!( + content[0]["image_url"]["url"], + "data:image/png;base64,ABCDEF" + ); + } + + #[test] + fn text_only_message_stays_string() { + let msg = ChatMessage::user("hello"); + let wire = LlamaCppClient::messages_to_openai(&[msg]); + assert_eq!(wire[0]["content"], "hello"); + assert!(wire[0]["content"].as_str().is_some()); + } + + #[test] + fn tool_result_inherits_tool_call_id_from_prior_assistant() { + let assistant = ChatMessage { + role: "assistant".into(), + content: String::new(), + tool_calls: Some(vec![ToolCall { + id: Some("call_42".into()), + function: ToolCallFunction { + name: "lookup".into(), + arguments: json!({}), + }, + }]), + images: None, + }; + let tool_result = ChatMessage::tool_result("found it"); + + let wire = LlamaCppClient::messages_to_openai(&[assistant, tool_result]); + assert_eq!(wire[1]["role"], "tool"); + assert_eq!(wire[1]["tool_call_id"], "call_42"); + } + + #[test] + fn multiple_tool_results_map_to_sequential_call_ids() { + let assistant = ChatMessage { + role: "assistant".into(), + content: String::new(), + tool_calls: Some(vec![ + ToolCall { + id: Some("call_A".into()), + function: ToolCallFunction { + name: "a".into(), + arguments: json!({}), + }, + }, + ToolCall { + id: Some("call_B".into()), + function: ToolCallFunction { + name: "b".into(), + arguments: json!({}), + }, + }, + ]), + images: None, + }; + let r1 = ChatMessage::tool_result("a result"); + let r2 = ChatMessage::tool_result("b result"); + + let wire = LlamaCppClient::messages_to_openai(&[assistant, r1, r2]); + assert_eq!(wire[1]["tool_call_id"], "call_A"); + assert_eq!(wire[2]["tool_call_id"], "call_B"); + } + + #[test] + fn missing_tool_call_id_gets_synthetic_fallback() { + let assistant = ChatMessage { + role: "assistant".into(), + content: String::new(), + tool_calls: Some(vec![ToolCall { + id: None, + function: ToolCallFunction { + name: "noid".into(), + arguments: json!({}), + }, + }]), + images: None, + }; + let wire = LlamaCppClient::messages_to_openai(&[assistant]); + let tcs = wire[0] + .get("tool_calls") + .and_then(|v| v.as_array()) + .unwrap(); + assert_eq!(tcs[0]["id"], "call_0"); + } + + #[test] + fn capability_inference_uses_vision_model_and_allowlist() { + let mut c = LlamaCppClient::new(None, Some("chat".into())); + c.set_vision_model("vision".into()); + c.set_vision_models(vec!["qwen-vl".into()]); + + let m_chat = json!({ "id": "chat" }); + let m_vision = json!({ "id": "vision" }); + let m_qwen = json!({ "id": "qwen-vl" }); + let m_other = json!({ "id": "embed" }); + + let chat = c.parse_model_capabilities(&m_chat); + let vision = c.parse_model_capabilities(&m_vision); + let qwen = c.parse_model_capabilities(&m_qwen); + let other = c.parse_model_capabilities(&m_other); + + assert!(!chat.has_vision); + assert!(chat.has_tool_calling); + assert!(vision.has_vision); + assert!(qwen.has_vision); + assert!(!other.has_vision); + } +} diff --git a/src/ai/mod.rs b/src/ai/mod.rs index 3468325..204da04 100644 --- a/src/ai/mod.rs +++ b/src/ai/mod.rs @@ -5,6 +5,7 @@ pub mod face_client; pub mod handlers; pub mod insight_chat; pub mod insight_generator; +pub mod llamacpp; pub mod llm_client; pub mod ollama; pub mod openrouter; @@ -20,7 +21,8 @@ pub use handlers::{ chat_history_handler, chat_rewind_handler, chat_stream_handler, chat_turn_handler, delete_insight_handler, export_training_data_handler, generate_agentic_insight_handler, generate_insight_handler, get_all_insights_handler, get_available_models_handler, - get_insight_handler, get_openrouter_models_handler, rate_insight_handler, + get_insight_handler, get_llamacpp_models_handler, get_openrouter_models_handler, + rate_insight_handler, }; pub use insight_generator::InsightGenerator; #[allow(unused_imports)] diff --git a/src/bin/populate_knowledge.rs b/src/bin/populate_knowledge.rs index 29945d7..71f2f8a 100644 --- a/src/bin/populate_knowledge.rs +++ b/src/bin/populate_knowledge.rs @@ -195,6 +195,7 @@ async fn main() -> anyhow::Result<()> { let generator = InsightGenerator::new( ollama, None, + None, sms_client, apollo_client, insight_dao.clone(), diff --git a/src/main.rs b/src/main.rs index 63013ce..3bd3656 100644 --- a/src/main.rs +++ b/src/main.rs @@ -313,6 +313,7 @@ fn main() -> std::io::Result<()> { .service(ai::get_all_insights_handler) .service(ai::get_available_models_handler) .service(ai::get_openrouter_models_handler) + .service(ai::get_llamacpp_models_handler) .service(ai::chat_turn_handler) .service(ai::chat_stream_handler) .service(ai::chat_history_handler) diff --git a/src/state.rs b/src/state.rs index 8f1bd4e..96d1c22 100644 --- a/src/state.rs +++ b/src/state.rs @@ -2,6 +2,7 @@ use crate::ai::apollo_client::ApolloClient; use crate::ai::clip_client::ClipClient; use crate::ai::face_client::FaceClient; use crate::ai::insight_chat::{ChatLockMap, InsightChatService}; +use crate::ai::llamacpp::LlamaCppClient; use crate::ai::openrouter::OpenRouterClient; use crate::ai::{InsightGenerator, OllamaClient, SmsApiClient}; use crate::database::{ @@ -62,6 +63,16 @@ pub struct AppState { /// Curated list of OpenRouter model ids exposed to clients. Sourced from /// `OPENROUTER_ALLOWED_MODELS` (comma-separated). Empty when unset. pub openrouter_allowed_models: Vec, + /// `None` when `LLAMA_SWAP_URL` is not configured. Consulted only when a + /// request explicitly opts into `backend=llamacpp`. Same shape as the + /// `openrouter` slot — present here so handlers can route to it without + /// threading through the generator. + #[allow(dead_code)] + pub llamacpp: Option>, + /// Curated list of llama-swap model ids exposed to clients. Sourced from + /// `LLAMA_SWAP_ALLOWED_MODELS` (comma-separated). Empty when unset; the + /// server then falls back to `LLAMA_SWAP_PRIMARY_MODEL`. + pub llamacpp_allowed_models: Vec, pub sms_client: SmsApiClient, pub insight_generator: InsightGenerator, /// Chat continuation service. Hold an Arc so handlers can clone cheaply. @@ -105,6 +116,8 @@ impl AppState { ollama: OllamaClient, openrouter: Option>, openrouter_allowed_models: Vec, + llamacpp: Option>, + llamacpp_allowed_models: Vec, sms_client: SmsApiClient, insight_generator: InsightGenerator, insight_chat: Arc, @@ -145,6 +158,8 @@ impl AppState { ollama, openrouter, openrouter_allowed_models, + llamacpp, + llamacpp_allowed_models, sms_client, insight_generator, insight_chat, @@ -186,6 +201,9 @@ impl Default for AppState { let openrouter = build_openrouter_from_env(); let openrouter_allowed_models = parse_openrouter_allowed_models(); + let llamacpp = build_llamacpp_from_env(); + let llamacpp_allowed_models = parse_llamacpp_allowed_models(); + let sms_api_url = env::var("SMS_API_URL").unwrap_or_else(|_| "http://localhost:8000".to_string()); let sms_api_token = env::var("SMS_API_TOKEN").ok(); @@ -250,6 +268,7 @@ impl Default for AppState { let insight_generator = InsightGenerator::new( ollama.clone(), openrouter.clone(), + llamacpp.clone(), sms_client.clone(), apollo_client.clone(), insight_dao.clone(), @@ -273,6 +292,7 @@ impl Default for AppState { Arc::new(insight_generator.clone()), ollama.clone(), openrouter.clone(), + llamacpp.clone(), insight_dao.clone(), chat_locks, )); @@ -294,6 +314,8 @@ impl Default for AppState { ollama, openrouter, openrouter_allowed_models, + llamacpp, + llamacpp_allowed_models, sms_client, insight_generator, insight_chat, @@ -335,6 +357,50 @@ fn parse_openrouter_allowed_models() -> Vec { .collect() } +/// Build a `LlamaCppClient` from environment variables. Returns `None` when +/// `LLAMA_SWAP_URL` is unset (the llamacpp backend is then unavailable and +/// requests for it return a clear error). The slot ids default to the +/// names the bundled `llama-swap/config.yaml` uses — `chat` / `vision` / +/// `embed` — so a minimal deploy only needs to set `LLAMA_SWAP_URL`. +fn build_llamacpp_from_env() -> Option> { + let base_url = env::var("LLAMA_SWAP_URL").ok()?; + let primary_model = env::var("LLAMA_SWAP_PRIMARY_MODEL").ok(); + let mut client = LlamaCppClient::new(Some(base_url), primary_model); + if let Ok(model) = env::var("LLAMA_SWAP_EMBEDDING_MODEL") { + client.set_embedding_model(model); + } + if let Ok(model) = env::var("LLAMA_SWAP_VISION_MODEL") { + client.set_vision_model(model); + } + client.set_vision_models(parse_llamacpp_vision_models()); + Some(Arc::new(client)) +} + +/// Parse `LLAMA_SWAP_ALLOWED_MODELS` (comma-separated) into a vec. Used to +/// drive `/insights/llamacpp/models`; empty when unset. +fn parse_llamacpp_allowed_models() -> Vec { + env::var("LLAMA_SWAP_ALLOWED_MODELS") + .unwrap_or_default() + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect() +} + +/// Parse `LLAMA_SWAP_VISION_MODELS` (comma-separated) — slot ids that report +/// `has_vision = true` in capability lookups. The configured `vision_model` +/// (default `vision`) is always considered vision-capable regardless of this +/// list, so a deploy that only uses the default vision slot can leave it +/// unset. +fn parse_llamacpp_vision_models() -> Vec { + env::var("LLAMA_SWAP_VISION_MODELS") + .unwrap_or_default() + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect() +} + #[cfg(test)] impl AppState { /// Creates an AppState instance for testing with temporary directories @@ -397,6 +463,7 @@ impl AppState { let insight_generator = InsightGenerator::new( ollama.clone(), None, + None, sms_client.clone(), apollo_client.clone(), insight_dao.clone(), @@ -418,6 +485,7 @@ impl AppState { Arc::new(insight_generator.clone()), ollama.clone(), None, + None, insight_dao.clone(), chat_locks, )); @@ -445,6 +513,8 @@ impl AppState { ollama, None, Vec::new(), + None, + Vec::new(), sms_client, insight_generator, insight_chat, -- 2.49.1 From d14df63f196d80b73c9135621ab842af7b2e45f7 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Wed, 20 May 2026 17:54:08 -0400 Subject: [PATCH 02/11] env.example: document LLAMA_SWAP_* + HYBRID_VISION_BACKEND vars Mirrors the section added to CLAUDE.md so deploys can opt into the llamacpp backend from the template alone. --- .env.example | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/.env.example b/.env.example index 718f6bd..63d8436 100644 --- a/.env.example +++ b/.env.example @@ -53,6 +53,27 @@ AGENTIC_CHAT_MAX_ITERATIONS=6 # OPENROUTER_HTTP_REFERER=https://your-site.example # OPENROUTER_APP_TITLE=ImageApi +# ── AI Insights — llama.cpp / llama-swap (optional) ───────────────────── +# Set LLAMA_SWAP_URL to enable the `llamacpp` chat_backend. Talks +# OpenAI-compatible /v1 to a llama-swap proxy that fronts per-slot +# llama-server instances (chat / vision / embed). Like hybrid, the +# agentic loop describes images via the vision slot then inlines the +# text into the chat slot — so the chat slot itself can be text-only. +# LLAMA_SWAP_URL=http://localhost:9292/v1 +# LLAMA_SWAP_PRIMARY_MODEL=chat +# LLAMA_SWAP_VISION_MODEL=vision +# LLAMA_SWAP_EMBEDDING_MODEL=embed +# Comma-separated allowlist of model ids the /v1/models endpoint should +# advertise as vision-capable (llama-swap doesn't report modality). +# LLAMA_SWAP_VISION_MODELS=vision +# Comma-separated allowlist surfaced by /insights/llamacpp/models. +# LLAMA_SWAP_ALLOWED_MODELS=chat,vision,embed +# LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=120 +# Routes hybrid mode's vision-describe pass through llama-swap's vision +# slot instead of Ollama (chat still goes to OpenRouter). Values: +# `ollama` (default) | `llamacpp`. +# HYBRID_VISION_BACKEND=ollama + # ── AI Insights — sibling services (optional) ─────────────────────────── # Apollo (places, face inference, CLIP encoders). Single-Apollo deploys # typically set only APOLLO_API_BASE_URL and let the face + CLIP -- 2.49.1 From be51421b3879253bea74e27bc48ce659f419ba2c Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Thu, 21 May 2026 11:36:58 -0400 Subject: [PATCH 03/11] ai: collapse llamacpp into LLM_BACKEND env switch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reverts the per-request backend="llamacpp" value. Chat/vision/embedding backend is now a deploy-time decision (LLM_BACKEND=ollama|llamacpp), applied globally across chat, vision describe, and embeddings — so embedding vectors stay in one space across the index. - Per-request backend whitelist back to "local"|"hybrid". A request arriving with backend="llamacpp" is rejected. - LLM_BACKEND=llamacpp swaps the entire local stack to llama-swap: chat hits the chat slot, describe hits the vision slot, embeddings hit the embed slot. Hybrid mode still routes chat to OpenRouter but uses LLM_BACKEND for the describe pass. - Drops env vars HYBRID_VISION_BACKEND, LLAMA_SWAP_VISION_MODELS, EMBEDDING_BACKEND (the last never shipped). Drops the LlamaCppClient.vision_models allowlist — capability inference now reports has_vision only for the configured vision_model slot. - Drops the /insights/llamacpp/models handler. /insights/models is the single endpoint; returns Ollama servers under LLM_BACKEND=ollama and llama-swap slots (from LLAMA_SWAP_ALLOWED_MODELS) under LLM_BACKEND=llamacpp. Same envelope shape either way. - New ai::embed_one helper routes embeddings through llama-swap when LLM_BACKEND=llamacpp (else Ollama). Wires it into the four insight_generator embedding sites. - Cross-replay matrix simplifies to pre-llamacpp shape (local↔local, hybrid↔hybrid, hybrid→local allowed; local→hybrid rejected). --- .env.example | 32 ++++--- CLAUDE.md | 96 +++++++++++--------- src/ai/handlers.rs | 64 +++++++------- src/ai/insight_chat.rs | 171 ++++++++++++++---------------------- src/ai/insight_generator.rs | 127 +++++++++++++------------- src/ai/llamacpp.rs | 33 ++----- src/ai/mod.rs | 88 ++++++++++++++++++- src/main.rs | 1 - src/state.rs | 27 ++---- 9 files changed, 338 insertions(+), 301 deletions(-) diff --git a/.env.example b/.env.example index 63d8436..81fab4e 100644 --- a/.env.example +++ b/.env.example @@ -53,26 +53,30 @@ AGENTIC_CHAT_MAX_ITERATIONS=6 # OPENROUTER_HTTP_REFERER=https://your-site.example # OPENROUTER_APP_TITLE=ImageApi +# ── AI Insights — local backend switch ────────────────────────────────── +# Picks which local LLM stack the server uses for chat, vision describe, +# and embeddings. `ollama` (default) uses the OLLAMA_* settings above; +# `llamacpp` uses the LLAMA_SWAP_* settings below. The switch is global +# and applies to both `backend=local` and `backend=hybrid` (hybrid keeps +# chat on OpenRouter but still uses this stack for the describe pass). +# Don't flip mid-deploy without re-embedding existing index rows — +# mixed vector spaces break similarity search. +# LLM_BACKEND=ollama + # ── AI Insights — llama.cpp / llama-swap (optional) ───────────────────── -# Set LLAMA_SWAP_URL to enable the `llamacpp` chat_backend. Talks -# OpenAI-compatible /v1 to a llama-swap proxy that fronts per-slot -# llama-server instances (chat / vision / embed). Like hybrid, the -# agentic loop describes images via the vision slot then inlines the -# text into the chat slot — so the chat slot itself can be text-only. +# Set LLAMA_SWAP_URL plus LLM_BACKEND=llamacpp to swap the local stack +# off Ollama. Talks OpenAI-compatible /v1 to a llama-swap proxy fronting +# per-slot llama-server instances (chat / vision / embed). The chat slot +# is treated as text-only — images are pre-described via the vision slot +# and inlined into the prompt. # LLAMA_SWAP_URL=http://localhost:9292/v1 # LLAMA_SWAP_PRIMARY_MODEL=chat # LLAMA_SWAP_VISION_MODEL=vision # LLAMA_SWAP_EMBEDDING_MODEL=embed -# Comma-separated allowlist of model ids the /v1/models endpoint should -# advertise as vision-capable (llama-swap doesn't report modality). -# LLAMA_SWAP_VISION_MODELS=vision -# Comma-separated allowlist surfaced by /insights/llamacpp/models. +# Comma-separated allowlist surfaced by /insights/models when +# LLM_BACKEND=llamacpp. # LLAMA_SWAP_ALLOWED_MODELS=chat,vision,embed -# LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=120 -# Routes hybrid mode's vision-describe pass through llama-swap's vision -# slot instead of Ollama (chat still goes to OpenRouter). Values: -# `ollama` (default) | `llamacpp`. -# HYBRID_VISION_BACKEND=ollama +# LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180 # ── AI Insights — sibling services (optional) ─────────────────────────── # Apollo (places, face inference, CLIP encoders). Single-Apollo deploys diff --git a/CLAUDE.md b/CLAUDE.md index d3419e6..d06b29f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -473,9 +473,8 @@ GET /memories?path=...&recursive=true POST /insights/generate (non-agentic single-shot) POST /insights/generate/agentic (tool-calling loop; body: { file_path, backend?, model?, ... }) GET /insights?path=...&library=... -GET /insights/models (local Ollama models + capabilities) +GET /insights/models (local-backend models + capabilities; Ollama OR llama-swap based on LLM_BACKEND) GET /insights/openrouter/models (curated OpenRouter allowlist) -GET /insights/llamacpp/models (curated llama-swap slot allowlist) POST /insights/rate (thumbs up/down for training data) // Insight Chat Continuation @@ -632,22 +631,27 @@ OPENROUTER_EMBEDDING_MODEL=openai/text-embedding-3-small # Optional, embeddings OPENROUTER_HTTP_REFERER=https://your-site.example # Optional attribution header OPENROUTER_APP_TITLE=ImageApi # Optional attribution header -# llama.cpp / llama-swap (Llamacpp Backend) - sibling to Ollama; OpenAI-compatible +# Local LLM backend switch. `ollama` (default) keeps the OLLAMA_* settings +# above; `llamacpp` swaps the entire local stack (chat + vision describe + +# embeddings) over to llama-swap. The switch is global and applies to +# `backend=local` requests and to `backend=hybrid`'s describe pass (hybrid +# chat still goes to OpenRouter). Don't flip mid-deploy without +# re-embedding — mixed vector spaces break similarity search. +LLM_BACKEND=ollama + +# llama.cpp / llama-swap (used when LLM_BACKEND=llamacpp). OpenAI-compatible # proxy hosting one or more llama-server processes (chat / vision / embed slots). -LLAMA_SWAP_URL=http://localhost:9292/v1 # Required to enable llamacpp backend +LLAMA_SWAP_URL=http://localhost:9292/v1 # Required when LLM_BACKEND=llamacpp LLAMA_SWAP_PRIMARY_MODEL=chat # Chat slot id (matches config.yaml) -LLAMA_SWAP_VISION_MODEL=vision # Vision slot id; describe_image routes here -LLAMA_SWAP_EMBEDDING_MODEL=embed # Embedding slot id (when local embeddings via llamacpp) -LLAMA_SWAP_VISION_MODELS=qwen-vl,llava # Comma-separated slot ids known to have vision. - # Drives `has_vision` in /insights/llamacpp/models. - # `LLAMA_SWAP_VISION_MODEL` is auto-included. -LLAMA_SWAP_ALLOWED_MODELS=chat,coder # Curated allowlist exposed to clients via - # GET /insights/llamacpp/models. Empty = no picker. +LLAMA_SWAP_VISION_MODEL=vision # Vision slot id; describe_image routes here. + # The only slot reported as has_vision=true in + # /insights/models — chat slots are treated as + # text-only (images pre-described and inlined). +LLAMA_SWAP_EMBEDDING_MODEL=embed # Embedding slot id +LLAMA_SWAP_ALLOWED_MODELS=chat,coder # Curated allowlist surfaced by GET /insights/models + # when LLM_BACKEND=llamacpp. Empty = picker shows + # only the configured primary model. LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180 # Per-request timeout; bump for slow CPU offload -HYBRID_VISION_BACKEND=llamacpp # Optional override for hybrid mode's describe_image: - # `ollama` (default) or `llamacpp`. When `llamacpp`, - # hybrid still routes chat to OpenRouter but uses - # llama-swap's vision slot to describe images. # Insight Chat Continuation AGENTIC_CHAT_MAX_ITERATIONS=6 # Cap on tool-calling iterations per chat turn (default 6) @@ -668,13 +672,36 @@ The `OllamaClient` provides methods to query available models: This allows runtime verification of model availability before generating insights. +**Local backend switch (`LLM_BACKEND`):** + +One env var decides which "local" stack the server runs against — `ollama` +(default) or `llamacpp`. It's global on purpose: chat, vision describe, and +embeddings all route through the same backend, so the embedding-vector +column in SQLite stays in one vector space. Don't flip mid-deploy without +re-embedding the affected rows — similarity search will collapse. + +- `LLM_BACKEND=ollama`: chat and embeddings use Ollama; vision describe + uses Ollama's multimodal model. +- `LLM_BACKEND=llamacpp`: chat hits llama-swap's `chat` slot (which is + treated as text-only — images are pre-described via the `vision` slot + and inlined), embeddings hit the `embed` slot, vision describe hits the + `vision` slot. Requires `LLAMA_SWAP_URL`. + +The per-request `backend=hybrid` override is orthogonal: it always sends +chat to OpenRouter, but the describe pass still routes through whichever +`LLM_BACKEND` is configured. + +`GET /insights/models` returns the local-backend models with capabilities +in the same envelope shape regardless of `LLM_BACKEND`: Ollama servers +when `ollama`, llama-swap slots (from `LLAMA_SWAP_ALLOWED_MODELS`) when +`llamacpp`. No `/insights/llamacpp/models` — the picker reads a single +endpoint. + **Hybrid Backend (OpenRouter):** - Per-request opt-in via `backend=hybrid` on `POST /insights/generate/agentic`. - Vision describe happens before the agentic loop; the description is inlined - into the chat prompt and the agentic loop runs on OpenRouter. By default - vision uses local Ollama, but `HYBRID_VISION_BACKEND=llamacpp` flips it to - llama-swap's vision slot (useful when you want chat on a frontier model and - vision on a local-but-not-Ollama path). + into the chat prompt and the agentic loop runs on OpenRouter. Vision + routes through whichever `LLM_BACKEND` is configured. - `request.model` (if provided) overrides `OPENROUTER_DEFAULT_MODEL` for that call. The mobile picker reads from `OPENROUTER_ALLOWED_MODELS`. - No live capability precheck — the operator-curated allowlist is trusted. @@ -682,29 +709,14 @@ This allows runtime verification of model availability before generating insight - `GET /insights/openrouter/models` returns `{ models, default_model, configured }` for client picker UIs. -**Llamacpp Backend (llama-swap):** -- Per-request opt-in via `backend=llamacpp` on `POST /insights/generate/agentic`. -- Sibling to Ollama: a local OpenAI-compatible proxy (mostlygeek/llama-swap) - fronting one or more `llama-server` processes. The chat slot is text-only - by default; vision and embeddings have their own slots (`LLAMA_SWAP_VISION_MODEL`, - `LLAMA_SWAP_EMBEDDING_MODEL`) that llama-swap routes to by model id. The - bundled `docker-compose.yml` + `llama-swap/config.yaml` in the opencode root - is the reference deploy. -- Operates in the same describe-then-inline shape as hybrid: the chat model - never sees raw images. Vision describe routes through llama-swap's vision - slot (`describe_image` on `LlamaCppClient`). -- `request.model` (if provided) overrides `LLAMA_SWAP_PRIMARY_MODEL` for that - call (must match a slot id in llama-swap's `config.yaml`). The mobile picker - reads from `LLAMA_SWAP_ALLOWED_MODELS`. -- No live capability precheck — slot ids are trusted. Tool calling is assumed - for every slot (llama-swap entries typically launch with `--jinja`). -- `GET /insights/llamacpp/models` returns `{ models, default_model, configured }`. -- Cross-replay matrix (chat continuation): `local ↔ llamacpp` allowed (the - LlamaCppClient passes images through to the chat slot — you're responsible - for a vision-capable slot if the stored transcript carries images); - `hybrid ↔ llamacpp` allowed (both transcripts are text-only); `local → - hybrid` and `llamacpp → hybrid` rejected (mid-conversation description - source change isn't supported). +**Cross-replay matrix (chat continuation):** +- `local → local` allowed (whether served by Ollama or llama-swap; that's + a deploy-time decision, not a request-time one). +- `hybrid → hybrid` allowed. +- `hybrid → local` allowed (the inlined description replays as text). +- `local → hybrid` rejected — the stored transcript has raw images in the + first user message and OpenRouter providers don't accept that shape + consistently. Regenerate the insight in hybrid mode instead. **Insight Chat Continuation:** diff --git a/src/ai/handlers.rs b/src/ai/handlers.rs index 4809b25..c86a5c8 100644 --- a/src/ai/handlers.rs +++ b/src/ai/handlers.rs @@ -470,7 +470,16 @@ pub async fn generate_agentic_insight_handler( } } -/// GET /insights/models - List available models from both servers with capabilities +/// GET /insights/models - Local-backend models with capabilities. Returns +/// Ollama servers when `LLM_BACKEND=ollama` (default), or llama-swap slots +/// when `LLM_BACKEND=llamacpp`. Same envelope shape either way so the +/// client picker doesn't have to branch on backend kind. +/// +/// For llama-swap: `models` comes verbatim from `LLAMA_SWAP_ALLOWED_MODELS` +/// (no live `/v1/models` probe), `has_vision` is true only for the +/// configured `LLAMA_SWAP_VISION_MODEL` slot id, and `has_tool_calling` is +/// reported as true for every slot (llama-server is launched with `--jinja` +/// by convention — a misconfigured slot surfaces as a chat-call error). #[get("/insights/models")] pub async fn get_available_models_handler( _claims: Claims, @@ -478,6 +487,29 @@ pub async fn get_available_models_handler( ) -> impl Responder { log::debug!("Fetching available models with capabilities"); + if crate::ai::local_backend_is_llamacpp() + && let Some(lc) = app_state.llamacpp.as_ref() + { + let models: Vec = app_state + .llamacpp_allowed_models + .iter() + .map(|name| ModelCapabilities { + name: name.clone(), + has_vision: name == &lc.vision_model, + has_tool_calling: true, + }) + .collect(); + let primary = ServerModels { + url: lc.base_url.clone(), + models, + default_model: lc.primary_model.clone(), + }; + return HttpResponse::Ok().json(AvailableModelsResponse { + primary, + fallback: None, + }); + } + let ollama_client = &app_state.ollama; // Fetch models with capabilities from primary server @@ -549,36 +581,6 @@ pub async fn get_openrouter_models_handler( HttpResponse::Ok().json(response) } -#[derive(serde::Serialize)] -pub struct LlamaCppModelsResponse { - pub models: Vec, - pub default_model: Option, - pub configured: bool, -} - -/// GET /insights/llamacpp/models - Curated llama-swap model ids exposed -/// to clients for the llamacpp backend. Returned verbatim from -/// `LLAMA_SWAP_ALLOWED_MODELS`; no live call to llama-swap. Use -/// `LLAMA_SWAP_URL` plus `LLAMA_SWAP_PRIMARY_MODEL` on the server side to -/// pick the actual chat slot. -#[get("/insights/llamacpp/models")] -pub async fn get_llamacpp_models_handler( - _claims: Claims, - app_state: web::Data, -) -> impl Responder { - let configured = app_state.llamacpp.is_some(); - let default_model = app_state - .llamacpp - .as_ref() - .map(|c| c.primary_model.clone()); - let response = LlamaCppModelsResponse { - models: app_state.llamacpp_allowed_models.clone(), - default_model, - configured, - }; - HttpResponse::Ok().json(response) -} - /// POST /insights/rate - Rate an insight (thumbs up/down for training data) #[post("/insights/rate")] pub async fn rate_insight_handler( diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index 4e87d52..b549dab 100644 --- a/src/ai/insight_chat.rs +++ b/src/ai/insight_chat.rs @@ -309,14 +309,15 @@ impl InsightChatService { .unwrap_or_else(|| stored_backend.clone()); validate_cross_replay(&stored_backend, &effective_backend)?; let is_hybrid = effective_backend == "hybrid"; - let is_llamacpp = effective_backend == "llamacpp"; - let describes_then_inlines = is_hybrid || is_llamacpp; + let local_via_llamacpp = + crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); + let describes_then_inlines = is_hybrid || local_via_llamacpp; span.set_attribute(KeyValue::new("backend", effective_backend.clone())); - // 4. Build the chat backend client. Ollama in local mode, a freshly - // cloned OpenRouter client in hybrid mode, a freshly cloned - // LlamaCppClient in llamacpp mode (clone so per-request - // sampling/model overrides don't leak into shared state). + // 4. Build the chat backend client. Hybrid → OpenRouter; local with + // `LLM_BACKEND=llamacpp` → llama-swap; otherwise Ollama. Clones + // so per-request sampling/model overrides don't leak into shared + // state. let max_iterations = req .max_iterations .unwrap_or(DEFAULT_MAX_ITERATIONS) @@ -353,9 +354,9 @@ impl InsightChatService { c.set_num_ctx(Some(ctx)); } openrouter_client = Some(c); - } else if is_llamacpp { + } else if local_via_llamacpp { let arc = self.llamacpp.as_ref().ok_or_else(|| { - anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured") + anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured") })?; let mut c: LlamaCppClient = (**arc).clone(); if let Some(ref m) = custom_model { @@ -373,8 +374,8 @@ impl InsightChatService { } llamacpp_client = Some(c); } else { - // Local-mode model swap. Build a new client when the chat model - // differs from the configured one (mirrors the agentic pattern). + // Pure local (Ollama): model swap. Build a new client when the + // chat model differs from the configured one. if let Some(ref m) = custom_model && m != &self.ollama.primary_model { @@ -820,8 +821,9 @@ impl InsightChatService { .unwrap_or_else(|| stored_backend.clone()); validate_cross_replay(&stored_backend, &effective_backend)?; let is_hybrid = effective_backend == "hybrid"; - let is_llamacpp = effective_backend == "llamacpp"; - let describes_then_inlines = is_hybrid || is_llamacpp; + let local_via_llamacpp = + crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); + let describes_then_inlines = is_hybrid || local_via_llamacpp; let max_iterations = req .max_iterations @@ -841,9 +843,9 @@ impl InsightChatService { let model_used = chat_backend.primary_model().to_string(); // Tool set — local mode + first user turn carries an image → - // offer describe_photo. Describe-then-inline modes (hybrid / - // llamacpp): visual description was inlined when the insight was - // bootstrapped, no describe tool needed. + // offer describe_photo. Describe-then-inline modes (hybrid OR + // local_via_llamacpp): visual description was inlined when the + // insight was bootstrapped, no describe tool needed. let local_first_user_has_image = messages .iter() .find(|m| m.role == "user") @@ -987,8 +989,9 @@ impl InsightChatService { .unwrap_or_else(|| "default".to_string()); let effective_backend = resolve_bootstrap_backend(req.backend.as_deref())?; let is_hybrid = effective_backend == "hybrid"; - let is_llamacpp = effective_backend == "llamacpp"; - let describes_then_inlines = is_hybrid || is_llamacpp; + let local_via_llamacpp = + crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); + let describes_then_inlines = is_hybrid || local_via_llamacpp; let max_iterations = req .max_iterations @@ -1020,35 +1023,19 @@ impl InsightChatService { _ => None, }); - // Describe-then-inline backends (hybrid, llamacpp): pre-describe the - // image so a text-only chat model gets the visual description inline. - // Vision source: llamacpp's vision slot in llamacpp mode; in hybrid - // mode Ollama by default, llamacpp via `HYBRID_VISION_BACKEND=llamacpp`. + // Describe-then-inline (hybrid OR local_via_llamacpp): pre-describe + // the image so a text-only chat model gets the visual description + // inline. Vision source follows `LLM_BACKEND`: llama-swap when + // `local_via_llamacpp`, else Ollama. let visual_block = if describes_then_inlines { match image_base64.as_deref() { Some(b64) => { - let use_llamacpp_vision = if is_llamacpp { - true - } else { - matches!( - std::env::var("HYBRID_VISION_BACKEND") - .ok() - .as_deref() - .map(|s| s.trim().to_lowercase()) - .as_deref(), - Some("llamacpp") - ) - }; - let described = if use_llamacpp_vision { - match self.llamacpp.as_ref() { - Some(c) => c.describe_image(b64).await, - None => { - log::warn!( - "bootstrap: requested llamacpp vision but LLAMA_SWAP_URL unset; falling back to Ollama" - ); - self.ollama.describe_image(b64).await - } - } + let described = if local_via_llamacpp { + self.llamacpp + .as_ref() + .expect("local_via_llamacpp guarantees Some") + .describe_image(b64) + .await } else { self.ollama.describe_image(b64).await }; @@ -1175,8 +1162,11 @@ impl InsightChatService { /// (boxed because each backend has a different concrete type) and the /// Ollama client used for describe-image / local tool calls. /// - /// `effective_backend` must be one of `"local"`, `"hybrid"`, `"llamacpp"` - /// (validated upstream). + /// `effective_backend` must be one of `"local"` or `"hybrid"` (validated + /// upstream). Hybrid → OpenRouter; local with `LLM_BACKEND=llamacpp` → + /// llama-swap; pure local → Ollama. Returns the dispatched chat client + /// plus the (possibly per-request) Ollama client that the caller uses + /// for non-chat helpers (image describe in non-llamacpp mode, tool ops). fn build_chat_clients( &self, effective_backend: &str, @@ -1206,10 +1196,10 @@ impl InsightChatService { return Ok((Box::new(c), ollama_client)); } - if effective_backend == "llamacpp" { - let arc = self.llamacpp.as_ref().ok_or_else(|| { - anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured") - })?; + // Local mode — env switch decides between Ollama and llama-swap. + if crate::ai::local_backend_is_llamacpp() + && let Some(arc) = self.llamacpp.as_ref() + { let mut c: LlamaCppClient = (**arc).clone(); if let Some(m) = custom_model { c.primary_model = m.to_string(); @@ -1525,41 +1515,26 @@ fn resolve_date_taken_for_context( /// Validate a stored→effective backend transition for a chat continuation. /// Continuation runs against a transcript that was generated with a specific -/// backend; some transitions break the conversation shape: +/// backend; the only blocked transition is `local → hybrid`, because the +/// stored transcript has images embedded in the first user message and the +/// hybrid path (OpenRouter chat with describe-then-inline) can't replay +/// raw image bytes through OpenRouter consistently across providers. +/// `hybrid → local` is allowed (the inlined description replays verbatim +/// as text). /// -/// - `local → hybrid` — the stored transcript has images embedded in the -/// first user message; the openrouter chat client surfaces them through -/// the wire, but vision-only models routed via the hybrid path may not -/// accept that shape consistently across providers. Reject to keep the -/// `regenerate-in-hybrid-mode` workflow as the supported answer. -/// - `llamacpp → hybrid` — the stored transcript already has an inlined -/// visual description produced by llama-swap's vision slot. Switching -/// to hybrid mid-conversation would mix description sources across -/// subsequent turns (any new image in the chat continuation would be -/// described by ollama-vision while the original was described by -/// llama-vision). Reject for consistency. -/// -/// All other transitions are allowed. `local ↔ llamacpp` works because -/// LlamaCppClient passes image content-parts through to the chat slot — -/// the user is responsible for picking a vision-capable chat model in -/// that case. `hybrid ↔ llamacpp` works because both transcripts are -/// text-only (visual description inlined at bootstrap). +/// Whether "local" routes through Ollama or llama-swap is decided at +/// startup by `LLM_BACKEND`; both share the same transcript shape from +/// the chat-replay perspective. fn validate_cross_replay(stored: &str, effective: &str) -> Result<()> { - if !matches!(effective, "local" | "hybrid" | "llamacpp") { + if !matches!(effective, "local" | "hybrid") { bail!( - "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'", + "unknown backend '{}'; expected 'local' or 'hybrid'", effective ); } if stored == "local" && effective == "hybrid" { bail!( - "switching from local to hybrid mid-chat isn't supported yet; \ - regenerate the insight in hybrid mode if you want OpenRouter chat" - ); - } - if stored == "llamacpp" && effective == "hybrid" { - bail!( - "switching from llamacpp to hybrid mid-chat isn't supported yet; \ + "switching from local to hybrid mid-chat isn't supported; \ regenerate the insight in hybrid mode if you want OpenRouter chat" ); } @@ -1576,9 +1551,9 @@ fn resolve_bootstrap_backend(supplied: Option<&str>) -> Result { .map(|s| s.trim().to_lowercase()) .filter(|s| !s.is_empty()) .unwrap_or_else(|| "local".to_string()); - if !matches!(lower.as_str(), "local" | "hybrid" | "llamacpp") { + if !matches!(lower.as_str(), "local" | "hybrid") { bail!( - "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'", + "unknown backend '{}'; expected 'local' or 'hybrid'", lower ); } @@ -2184,10 +2159,6 @@ mod tests { fn bootstrap_backend_accepts_local_and_hybrid_case_insensitively() { assert_eq!(resolve_bootstrap_backend(Some("LOCAL")).unwrap(), "local"); assert_eq!(resolve_bootstrap_backend(Some("Hybrid")).unwrap(), "hybrid"); - assert_eq!( - resolve_bootstrap_backend(Some("Llamacpp")).unwrap(), - "llamacpp" - ); assert_eq!( resolve_bootstrap_backend(Some(" local ")).unwrap(), "local" @@ -2196,10 +2167,13 @@ mod tests { #[test] fn bootstrap_backend_rejects_unknown_label() { - let err = resolve_bootstrap_backend(Some("openrouter")).unwrap_err(); - let msg = format!("{}", err); - assert!(msg.contains("unknown backend")); - assert!(msg.contains("openrouter")); + // `llamacpp` is no longer a per-request backend value — it's chosen + // at deploy time via `LLM_BACKEND`. + for label in &["openrouter", "llamacpp", "ollama"] { + let err = resolve_bootstrap_backend(Some(label)).unwrap_err(); + let msg = format!("{}", err); + assert!(msg.contains("unknown backend"), "label={}", label); + } } #[test] @@ -2209,29 +2183,20 @@ mod tests { } #[test] - fn cross_replay_rejects_llamacpp_to_hybrid() { - let err = validate_cross_replay("llamacpp", "hybrid").unwrap_err(); - assert!(format!("{}", err).contains("llamacpp to hybrid")); - } - - #[test] - fn cross_replay_allows_local_llamacpp_and_hybrid_llamacpp_transitions() { - // Local ↔ llamacpp: user is responsible for picking a vision-capable - // chat slot when the transcript has images. - assert!(validate_cross_replay("local", "llamacpp").is_ok()); - assert!(validate_cross_replay("llamacpp", "local").is_ok()); - // Hybrid ↔ llamacpp: both transcripts are text-only. - assert!(validate_cross_replay("hybrid", "llamacpp").is_ok()); - // Same-backend replays are always fine. + fn cross_replay_allows_supported_transitions() { assert!(validate_cross_replay("local", "local").is_ok()); assert!(validate_cross_replay("hybrid", "hybrid").is_ok()); - assert!(validate_cross_replay("llamacpp", "llamacpp").is_ok()); + // Hybrid → local replays the inlined description as plain text. + assert!(validate_cross_replay("hybrid", "local").is_ok()); } #[test] fn cross_replay_rejects_unknown_effective() { - let err = validate_cross_replay("local", "openrouter").unwrap_err(); - assert!(format!("{}", err).contains("unknown backend")); + // Both "openrouter" and the former "llamacpp" value are unknown now. + for label in &["openrouter", "llamacpp"] { + let err = validate_cross_replay("local", label).unwrap_err(); + assert!(format!("{}", err).contains("unknown backend"), "label={}", label); + } } #[test] diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index 2a11e29..08fcfef 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -471,8 +471,11 @@ impl InsightGenerator { log::info!("RAG QUERY: {}", query); log::info!("========================================"); - // Generate embedding for the query - let query_embedding = self.ollama.generate_embedding(&query).await?; + // Generate embedding for the query via the configured local backend + // (`LLM_BACKEND` switch). Must match the backend that populated the + // daily-summary embeddings or similarity search will be garbage. + let query_embedding = + crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &query).await?; // Search for similar daily summaries with time-based weighting // This prioritizes summaries temporally close to the query date @@ -563,7 +566,7 @@ impl InsightGenerator { let calendar_cx = parent_cx.with_span(span); let query_embedding = if let Some(loc) = location { - match self.ollama.generate_embedding(loc).await { + match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), loc).await { Ok(emb) => Some(emb), Err(e) => { log::warn!("Failed to generate embedding for location '{}': {}", loc, e); @@ -734,16 +737,17 @@ impl InsightGenerator { ) }; - let query_embedding = match self.ollama.generate_embedding(&query_text).await { - Ok(emb) => emb, - Err(e) => { - log::warn!("Failed to generate search embedding: {}", e); - search_cx.span().set_status(Status::Error { - description: e.to_string().into(), - }); - return Ok(None); - } - }; + let query_embedding = + match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &query_text).await { + Ok(emb) => emb, + Err(e) => { + log::warn!("Failed to generate search embedding: {}", e); + search_cx.span().set_status(Status::Error { + description: e.to_string().into(), + }); + return Ok(None); + } + }; let searches = { let mut dao = self @@ -2608,11 +2612,13 @@ Return ONLY the summary, nothing else."#, } } - /// Tool: store_entity — upsert an entity into the knowledge memory + /// Tool: store_entity — upsert an entity into the knowledge memory. + /// Embeddings go through the configured local backend (`LLM_BACKEND`), + /// independent of the per-request chat backend in the caller. async fn tool_store_entity( &self, args: &serde_json::Value, - ollama: &OllamaClient, + _ollama: &OllamaClient, cx: &opentelemetry::Context, ) -> String { use crate::database::models::InsertEntity; @@ -2672,9 +2678,16 @@ Return ONLY the summary, nothing else."#, .collect() }; - // Generate embedding for name + description (best-effort) + // Generate embedding for name + description (best-effort) via the + // configured local backend. let embed_text = format!("{} {}", name, description); - let embedding: Option> = match ollama.generate_embedding(&embed_text).await { + let embedding: Option> = match crate::ai::embed_one( + &self.ollama, + self.llamacpp.as_deref(), + &embed_text, + ) + .await + { Ok(vec) => { let bytes: Vec = vec.iter().flat_map(|f| f.to_le_bytes()).collect(); Some(bytes) @@ -3580,20 +3593,24 @@ Return ONLY the summary, nothing else."#, .map(|s| s.trim().to_lowercase()) .filter(|s| !s.is_empty()) .unwrap_or_else(|| "local".to_string()); - if !matches!(backend_label.as_str(), "local" | "hybrid" | "llamacpp") { + if !matches!(backend_label.as_str(), "local" | "hybrid") { return Err(anyhow::anyhow!( - "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'", + "unknown backend '{}'; expected 'local' or 'hybrid'", backend_label )); } span.set_attribute(KeyValue::new("backend", backend_label.clone())); let is_hybrid = backend_label == "hybrid"; - let is_llamacpp = backend_label == "llamacpp"; - // In hybrid + llamacpp modes the chat model never sees the image - // directly; we describe-then-inline locally before the agentic loop - // starts. Tracked as a single flag so vision/tool-gate logic doesn't - // have to branch twice. - let describes_then_inlines = is_hybrid || is_llamacpp; + // `LLM_BACKEND=llamacpp` swaps Ollama out for llama-swap as the + // "local" stack — chat + vision describe + embeddings all route + // through llama-swap. In hybrid mode this still applies to vision + // describe (chat continues to go to OpenRouter). The chat slot is + // text-only in either case, so we describe-then-inline. + let local_via_llamacpp = + crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); + // Describe-then-inline: hybrid (chat is OpenRouter, text-only) or + // any path where chat goes through llama-swap (chat slot is text-only). + let describes_then_inlines = is_hybrid || local_via_llamacpp; // 1b. Always build an Ollama client. In local mode it owns the chat // loop; in hybrid/llamacpp mode it still handles tool-local calls @@ -3688,13 +3705,14 @@ Return ONLY the summary, nothing else."#, None }; - // 1d. In llamacpp mode, clone the configured LlamaCpp client and - // apply per-request overrides. Same shape as the openrouter - // branch above; describe_image will route through the vision - // slot configured on the client. - let llamacpp_client: Option = if is_llamacpp { + // 1d. When `LLM_BACKEND=llamacpp` and we're in local mode (not + // hybrid — hybrid keeps chat on OpenRouter), clone the llamacpp + // client and apply per-request overrides. Same shape as the + // openrouter branch above; describe_image will route through + // the vision slot configured on the client. + let llamacpp_client: Option = if local_via_llamacpp && !is_hybrid { let arc = self.llamacpp.as_ref().ok_or_else(|| { - anyhow::anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured") + anyhow::anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured") })?; let mut c: LlamaCppClient = (**arc).clone(); if let Some(ref m) = custom_model { @@ -3917,38 +3935,19 @@ Return ONLY the summary, nothing else."#, None }; - // describe-then-inline path. In hybrid mode the vision backend - // defaults to Ollama but can be flipped to llamacpp via - // `HYBRID_VISION_BACKEND=llamacpp` (so chat goes to OpenRouter while - // vision/audio routes through llama-swap). In llamacpp mode we always - // use the llamacpp client's configured vision slot. + // describe-then-inline path. Vision describe routes through whichever + // `LLM_BACKEND` is configured — llama-swap when `local_via_llamacpp` + // is set (even in hybrid mode, since chat is OpenRouter but vision + // stays on the local stack), otherwise Ollama. let inlined_visual_description: Option = if describes_then_inlines { match image_base64.as_deref() { Some(b64) => { - let use_llamacpp_vision = if is_llamacpp { - true - } else { - // is_hybrid branch — consult env switch - matches!( - std::env::var("HYBRID_VISION_BACKEND") - .ok() - .as_deref() - .map(|s| s.trim().to_lowercase()) - .as_deref(), - Some("llamacpp") - ) - }; - - let described = if use_llamacpp_vision { - match self.llamacpp.as_ref() { - Some(c) => c.describe_image(b64).await, - None => { - log::warn!( - "describe-then-inline: requested llamacpp vision but LLAMA_SWAP_URL is unset, falling back to Ollama" - ); - self.ollama.describe_image(b64).await - } - } + let described = if local_via_llamacpp { + self.llamacpp + .as_ref() + .expect("local_via_llamacpp guarantees Some") + .describe_image(b64) + .await } else { self.ollama.describe_image(b64).await }; @@ -4044,10 +4043,10 @@ Return ONLY the summary, nothing else."#, ); // 10. Define tools. Gate flags computed from current data presence; - // describe-then-inline modes (hybrid, llamacpp) omit describe_photo - // since the chat model receives the visual description inline (so - // we pass `false` for has_vision in those modes regardless of the - // model's actual capability). + // describe-then-inline modes (hybrid OR local_via_llamacpp) omit + // describe_photo since the chat model receives the visual + // description inline (so we pass `false` for has_vision in + // those modes regardless of the model's actual capability). let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines); let tools = Self::build_tool_definitions(gate_opts); diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs index 100020c..d23fad5 100644 --- a/src/ai/llamacpp.rs +++ b/src/ai/llamacpp.rs @@ -11,10 +11,10 @@ // `model` field, which is how llama-swap selects which backend process to // run. // - `/v1/models` returns only the configured slot ids — capabilities aren't -// reported by the API, so `vision_models` is a config-time allowlist (env -// `LLAMA_SWAP_VISION_MODELS`) used to set `has_vision` on responses. -// `has_tool_calling` is assumed true for every slot, since llama-swap entries -// default to launching llama-server with `--jinja`. +// reported by the API, so we infer `has_vision` from a single config field +// (`vision_model`, defaulting to `"vision"`) and assume `has_tool_calling` +// is true for every slot, since llama-swap entries default to launching +// llama-server with `--jinja`. // // First consumer lands alongside the three-way backend dispatch in // insight_generator / insight_chat. @@ -50,16 +50,10 @@ pub struct LlamaCppClient { /// Embedding model slot id (e.g. `"embed"`). Used for /// `generate_embeddings`. pub embedding_model: String, - /// Vision model slot id (e.g. `"vision"`). Used for `describe_image` and - /// included in `vision_models` automatically so capability lookups for - /// the default vision slot report `has_vision = true` even when the env - /// allowlist is empty. + /// Vision model slot id (e.g. `"vision"`). Used for `describe_image`, + /// and the only slot that reports `has_vision = true` in capability + /// lookups (llama-swap's `/v1/models` doesn't surface modality). pub vision_model: String, - /// Operator-curated set of slot ids known to be multimodal. Drives the - /// `has_vision` field in `list_models` / `model_capabilities`, since - /// llama-swap's `/v1/models` doesn't report modality. Empty allowlist - /// still marks `vision_model` as vision-capable. - pub vision_models: Vec, num_ctx: Option, temperature: Option, top_p: Option, @@ -83,7 +77,6 @@ impl LlamaCppClient { primary_model: primary_model.unwrap_or_else(|| DEFAULT_PRIMARY_MODEL.to_string()), embedding_model: DEFAULT_EMBEDDING_MODEL.to_string(), vision_model: DEFAULT_VISION_MODEL.to_string(), - vision_models: Vec::new(), num_ctx: None, temperature: None, top_p: None, @@ -100,10 +93,6 @@ impl LlamaCppClient { self.vision_model = model; } - pub fn set_vision_models(&mut self, models: Vec) { - self.vision_models = models; - } - pub fn set_num_ctx(&mut self, num_ctx: Option) { self.num_ctx = num_ctx; } @@ -692,7 +681,7 @@ impl LlamaCppClient { .and_then(|v| v.as_str()) .unwrap_or_default() .to_string(); - let has_vision = name == self.vision_model || self.vision_models.iter().any(|v| v == &name); + let has_vision = name == self.vision_model; // Tool calling is the default for llama-swap entries we configure // (--jinja flag); no negative-list mechanism yet, so report true. ModelCapabilities { @@ -954,25 +943,21 @@ mod tests { } #[test] - fn capability_inference_uses_vision_model_and_allowlist() { + fn capability_inference_marks_only_vision_slot() { let mut c = LlamaCppClient::new(None, Some("chat".into())); c.set_vision_model("vision".into()); - c.set_vision_models(vec!["qwen-vl".into()]); let m_chat = json!({ "id": "chat" }); let m_vision = json!({ "id": "vision" }); - let m_qwen = json!({ "id": "qwen-vl" }); let m_other = json!({ "id": "embed" }); let chat = c.parse_model_capabilities(&m_chat); let vision = c.parse_model_capabilities(&m_vision); - let qwen = c.parse_model_capabilities(&m_qwen); let other = c.parse_model_capabilities(&m_other); assert!(!chat.has_vision); assert!(chat.has_tool_calling); assert!(vision.has_vision); - assert!(qwen.has_vision); assert!(!other.has_vision); } } diff --git a/src/ai/mod.rs b/src/ai/mod.rs index 204da04..8d634fd 100644 --- a/src/ai/mod.rs +++ b/src/ai/mod.rs @@ -21,14 +21,14 @@ pub use handlers::{ chat_history_handler, chat_rewind_handler, chat_stream_handler, chat_turn_handler, delete_insight_handler, export_training_data_handler, generate_agentic_insight_handler, generate_insight_handler, get_all_insights_handler, get_available_models_handler, - get_insight_handler, get_llamacpp_models_handler, get_openrouter_models_handler, - rate_insight_handler, + get_insight_handler, get_openrouter_models_handler, rate_insight_handler, }; pub use insight_generator::InsightGenerator; #[allow(unused_imports)] pub use llm_client::{ ChatMessage, LlmClient, ModelCapabilities, Tool, ToolCall, ToolCallFunction, ToolFunction, }; +pub use llamacpp::LlamaCppClient; pub use ollama::{EMBEDDING_MODEL, OllamaClient}; pub use sms_client::{SmsApiClient, SmsMessage}; @@ -40,3 +40,87 @@ pub use sms_client::{SmsApiClient, SmsMessage}; pub fn user_display_name() -> String { std::env::var("USER_NAME").unwrap_or_else(|_| "Me".to_string()) } + +/// One switch for the "local" LLM stack: when `LLM_BACKEND=llamacpp` is +/// set, chat / vision describe / embeddings all route through llama-swap +/// instead of Ollama. Any other value (including unset, the default) is +/// Ollama. This is intentionally global — embeddings must be drawn from +/// a single source or similarity search across the index breaks (mixed +/// vector spaces, possibly mixed dims). The `backend=hybrid` per-request +/// override remains orthogonal: it always sends chat to OpenRouter, and +/// uses `LLM_BACKEND` for the describe-then-inline vision pass. +pub fn local_backend_is_llamacpp() -> bool { + matches!( + std::env::var("LLM_BACKEND") + .ok() + .as_deref() + .map(|s| s.trim().to_lowercase()) + .as_deref(), + Some("llamacpp") + ) +} + +/// Embed one string via the configured local backend. Routes through +/// llama-swap when `LLM_BACKEND=llamacpp` (and a client is configured), +/// else Ollama. Returns the single embedding vector. See +/// [`local_backend_is_llamacpp`] for the rationale on consistency. +pub async fn embed_one( + ollama: &OllamaClient, + llamacpp: Option<&LlamaCppClient>, + text: &str, +) -> anyhow::Result> { + if local_backend_is_llamacpp() { + if let Some(lc) = llamacpp { + let mut vecs = ::generate_embeddings(lc, &[text]).await?; + return vecs + .pop() + .ok_or_else(|| anyhow::anyhow!("llama-swap returned no embeddings")); + } + log::warn!( + "LLM_BACKEND=llamacpp but LlamaCppClient is unconfigured; falling back to Ollama embeddings" + ); + } + ollama.generate_embedding(text).await +} + +#[cfg(test)] +mod env_dispatch_tests { + use super::*; + + fn with_env(key: &str, val: Option<&str>, f: F) { + let prev = std::env::var(key).ok(); + match val { + Some(v) => unsafe { std::env::set_var(key, v) }, + None => unsafe { std::env::remove_var(key) }, + } + f(); + match prev { + Some(v) => unsafe { std::env::set_var(key, v) }, + None => unsafe { std::env::remove_var(key) }, + } + } + + #[test] + fn llm_backend_defaults_to_ollama() { + with_env("LLM_BACKEND", None, || { + assert!(!local_backend_is_llamacpp()); + }); + } + + #[test] + fn llm_backend_llamacpp_case_insensitive() { + with_env("LLM_BACKEND", Some("LlamaCpp"), || { + assert!(local_backend_is_llamacpp()); + }); + with_env("LLM_BACKEND", Some(" llamacpp "), || { + assert!(local_backend_is_llamacpp()); + }); + } + + #[test] + fn llm_backend_unknown_value_is_ollama() { + with_env("LLM_BACKEND", Some("vllm"), || { + assert!(!local_backend_is_llamacpp()); + }); + } +} diff --git a/src/main.rs b/src/main.rs index 3bd3656..63013ce 100644 --- a/src/main.rs +++ b/src/main.rs @@ -313,7 +313,6 @@ fn main() -> std::io::Result<()> { .service(ai::get_all_insights_handler) .service(ai::get_available_models_handler) .service(ai::get_openrouter_models_handler) - .service(ai::get_llamacpp_models_handler) .service(ai::chat_turn_handler) .service(ai::chat_stream_handler) .service(ai::chat_history_handler) diff --git a/src/state.rs b/src/state.rs index 96d1c22..c4f810a 100644 --- a/src/state.rs +++ b/src/state.rs @@ -358,10 +358,11 @@ fn parse_openrouter_allowed_models() -> Vec { } /// Build a `LlamaCppClient` from environment variables. Returns `None` when -/// `LLAMA_SWAP_URL` is unset (the llamacpp backend is then unavailable and -/// requests for it return a clear error). The slot ids default to the -/// names the bundled `llama-swap/config.yaml` uses — `chat` / `vision` / -/// `embed` — so a minimal deploy only needs to set `LLAMA_SWAP_URL`. +/// `LLAMA_SWAP_URL` is unset. The client is constructed unconditionally +/// when the URL is set (so it's available even under `LLM_BACKEND=ollama` +/// for ad-hoc tooling), but the agentic / chat paths only route through it +/// when `LLM_BACKEND=llamacpp`. Slot ids default to the names the bundled +/// `llama-swap/config.yaml` uses — `chat` / `vision` / `embed`. fn build_llamacpp_from_env() -> Option> { let base_url = env::var("LLAMA_SWAP_URL").ok()?; let primary_model = env::var("LLAMA_SWAP_PRIMARY_MODEL").ok(); @@ -372,12 +373,12 @@ fn build_llamacpp_from_env() -> Option> { if let Ok(model) = env::var("LLAMA_SWAP_VISION_MODEL") { client.set_vision_model(model); } - client.set_vision_models(parse_llamacpp_vision_models()); Some(Arc::new(client)) } /// Parse `LLAMA_SWAP_ALLOWED_MODELS` (comma-separated) into a vec. Used to -/// drive `/insights/llamacpp/models`; empty when unset. +/// populate the model picker when `LLM_BACKEND=llamacpp` — `/insights/models` +/// surfaces these slots with capabilities. Empty when unset. fn parse_llamacpp_allowed_models() -> Vec { env::var("LLAMA_SWAP_ALLOWED_MODELS") .unwrap_or_default() @@ -387,20 +388,6 @@ fn parse_llamacpp_allowed_models() -> Vec { .collect() } -/// Parse `LLAMA_SWAP_VISION_MODELS` (comma-separated) — slot ids that report -/// `has_vision = true` in capability lookups. The configured `vision_model` -/// (default `vision`) is always considered vision-capable regardless of this -/// list, so a deploy that only uses the default vision slot can leave it -/// unset. -fn parse_llamacpp_vision_models() -> Vec { - env::var("LLAMA_SWAP_VISION_MODELS") - .unwrap_or_default() - .split(',') - .map(|s| s.trim().to_string()) - .filter(|s| !s.is_empty()) - .collect() -} - #[cfg(test)] impl AppState { /// Creates an AppState instance for testing with temporary directories -- 2.49.1 From 0631820fbf55dd27a522212b9656183614fc4a39 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Sun, 24 May 2026 14:00:37 -0400 Subject: [PATCH 04/11] ai: send images directly to llamacpp chat models + add ResolvedBackend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit llamacpp models now receive images via OpenAI content-parts instead of the describe-then-inline strategy (hybrid mode unchanged). Fixes assistant messages with tool_calls emitting content: null instead of "" to satisfy strict Jinja template role-alternation checks. Adds debug logging of message role sequences on llamacpp requests. Introduces BackendKind enum, SamplingOverrides, and ResolvedBackend in a new backend.rs module. InsightGenerator::resolve_backend centralises client construction + vision capability detection — next step wires the existing inline dispatch through it. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ai/backend.rs | 118 ++++++++++++++++ src/ai/handlers.rs | 2 +- src/ai/insight_chat.rs | 33 ++--- src/ai/insight_generator.rs | 262 ++++++++++++++++++++++++++++++------ src/ai/llamacpp.rs | 49 +++++-- src/ai/mod.rs | 1 + 6 files changed, 395 insertions(+), 70 deletions(-) create mode 100644 src/ai/backend.rs diff --git a/src/ai/backend.rs b/src/ai/backend.rs new file mode 100644 index 0000000..0fe7747 --- /dev/null +++ b/src/ai/backend.rs @@ -0,0 +1,118 @@ +use anyhow::{Result, anyhow}; + +use crate::ai::llm_client::LlmClient; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BackendKind { + Local, + Hybrid, +} + +impl BackendKind { + pub fn parse(s: &str) -> Result { + match s.trim().to_lowercase().as_str() { + "local" | "" => Ok(Self::Local), + "hybrid" => Ok(Self::Hybrid), + other => Err(anyhow!("unknown backend '{}'; expected 'local' or 'hybrid'", other)), + } + } + + pub fn as_str(&self) -> &'static str { + match self { + Self::Local => "local", + Self::Hybrid => "hybrid", + } + } +} + +impl std::fmt::Display for BackendKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +pub struct SamplingOverrides { + pub model: Option, + pub num_ctx: Option, + pub temperature: Option, + pub top_p: Option, + pub top_k: Option, + pub min_p: Option, +} + +impl SamplingOverrides { + pub fn has_sampling(&self) -> bool { + self.temperature.is_some() + || self.top_p.is_some() + || self.top_k.is_some() + || self.min_p.is_some() + } +} + +pub struct ResolvedBackend { + chat: Box, + local: Box, + pub kind: BackendKind, + /// `true` when the chat model receives images directly (Ollama with + /// vision, or llamacpp). `false` for hybrid where we describe-then-inline. + pub images_inline: bool, +} + +impl ResolvedBackend { + pub fn new( + chat: Box, + local: Box, + kind: BackendKind, + images_inline: bool, + ) -> Self { + Self { chat, local, kind, images_inline } + } + + pub fn chat(&self) -> &dyn LlmClient { + self.chat.as_ref() + } + + pub fn local(&self) -> &dyn LlmClient { + self.local.as_ref() + } + + pub fn model(&self) -> &str { + self.chat.primary_model() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_backend_kind() { + assert_eq!(BackendKind::parse("local").unwrap(), BackendKind::Local); + assert_eq!(BackendKind::parse("hybrid").unwrap(), BackendKind::Hybrid); + assert_eq!(BackendKind::parse(" Local ").unwrap(), BackendKind::Local); + assert_eq!(BackendKind::parse("HYBRID").unwrap(), BackendKind::Hybrid); + assert_eq!(BackendKind::parse("").unwrap(), BackendKind::Local); + assert!(BackendKind::parse("vllm").is_err()); + } + + #[test] + fn backend_kind_as_str_roundtrips() { + assert_eq!(BackendKind::parse(BackendKind::Local.as_str()).unwrap(), BackendKind::Local); + assert_eq!(BackendKind::parse(BackendKind::Hybrid.as_str()).unwrap(), BackendKind::Hybrid); + } + + #[test] + fn sampling_overrides_has_sampling() { + let empty = SamplingOverrides { + model: None, num_ctx: None, temperature: None, + top_p: None, top_k: None, min_p: None, + }; + assert!(!empty.has_sampling()); + + let with_temp = SamplingOverrides { + model: None, num_ctx: Some(4096), temperature: Some(0.7), + top_p: None, top_k: None, min_p: None, + }; + assert!(with_temp.has_sampling()); + } +} diff --git a/src/ai/handlers.rs b/src/ai/handlers.rs index c86a5c8..a7a3720 100644 --- a/src/ai/handlers.rs +++ b/src/ai/handlers.rs @@ -495,7 +495,7 @@ pub async fn get_available_models_handler( .iter() .map(|name| ModelCapabilities { name: name.clone(), - has_vision: name == &lc.vision_model, + has_vision: true, has_tool_calling: true, }) .collect(); diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index b549dab..54350df 100644 --- a/src/ai/insight_chat.rs +++ b/src/ai/insight_chat.rs @@ -311,7 +311,7 @@ impl InsightChatService { let is_hybrid = effective_backend == "hybrid"; let local_via_llamacpp = crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); - let describes_then_inlines = is_hybrid || local_via_llamacpp; + let describes_then_inlines = is_hybrid; span.set_attribute(KeyValue::new("backend", effective_backend.clone())); // 4. Build the chat backend client. Hybrid → OpenRouter; local with @@ -408,12 +408,11 @@ impl InsightChatService { let model_used = chat_backend.primary_model().to_string(); span.set_attribute(KeyValue::new("model", model_used.clone())); - // 5. Decide vision + tool set. In describe-then-inline modes - // (hybrid, llamacpp) we always omit `describe_photo` (matches the - // original generation flow). In local we trust the stored - // history's first-user shape: if it carries `images`, the - // original model was vision-capable, and we keep `describe_photo` - // available. + // 5. Decide vision + tool set. In describe-then-inline mode + // (hybrid only) we omit `describe_photo`. In local and llamacpp + // we trust the stored history's first-user shape: if it carries + // `images`, the original model was vision-capable, and we keep + // `describe_photo` available. let local_first_user_has_image = messages .iter() .find(|m| m.role == "user") @@ -821,9 +820,7 @@ impl InsightChatService { .unwrap_or_else(|| stored_backend.clone()); validate_cross_replay(&stored_backend, &effective_backend)?; let is_hybrid = effective_backend == "hybrid"; - let local_via_llamacpp = - crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); - let describes_then_inlines = is_hybrid || local_via_llamacpp; + let describes_then_inlines = is_hybrid; let max_iterations = req .max_iterations @@ -842,10 +839,9 @@ impl InsightChatService { let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref(); let model_used = chat_backend.primary_model().to_string(); - // Tool set — local mode + first user turn carries an image → - // offer describe_photo. Describe-then-inline modes (hybrid OR - // local_via_llamacpp): visual description was inlined when the - // insight was bootstrapped, no describe tool needed. + // Tool set — local/llamacpp mode + first user turn carries an image → + // offer describe_photo. Describe-then-inline mode (hybrid only): + // visual description was inlined at bootstrap, no describe tool needed. let local_first_user_has_image = messages .iter() .find(|m| m.role == "user") @@ -991,7 +987,7 @@ impl InsightChatService { let is_hybrid = effective_backend == "hybrid"; let local_via_llamacpp = crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); - let describes_then_inlines = is_hybrid || local_via_llamacpp; + let describes_then_inlines = is_hybrid; let max_iterations = req .max_iterations @@ -1023,10 +1019,9 @@ impl InsightChatService { _ => None, }); - // Describe-then-inline (hybrid OR local_via_llamacpp): pre-describe - // the image so a text-only chat model gets the visual description - // inline. Vision source follows `LLM_BACKEND`: llama-swap when - // `local_via_llamacpp`, else Ollama. + // Describe-then-inline (hybrid only): pre-describe the image so a + // text-only chat model gets the visual description inline. llamacpp + // sends images directly to the chat model. let visual_block = if describes_then_inlines { match image_base64.as_deref() { Some(b64) => { diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index 08fcfef..075001a 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -10,6 +10,7 @@ use std::io::Cursor; use std::sync::{Arc, Mutex}; use crate::ai::apollo_client::{ApolloClient, ApolloPlace}; +use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides}; use crate::ai::llm_client::LlmClient; use crate::ai::ollama::{ChatMessage, OllamaClient, Tool}; use crate::ai::llamacpp::LlamaCppClient; @@ -1781,14 +1782,18 @@ Return ONLY the summary, nothing else."#, ); let started = std::time::Instant::now(); - let response = ollama - .generate_no_think( - &prompt, - Some( - "You are a terse relevance ranker. You output only numbers separated by commas.", - ), - ) - .await?; + let system = Some( + "You are a terse relevance ranker. You output only numbers separated by commas.", + ); + let response = if crate::ai::local_backend_is_llamacpp() { + if let Some(ref lc) = self.llamacpp { + lc.generate(&prompt, system, None).await? + } else { + ollama.generate_no_think(&prompt, system).await? + } + } else { + ollama.generate_no_think(&prompt, system).await? + }; log::info!( "rerank: finished in {} ms (prompt={} chars)", started.elapsed().as_millis(), @@ -2360,7 +2365,8 @@ Return ONLY the summary, nothing else."#, out } - /// Tool: describe_photo — generate a visual description of the photo + /// Tool: describe_photo — generate a visual description of the photo. + /// Routes through llama-swap when `LLM_BACKEND=llamacpp`, Ollama otherwise. async fn tool_describe_photo( &self, ollama: &OllamaClient, @@ -2369,10 +2375,21 @@ Return ONLY the summary, nothing else."#, log::info!("tool_describe_photo: generating visual description"); match image_base64 { - Some(img) => match ollama.generate_photo_description(img).await { - Ok(desc) => desc, - Err(e) => format!("Error describing photo: {}", e), - }, + Some(img) => { + let result = if crate::ai::local_backend_is_llamacpp() { + if let Some(ref lc) = self.llamacpp { + lc.describe_image(img).await + } else { + ollama.generate_photo_description(img).await + } + } else { + ollama.generate_photo_description(img).await + }; + match result { + Ok(desc) => desc, + Err(e) => format!("Error describing photo: {}", e), + } + } None => "No image available for description.".to_string(), } } @@ -3560,6 +3577,177 @@ Return ONLY the summary, nothing else."#, out } + /// Consolidate client construction for the agentic insight loop. + /// + /// Returns a [`ResolvedBackend`] containing the **chat** client (the model + /// that drives the agent loop), the **local** client (always the configured + /// local backend — Ollama or llama-swap — for utility calls like + /// describe_image, rerank, embeddings), the backend kind, and whether the + /// chat model receives images inline. + pub async fn resolve_backend( + &self, + kind: BackendKind, + overrides: &SamplingOverrides, + ) -> Result { + let local_via_llamacpp = + crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); + let is_hybrid = kind == BackendKind::Hybrid; + + // ── chat client ──────────────────────────────────────────────── + let chat: Box = if is_hybrid { + // Hybrid: chat through OpenRouter. + let arc = self.openrouter.as_ref().ok_or_else(|| { + anyhow::anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured") + })?; + let mut c: OpenRouterClient = (**arc).clone(); + if let Some(ref m) = overrides.model { + c.primary_model = m.clone(); + } + if overrides.has_sampling() { + c.set_sampling_params( + overrides.temperature, + overrides.top_p, + overrides.top_k, + overrides.min_p, + ); + } + if let Some(ctx) = overrides.num_ctx { + c.set_num_ctx(Some(ctx)); + } + Box::new(c) + } else if local_via_llamacpp { + // Local via llama-swap. + let arc = self.llamacpp.as_ref().ok_or_else(|| { + anyhow::anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured") + })?; + let mut c: LlamaCppClient = (**arc).clone(); + if let Some(ref m) = overrides.model { + c.primary_model = m.clone(); + } + if overrides.has_sampling() { + c.set_sampling_params( + overrides.temperature, + overrides.top_p, + overrides.top_k, + overrides.min_p, + ); + } + if let Some(ctx) = overrides.num_ctx { + c.set_num_ctx(Some(ctx)); + } + Box::new(c) + } else { + // Pure Ollama local. + let mut ollama_client = if let Some(ref model) = overrides.model { + OllamaClient::new( + self.ollama.primary_url.clone(), + self.ollama.fallback_url.clone(), + model.clone(), + Some(model.clone()), + ) + } else { + self.ollama.clone() + }; + if overrides.has_sampling() { + ollama_client.set_sampling_params( + overrides.temperature, + overrides.top_p, + overrides.top_k, + overrides.min_p, + ); + } + if let Some(ctx) = overrides.num_ctx { + ollama_client.set_num_ctx(Some(ctx)); + } + Box::new(ollama_client) + }; + + // ── local client (utility calls: rerank, describe_image, etc.) ─ + let local: Box = if local_via_llamacpp { + Box::new(self.llamacpp.as_ref().unwrap().as_ref().clone()) + } else { + Box::new(self.ollama.clone()) + }; + + // ── images_inline ────────────────────────────────────────────── + let images_inline = if is_hybrid { + // Hybrid: chat model never sees images — describe-then-inject. + false + } else if local_via_llamacpp { + // llama-swap models receive images directly via OpenAI content + // parts. Capability probing isn't available (no `/api/show`), + // so assume vision support; a misconfigured model surfaces as + // a chat-call error. + true + } else { + // Pure Ollama: probe model capabilities. + let ollama_for_caps = if let Some(ref model) = overrides.model { + // Verify custom model is available on at least one server. + let available_on_primary = + OllamaClient::is_model_available(&self.ollama.primary_url, model) + .await + .unwrap_or(false); + + let available_on_fallback = + if let Some(ref fallback_url) = self.ollama.fallback_url { + OllamaClient::is_model_available(fallback_url, model) + .await + .unwrap_or(false) + } else { + false + }; + + if !available_on_primary && !available_on_fallback { + anyhow::bail!( + "model not available: '{}' not found on any configured server", + model + ); + } + model.as_str() + } else { + self.ollama.primary_model.as_str() + }; + + let capabilities = match OllamaClient::check_model_capabilities( + &self.ollama.primary_url, + ollama_for_caps, + ) + .await + { + Ok(caps) => caps, + Err(_) => { + let fallback_url = + self.ollama.fallback_url.as_deref().ok_or_else(|| { + anyhow::anyhow!( + "Failed to check model capabilities for '{}': model not found on primary server and no fallback configured", + ollama_for_caps + ) + })?; + OllamaClient::check_model_capabilities(fallback_url, ollama_for_caps) + .await + .map_err(|e| { + anyhow::anyhow!( + "Failed to check model capabilities for '{}': {}", + ollama_for_caps, + e + ) + })? + } + }; + + if !capabilities.has_tool_calling { + anyhow::bail!( + "tool calling not supported by model '{}'", + ollama_for_caps + ); + } + + capabilities.has_vision + }; + + Ok(ResolvedBackend::new(chat, local, kind, images_inline)) + } + pub async fn generate_agentic_insight_for_photo( &self, file_path: &str, @@ -3602,26 +3790,22 @@ Return ONLY the summary, nothing else."#, span.set_attribute(KeyValue::new("backend", backend_label.clone())); let is_hybrid = backend_label == "hybrid"; // `LLM_BACKEND=llamacpp` swaps Ollama out for llama-swap as the - // "local" stack — chat + vision describe + embeddings all route - // through llama-swap. In hybrid mode this still applies to vision - // describe (chat continues to go to OpenRouter). The chat slot is - // text-only in either case, so we describe-then-inline. + // "local" stack — chat + embeddings route through llama-swap. + // llamacpp models receive images directly (vision-capable); only + // hybrid mode (OpenRouter chat) uses describe-then-inline. let local_via_llamacpp = crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); - // Describe-then-inline: hybrid (chat is OpenRouter, text-only) or - // any path where chat goes through llama-swap (chat slot is text-only). - let describes_then_inlines = is_hybrid || local_via_llamacpp; + let describes_then_inlines = is_hybrid; + let ollama_is_chat = !is_hybrid && !local_via_llamacpp; // 1b. Always build an Ollama client. In local mode it owns the chat // loop; in hybrid/llamacpp mode it still handles tool-local calls // (e.g. future embedding-backed tools). The chat backend is // selected separately below. - // Sampling overrides only apply in local mode — in - // hybrid/llamacpp the user's params belong to the alternate chat - // client. - let apply_sampling_to_ollama = !describes_then_inlines; + // Sampling overrides only apply when Ollama is the chat backend. + let apply_sampling_to_ollama = ollama_is_chat; let mut ollama_client = if let Some(ref model) = custom_model - && !describes_then_inlines + && ollama_is_chat { log::info!("Using custom model for agentic: {}", model); span.set_attribute(KeyValue::new("custom_model", model.clone())); @@ -3632,7 +3816,7 @@ Return ONLY the summary, nothing else."#, Some(model.clone()), ) } else { - if !describes_then_inlines { + if ollama_is_chat { span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone())); } self.ollama.clone() @@ -3752,10 +3936,13 @@ Return ONLY the summary, nothing else."#, // (OPENROUTER_ALLOWED_MODELS) — no live precheck. A bad model id // surfaces as a chat-call error on the next step. let has_vision = if describes_then_inlines { - // In hybrid + llamacpp modes the chat model never sees images - // directly — we describe-then-inject, so `has_vision` drives only - // whether we bother loading the image to describe it, which we - // always do. + // Hybrid: chat model never sees images — describe-then-inject. + true + } else if local_via_llamacpp { + // llama-swap models receive images directly via OpenAI content + // parts. Capability probing isn't available (no `/api/show`), + // so assume vision support; a misconfigured model surfaces as + // a chat-call error. true } else { if let Some(ref model_name) = custom_model { @@ -3935,10 +4122,9 @@ Return ONLY the summary, nothing else."#, None }; - // describe-then-inline path. Vision describe routes through whichever - // `LLM_BACKEND` is configured — llama-swap when `local_via_llamacpp` - // is set (even in hybrid mode, since chat is OpenRouter but vision - // stays on the local stack), otherwise Ollama. + // describe-then-inline path (hybrid only). Vision describe routes + // through whichever local backend is configured — llama-swap when + // `local_via_llamacpp`, otherwise Ollama. let inlined_visual_description: Option = if describes_then_inlines { match image_base64.as_deref() { Some(b64) => { @@ -4043,10 +4229,10 @@ Return ONLY the summary, nothing else."#, ); // 10. Define tools. Gate flags computed from current data presence; - // describe-then-inline modes (hybrid OR local_via_llamacpp) omit - // describe_photo since the chat model receives the visual - // description inline (so we pass `false` for has_vision in - // those modes regardless of the model's actual capability). + // hybrid mode omits describe_photo since the chat model receives + // the visual description inline (so we pass `false` for + // has_vision in that mode regardless of the model's actual + // capability). let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines); let tools = Self::build_tool_definitions(gate_opts); diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs index d23fad5..2a03aed 100644 --- a/src/ai/llamacpp.rs +++ b/src/ai/llamacpp.rs @@ -50,9 +50,9 @@ pub struct LlamaCppClient { /// Embedding model slot id (e.g. `"embed"`). Used for /// `generate_embeddings`. pub embedding_model: String, - /// Vision model slot id (e.g. `"vision"`). Used for `describe_image`, - /// and the only slot that reports `has_vision = true` in capability - /// lookups (llama-swap's `/v1/models` doesn't surface modality). + /// Vision model slot id. Used for `describe_image` routing. Defaults + /// to `primary_model` so describe_image works out of the box; override + /// via `LLAMA_SWAP_VISION_MODEL` for a dedicated vision slot. pub vision_model: String, num_ctx: Option, temperature: Option, @@ -67,6 +67,7 @@ impl LlamaCppClient { .ok() .and_then(|v| v.parse::().ok()) .unwrap_or(DEFAULT_REQUEST_TIMEOUT_SECS); + let pm = primary_model.unwrap_or_else(|| DEFAULT_PRIMARY_MODEL.to_string()); Self { client: Client::builder() .connect_timeout(Duration::from_secs(10)) @@ -74,9 +75,9 @@ impl LlamaCppClient { .build() .unwrap_or_else(|_| Client::new()), base_url: base_url.unwrap_or_else(|| DEFAULT_BASE_URL.to_string()), - primary_model: primary_model.unwrap_or_else(|| DEFAULT_PRIMARY_MODEL.to_string()), + primary_model: pm.clone(), embedding_model: DEFAULT_EMBEDDING_MODEL.to_string(), - vision_model: DEFAULT_VISION_MODEL.to_string(), + vision_model: pm, num_ctx: None, temperature: None, top_p: None, @@ -124,6 +125,13 @@ impl LlamaCppClient { let mut obj = serde_json::Map::new(); obj.insert("role".into(), Value::String(msg.role.clone())); + // Assistant messages with tool_calls must emit `content: null` + // (not `""`) — some Jinja templates (Mistral-family) treat + // empty-string content as a regular message rather than a + // tool-calling turn, breaking role-alternation validation. + let has_tool_calls = msg.role == "assistant" + && msg.tool_calls.as_ref().is_some_and(|tcs| !tcs.is_empty()); + match &msg.images { Some(images) if !images.is_empty() => { let mut parts: Vec = Vec::new(); @@ -139,6 +147,9 @@ impl LlamaCppClient { } obj.insert("content".into(), Value::Array(parts)); } + _ if has_tool_calls && msg.content.is_empty() => { + obj.insert("content".into(), Value::Null); + } _ => { obj.insert("content".into(), Value::String(msg.content.clone())); } @@ -276,6 +287,13 @@ impl LlamaCppClient { tools: Vec, ) -> Result<(ChatMessage, Option, Option)> { let url = format!("{}/chat/completions", self.base_url); + let roles: Vec<&str> = messages.iter().map(|m| m.role.as_str()).collect(); + log::debug!( + "llama-swap chat_completion: model={} roles={:?} tools={}", + model, + roles, + tools.len() + ); let mut body = serde_json::Map::new(); body.insert("model".into(), Value::String(model.to_string())); body.insert( @@ -381,6 +399,13 @@ impl LlmClient for LlamaCppClient { tools: Vec, ) -> Result>> { let url = format!("{}/chat/completions", self.base_url); + let roles: Vec<&str> = messages.iter().map(|m| m.role.as_str()).collect(); + log::debug!( + "llama-swap stream: model={} roles={:?} tools={}", + self.primary_model, + roles, + tools.len() + ); let mut body = serde_json::Map::new(); body.insert( "model".into(), @@ -681,12 +706,12 @@ impl LlamaCppClient { .and_then(|v| v.as_str()) .unwrap_or_default() .to_string(); - let has_vision = name == self.vision_model; - // Tool calling is the default for llama-swap entries we configure - // (--jinja flag); no negative-list mechanism yet, so report true. + // llama-swap doesn't expose per-model modality flags. Assume all + // configured models support vision and tool calling; a model + // without multimodal support surfaces as a chat-call error. ModelCapabilities { name, - has_vision, + has_vision: true, has_tool_calling: true, } } @@ -943,7 +968,7 @@ mod tests { } #[test] - fn capability_inference_marks_only_vision_slot() { + fn all_models_report_vision_capable() { let mut c = LlamaCppClient::new(None, Some("chat".into())); c.set_vision_model("vision".into()); @@ -955,9 +980,9 @@ mod tests { let vision = c.parse_model_capabilities(&m_vision); let other = c.parse_model_capabilities(&m_other); - assert!(!chat.has_vision); + assert!(chat.has_vision); assert!(chat.has_tool_calling); assert!(vision.has_vision); - assert!(!other.has_vision); + assert!(other.has_vision); } } diff --git a/src/ai/mod.rs b/src/ai/mod.rs index 8d634fd..09cbfda 100644 --- a/src/ai/mod.rs +++ b/src/ai/mod.rs @@ -1,4 +1,5 @@ pub mod apollo_client; +pub mod backend; pub mod clip_client; pub mod daily_summary_job; pub mod face_client; -- 2.49.1 From a8a661f70a8022f304851985575bc58193b9354d Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Sun, 24 May 2026 15:00:50 -0400 Subject: [PATCH 05/11] ai: extract ResolvedBackend, remove ~480 lines of duplicated dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace 5 copies of the ~80-line backend resolution pattern with a single InsightGenerator::resolve_backend() builder that returns a ResolvedBackend (chat + local clients, BackendKind enum, images_inline flag). Tool dispatch now takes &ResolvedBackend instead of &OllamaClient + model + backend strings. Remove duplicated ollama/openrouter/llamacpp fields from InsightChatService — InsightGenerator owns them and resolve_backend uses them. Delete build_chat_clients (replaced by resolve_backend). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ai/insight_chat.rs | 362 +++++++----------------------- src/ai/insight_generator.rs | 430 +++++++----------------------------- src/state.rs | 6 - 3 files changed, 158 insertions(+), 640 deletions(-) diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index 54350df..6d52f8b 100644 --- a/src/ai/insight_chat.rs +++ b/src/ai/insight_chat.rs @@ -6,11 +6,9 @@ use std::collections::HashMap; use std::sync::{Arc, Mutex}; use tokio::sync::Mutex as TokioMutex; +use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides}; use crate::ai::insight_generator::InsightGenerator; -use crate::ai::llm_client::{ChatMessage, LlmClient, LlmStreamEvent, Tool}; -use crate::ai::ollama::OllamaClient; -use crate::ai::llamacpp::LlamaCppClient; -use crate::ai::openrouter::OpenRouterClient; +use crate::ai::llm_client::{ChatMessage, LlmStreamEvent, Tool}; use crate::database::InsightDao; use crate::database::models::InsertPhotoInsight; use crate::otel::global_tracer; @@ -92,9 +90,6 @@ pub struct ChatTurnResult { #[derive(Clone)] pub struct InsightChatService { generator: Arc, - ollama: OllamaClient, - openrouter: Option>, - llamacpp: Option>, insight_dao: Arc>>, chat_locks: ChatLockMap, } @@ -102,17 +97,11 @@ pub struct InsightChatService { impl InsightChatService { pub fn new( generator: Arc, - ollama: OllamaClient, - openrouter: Option>, - llamacpp: Option>, insight_dao: Arc>>, chat_locks: ChatLockMap, ) -> Self { Self { generator, - ollama, - openrouter, - llamacpp, insight_dao, chat_locks, } @@ -308,16 +297,9 @@ impl InsightChatService { .filter(|s| !s.is_empty()) .unwrap_or_else(|| stored_backend.clone()); validate_cross_replay(&stored_backend, &effective_backend)?; - let is_hybrid = effective_backend == "hybrid"; - let local_via_llamacpp = - crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); - let describes_then_inlines = is_hybrid; - span.set_attribute(KeyValue::new("backend", effective_backend.clone())); + let kind = BackendKind::parse(&effective_backend)?; + span.set_attribute(KeyValue::new("backend", kind.as_str())); - // 4. Build the chat backend client. Hybrid → OpenRouter; local with - // `LLM_BACKEND=llamacpp` → llama-swap; otherwise Ollama. Clones - // so per-request sampling/model overrides don't leak into shared - // state. let max_iterations = req .max_iterations .unwrap_or(DEFAULT_MAX_ITERATIONS) @@ -325,113 +307,36 @@ impl InsightChatService { span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64)); let stored_model = insight.model_version.clone(); - let custom_model = req - .model - .clone() - .or_else(|| Some(stored_model.clone())) - .filter(|m| !m.is_empty()); - - let mut ollama_client = self.ollama.clone(); - let mut openrouter_client: Option = None; - let mut llamacpp_client: Option = None; - - if is_hybrid { - let arc = self.openrouter.as_ref().ok_or_else(|| { - anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured") - })?; - let mut c: OpenRouterClient = (**arc).clone(); - if let Some(ref m) = custom_model { - c.primary_model = m.clone(); - } - if req.temperature.is_some() - || req.top_p.is_some() - || req.top_k.is_some() - || req.min_p.is_some() - { - c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); - } - if let Some(ctx) = req.num_ctx { - c.set_num_ctx(Some(ctx)); - } - openrouter_client = Some(c); - } else if local_via_llamacpp { - let arc = self.llamacpp.as_ref().ok_or_else(|| { - anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured") - })?; - let mut c: LlamaCppClient = (**arc).clone(); - if let Some(ref m) = custom_model { - c.primary_model = m.clone(); - } - if req.temperature.is_some() - || req.top_p.is_some() - || req.top_k.is_some() - || req.min_p.is_some() - { - c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); - } - if let Some(ctx) = req.num_ctx { - c.set_num_ctx(Some(ctx)); - } - llamacpp_client = Some(c); - } else { - // Pure local (Ollama): model swap. Build a new client when the - // chat model differs from the configured one. - if let Some(ref m) = custom_model - && m != &self.ollama.primary_model - { - ollama_client = OllamaClient::new( - self.ollama.primary_url.clone(), - self.ollama.fallback_url.clone(), - m.clone(), - Some(m.clone()), - ); - } - if req.temperature.is_some() - || req.top_p.is_some() - || req.top_k.is_some() - || req.min_p.is_some() - { - ollama_client.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); - } - if let Some(ctx) = req.num_ctx { - ollama_client.set_num_ctx(Some(ctx)); - } - } - - let chat_backend: &dyn LlmClient = if let Some(ref c) = llamacpp_client { - c - } else if let Some(ref c) = openrouter_client { - c - } else { - &ollama_client + let overrides = SamplingOverrides { + model: req.model.clone() + .or_else(|| Some(stored_model.clone())) + .filter(|m| !m.is_empty()), + num_ctx: req.num_ctx, + temperature: req.temperature, + top_p: req.top_p, + top_k: req.top_k, + min_p: req.min_p, }; - let model_used = chat_backend.primary_model().to_string(); + let backend = self.generator.resolve_backend(kind, &overrides).await?; + let model_used = backend.model().to_string(); span.set_attribute(KeyValue::new("model", model_used.clone())); - // 5. Decide vision + tool set. In describe-then-inline mode - // (hybrid only) we omit `describe_photo`. In local and llamacpp - // we trust the stored history's first-user shape: if it carries - // `images`, the original model was vision-capable, and we keep - // `describe_photo` available. + // 5. Decide vision + tool set. In hybrid (describe-then-inline) mode + // we omit `describe_photo`. Otherwise trust the stored history: + // if the first user message carries images, describe_photo stays. let local_first_user_has_image = messages .iter() .find(|m| m.role == "user") .and_then(|m| m.images.as_ref()) .map(|imgs| !imgs.is_empty()) .unwrap_or(false); - let offer_describe_tool = !describes_then_inlines && local_first_user_has_image; - // current_gate_opts(has_vision) sets gate_opts.has_vision = has_vision - // and probes the per-table presence flags. Pass `offer_describe_tool` - // directly — the `!is_hybrid && local_first_user_has_image` decision - // is the chat-path's vision predicate. + let offer_describe_tool = backend.images_inline && local_first_user_has_image; let gate_opts = self.generator.current_gate_opts_for_persona( offer_describe_tool, Some((req.user_id, &active_persona)), ); let tools = InsightGenerator::build_tool_definitions(gate_opts); - // Image base64 only needed when describe_photo is on the menu. Load - // lazily to avoid disk IO when the loop never invokes it. let image_base64: Option = if offer_describe_tool { self.generator.load_image_as_base64(&normalized).ok() } else { @@ -480,13 +385,13 @@ impl InsightChatService { iterations_used = iteration + 1; log::info!("Chat iteration {}/{}", iterations_used, max_iterations); - let (response, prompt_tokens, eval_tokens) = chat_backend + let (response, prompt_tokens, eval_tokens) = backend + .chat() .chat_with_tools(messages.clone(), tools.clone()) .await?; last_prompt_eval_count = prompt_tokens; last_eval_count = eval_tokens; - // Ollama rejects non-object tool-call arguments on replay. let mut response = response; if let Some(ref mut tcs) = response.tool_calls { for tc in tcs.iter_mut() { @@ -514,13 +419,11 @@ impl InsightChatService { .execute_tool( &tool_call.function.name, &tool_call.function.arguments, - &ollama_client, + &backend, &image_base64, &normalized, req.user_id, &active_persona, - &model_used, - &effective_backend, &loop_cx, ) .await; @@ -534,8 +437,6 @@ impl InsightChatService { } if final_content.is_empty() { - // The model never produced a final answer; ask once more without - // tools to force a textual reply. log::info!( "Chat loop exhausted after {} iterations, requesting final answer", iterations_used @@ -543,7 +444,8 @@ impl InsightChatService { messages.push(ChatMessage::user( "Please write your final answer now without calling any more tools.", )); - let (final_response, prompt_tokens, eval_tokens) = chat_backend + let (final_response, prompt_tokens, eval_tokens) = backend + .chat() .chat_with_tools(messages.clone(), vec![]) .await?; last_prompt_eval_count = prompt_tokens; @@ -579,7 +481,8 @@ impl InsightChatService { Capture the key moment or theme. Return ONLY the title, nothing else.", final_content ); - let title_raw = chat_backend + let title_raw = backend + .chat() .generate( &title_prompt, Some( @@ -604,7 +507,7 @@ impl InsightChatService { model_version: model_used.clone(), is_current: true, training_messages: Some(json), - backend: effective_backend.clone(), + backend: kind.as_str().to_string(), fewshot_source_ids: None, content_hash: None, }; @@ -629,7 +532,7 @@ impl InsightChatService { prompt_eval_count: last_prompt_eval_count, eval_count: last_eval_count, amended_insight_id, - backend_used: effective_backend, + backend_used: kind.as_str().to_string(), model_used, }) } @@ -818,9 +721,8 @@ impl InsightChatService { .map(|s| s.trim().to_lowercase()) .filter(|s| !s.is_empty()) .unwrap_or_else(|| stored_backend.clone()); - validate_cross_replay(&stored_backend, &effective_backend)?; - let is_hybrid = effective_backend == "hybrid"; - let describes_then_inlines = is_hybrid; + let kind = BackendKind::parse(&effective_backend)?; + validate_cross_replay(&stored_backend, kind.as_str())?; let max_iterations = req .max_iterations @@ -828,18 +730,20 @@ impl InsightChatService { .clamp(1, env_max_iterations()); let stored_model = insight.model_version.clone(); - let custom_model = req - .model - .clone() - .or_else(|| Some(stored_model.clone())) - .filter(|m| !m.is_empty()); + let overrides = SamplingOverrides { + model: req.model.clone() + .or_else(|| Some(stored_model.clone())) + .filter(|m| !m.is_empty()), + num_ctx: req.num_ctx, + temperature: req.temperature, + top_p: req.top_p, + top_k: req.top_k, + min_p: req.min_p, + }; + let backend = self.generator.resolve_backend(kind, &overrides).await?; + let model_used = backend.model().to_string(); - let (chat_backend_holder, ollama_client) = - self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?; - let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref(); - let model_used = chat_backend.primary_model().to_string(); - - // Tool set — local/llamacpp mode + first user turn carries an image → + // Tool set — images_inline mode + first user turn carries an image → // offer describe_photo. Describe-then-inline mode (hybrid only): // visual description was inlined at bootstrap, no describe tool needed. let local_first_user_has_image = messages @@ -848,7 +752,7 @@ impl InsightChatService { .and_then(|m| m.images.as_ref()) .map(|imgs| !imgs.is_empty()) .unwrap_or(false); - let offer_describe_tool = !describes_then_inlines && local_first_user_has_image; + let offer_describe_tool = backend.images_inline && local_first_user_has_image; let gate_opts = self.generator.current_gate_opts_for_persona( offer_describe_tool, Some((req.user_id, &active_persona)), @@ -879,16 +783,13 @@ impl InsightChatService { let outcome = self .run_streaming_agentic_loop( - chat_backend, - &ollama_client, + &backend, &mut messages, tools, &image_base64, &normalized, req.user_id, &active_persona, - &model_used, - &effective_backend, max_iterations, &tx, ) @@ -916,7 +817,7 @@ impl InsightChatService { let mut amended_insight_id: Option = None; if req.amend { - let title = self.generate_title(chat_backend, &final_content).await?; + let title = self.generate_title(&backend, &final_content).await?; // Amended rows intentionally do not inherit the parent's // `fewshot_source_ids`. The parent's few-shot influence is still @@ -932,7 +833,7 @@ impl InsightChatService { model_version: model_used.clone(), is_current: true, training_messages: Some(json), - backend: effective_backend.clone(), + backend: kind.as_str().to_string(), fewshot_source_ids: None, content_hash: None, }; @@ -958,7 +859,7 @@ impl InsightChatService { eval_tokens: last_eval_count, num_ctx: req.num_ctx, amended_insight_id, - backend_used: effective_backend, + backend_used: kind.as_str().to_string(), model_used, }) .await; @@ -984,21 +885,23 @@ impl InsightChatService { .filter(|s| !s.trim().is_empty()) .unwrap_or_else(|| "default".to_string()); let effective_backend = resolve_bootstrap_backend(req.backend.as_deref())?; - let is_hybrid = effective_backend == "hybrid"; - let local_via_llamacpp = - crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); - let describes_then_inlines = is_hybrid; + let kind = BackendKind::parse(&effective_backend)?; let max_iterations = req .max_iterations .unwrap_or(DEFAULT_MAX_ITERATIONS) .clamp(1, env_max_iterations()); - let custom_model = req.model.clone().filter(|m| !m.is_empty()); - let (chat_backend_holder, ollama_client) = - self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?; - let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref(); - let model_used = chat_backend.primary_model().to_string(); + let overrides = SamplingOverrides { + model: req.model.clone().filter(|m| !m.is_empty()), + num_ctx: req.num_ctx, + temperature: req.temperature, + top_p: req.top_p, + top_k: req.top_k, + min_p: req.min_p, + }; + let backend = self.generator.resolve_backend(kind, &overrides).await?; + let model_used = backend.model().to_string(); // Load image bytes once. RAW preview fallback is handled inside // load_image_as_base64. Errors degrade silently — a chat that @@ -1020,26 +923,17 @@ impl InsightChatService { }); // Describe-then-inline (hybrid only): pre-describe the image so a - // text-only chat model gets the visual description inline. llamacpp - // sends images directly to the chat model. - let visual_block = if describes_then_inlines { + // text-only chat model gets the visual description inline. + // images_inline backends send images directly to the chat model. + let visual_block = if !backend.images_inline { match image_base64.as_deref() { Some(b64) => { - let described = if local_via_llamacpp { - self.llamacpp - .as_ref() - .expect("local_via_llamacpp guarantees Some") - .describe_image(b64) - .await - } else { - self.ollama.describe_image(b64).await - }; - match described { + match backend.local().describe_image(b64).await { Ok(desc) => { format!("Visual description (from local vision model):\n{}\n", desc) } Err(e) => { - log::warn!("{} bootstrap: describe_image failed: {}", effective_backend, e); + log::warn!("{} bootstrap: describe_image failed: {}", kind.as_str(), e); String::new() } } @@ -1050,10 +944,10 @@ impl InsightChatService { String::new() }; - // Tool gates. Local + image present → expose describe_photo so - // the chat model can re-look at the photo on demand. Hybrid: + // Tool gates. images_inline + image present → expose describe_photo so + // the chat model can re-look at the photo on demand. Non-inline: // already inlined, no tool needed. - let offer_describe_tool = !describes_then_inlines && image_base64.is_some(); + let offer_describe_tool = backend.images_inline && image_base64.is_some(); let gate_opts = self.generator.current_gate_opts_for_persona( offer_describe_tool, Some((req.user_id, &active_persona)), @@ -1079,23 +973,22 @@ impl InsightChatService { ); let system_msg = ChatMessage::system(system_content); let mut user_msg = ChatMessage::user(req.user_message.clone()); - if !describes_then_inlines && let Some(ref img) = image_base64 { - user_msg.images = Some(vec![img.clone()]); + if backend.images_inline { + if let Some(ref img) = image_base64 { + user_msg.images = Some(vec![img.clone()]); + } } let mut messages = vec![system_msg, user_msg]; let outcome = self .run_streaming_agentic_loop( - chat_backend, - &ollama_client, + &backend, &mut messages, tools, &image_base64, &normalized, req.user_id, &active_persona, - &model_used, - &effective_backend, max_iterations, &tx, ) @@ -1108,7 +1001,7 @@ impl InsightChatService { final_content, } = outcome; - let title = self.generate_title(chat_backend, &final_content).await?; + let title = self.generate_title(&backend, &final_content).await?; let json = serde_json::to_string(&messages) .map_err(|e| anyhow!("failed to serialize chat history: {}", e))?; @@ -1121,7 +1014,7 @@ impl InsightChatService { model_version: model_used.clone(), is_current: true, training_messages: Some(json), - backend: effective_backend.clone(), + backend: kind.as_str().to_string(), fewshot_source_ids: None, content_hash: None, }; @@ -1144,7 +1037,7 @@ impl InsightChatService { eval_tokens: last_eval_count, num_ctx: req.num_ctx, amended_insight_id: Some(stored.id), - backend_used: effective_backend, + backend_used: kind.as_str().to_string(), model_used, }) .await; @@ -1152,95 +1045,12 @@ impl InsightChatService { Ok(()) } - /// Set up chat clients (Ollama + optional OpenRouter / LlamaCpp) shared - /// by bootstrap and continuation. Returns the chat-side backend client - /// (boxed because each backend has a different concrete type) and the - /// Ollama client used for describe-image / local tool calls. - /// - /// `effective_backend` must be one of `"local"` or `"hybrid"` (validated - /// upstream). Hybrid → OpenRouter; local with `LLM_BACKEND=llamacpp` → - /// llama-swap; pure local → Ollama. Returns the dispatched chat client - /// plus the (possibly per-request) Ollama client that the caller uses - /// for non-chat helpers (image describe in non-llamacpp mode, tool ops). - fn build_chat_clients( - &self, - effective_backend: &str, - custom_model: Option<&str>, - req: &ChatTurnRequest, - ) -> Result<(Box, OllamaClient)> { - let mut ollama_client = self.ollama.clone(); - - if effective_backend == "hybrid" { - let arc = self.openrouter.as_ref().ok_or_else(|| { - anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured") - })?; - let mut c: OpenRouterClient = (**arc).clone(); - if let Some(m) = custom_model { - c.primary_model = m.to_string(); - } - if req.temperature.is_some() - || req.top_p.is_some() - || req.top_k.is_some() - || req.min_p.is_some() - { - c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); - } - if let Some(ctx) = req.num_ctx { - c.set_num_ctx(Some(ctx)); - } - return Ok((Box::new(c), ollama_client)); - } - - // Local mode — env switch decides between Ollama and llama-swap. - if crate::ai::local_backend_is_llamacpp() - && let Some(arc) = self.llamacpp.as_ref() - { - let mut c: LlamaCppClient = (**arc).clone(); - if let Some(m) = custom_model { - c.primary_model = m.to_string(); - } - if req.temperature.is_some() - || req.top_p.is_some() - || req.top_k.is_some() - || req.min_p.is_some() - { - c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); - } - if let Some(ctx) = req.num_ctx { - c.set_num_ctx(Some(ctx)); - } - return Ok((Box::new(c), ollama_client)); - } - - if let Some(m) = custom_model - && m != self.ollama.primary_model - { - ollama_client = OllamaClient::new( - self.ollama.primary_url.clone(), - self.ollama.fallback_url.clone(), - m.to_string(), - Some(m.to_string()), - ); - } - if req.temperature.is_some() - || req.top_p.is_some() - || req.top_k.is_some() - || req.min_p.is_some() - { - ollama_client.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); - } - if let Some(ctx) = req.num_ctx { - ollama_client.set_num_ctx(Some(ctx)); - } - Ok((Box::new(ollama_client.clone()), ollama_client)) - } - /// Generate a short title via the same chat backend so voice stays /// consistent with the body. Mirrors generate_agentic_insight_for_photo's /// titling pass. async fn generate_title( &self, - chat_backend: &dyn LlmClient, + backend: &ResolvedBackend, final_content: &str, ) -> Result { let title_prompt = format!( @@ -1248,7 +1058,8 @@ impl InsightChatService { Capture the key moment or theme. Return ONLY the title, nothing else.", final_content ); - let title_raw = chat_backend + let title_raw = backend + .chat() .generate( &title_prompt, Some( @@ -1266,18 +1077,13 @@ impl InsightChatService { /// final assistant content. async fn run_streaming_agentic_loop( &self, - chat_backend: &dyn LlmClient, - ollama_client: &OllamaClient, + backend: &ResolvedBackend, messages: &mut Vec, tools: Vec, image_base64: &Option, normalized: &str, user_id: i32, active_persona: &str, - // Provenance — stamped onto any store_fact tool call made - // during this loop. Mirrors the non-streaming chat path. - model_used: &str, - effective_backend: &str, max_iterations: usize, tx: &tokio::sync::mpsc::Sender, ) -> Result { @@ -1296,7 +1102,8 @@ impl InsightChatService { }) .await; - let mut stream = chat_backend + let mut stream = backend + .chat() .chat_with_tools_stream(messages.clone(), tools.clone()) .await?; @@ -1353,13 +1160,11 @@ impl InsightChatService { .execute_tool( &tool_call.function.name, &tool_call.function.arguments, - ollama_client, + backend, image_base64, normalized, user_id, active_persona, - model_used, - effective_backend, &cx, ) .await; @@ -1394,7 +1199,8 @@ impl InsightChatService { messages.push(ChatMessage::user( "Please write your final answer now without calling any more tools.", )); - let mut stream = chat_backend + let mut stream = backend + .chat() .chat_with_tools_stream(messages.clone(), vec![]) .await?; let mut final_message: Option = None; diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index 075001a..f95c6dc 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -1594,29 +1594,24 @@ Return ONLY the summary, nothing else."#, &self, tool_name: &str, arguments: &serde_json::Value, - ollama: &OllamaClient, + backend: &ResolvedBackend, image_base64: &Option, file_path: &str, user_id: i32, persona_id: &str, - // Provenance — written into entity_facts.created_by_* when - // the loop calls store_fact. The caller knows the actual - // chat-runtime model and backend (which may differ from - // ollama.primary_model in hybrid mode where chat lives on - // OpenRouter while Ollama still handles vision). - model: &str, - backend: &str, cx: &opentelemetry::Context, ) -> String { + let model = backend.model(); + let backend_label = backend.kind.as_str(); let result = match tool_name { - "search_rag" => self.tool_search_rag(arguments, ollama, cx).await, + "search_rag" => self.tool_search_rag(arguments, backend.local(), cx).await, "search_messages" => self.tool_search_messages(arguments, cx).await, "get_sms_messages" => self.tool_get_sms_messages(arguments, cx).await, "get_calendar_events" => self.tool_get_calendar_events(arguments, cx).await, "get_location_history" => self.tool_get_location_history(arguments, cx).await, "get_file_tags" => self.tool_get_file_tags(arguments, cx).await, "get_faces_in_photo" => self.tool_get_faces_in_photo(arguments, cx).await, - "describe_photo" => self.tool_describe_photo(ollama, image_base64).await, + "describe_photo" => self.tool_describe_photo(backend.local(), image_base64).await, "reverse_geocode" => self.tool_reverse_geocode(arguments).await, "get_personal_place_at" => self.tool_get_personal_place_at(arguments).await, "recall_entities" => self.tool_recall_entities(arguments, cx).await, @@ -1624,19 +1619,19 @@ Return ONLY the summary, nothing else."#, self.tool_recall_facts_for_photo(arguments, user_id, persona_id, cx) .await } - "store_entity" => self.tool_store_entity(arguments, ollama, cx).await, + "store_entity" => self.tool_store_entity(arguments, cx).await, "store_fact" => { self.tool_store_fact( - arguments, file_path, user_id, persona_id, model, backend, cx, + arguments, file_path, user_id, persona_id, model, backend_label, cx, ) .await } "update_fact" => { - self.tool_update_fact(arguments, user_id, persona_id, model, backend, cx) + self.tool_update_fact(arguments, user_id, persona_id, model, backend_label, cx) .await } "supersede_fact" => { - self.tool_supersede_fact(arguments, user_id, persona_id, model, backend, cx) + self.tool_supersede_fact(arguments, user_id, persona_id, model, backend_label, cx) .await } "get_current_datetime" => Self::tool_get_current_datetime(), @@ -1654,7 +1649,7 @@ Return ONLY the summary, nothing else."#, async fn tool_search_rag( &self, args: &serde_json::Value, - ollama: &OllamaClient, + local: &dyn LlmClient, _cx: &opentelemetry::Context, ) -> String { let query = match args.get("query").and_then(|v| v.as_str()) { @@ -1718,7 +1713,7 @@ Return ONLY the summary, nothing else."#, }; let final_results = if rerank_enabled && results.len() > limit { - match self.rerank_with_llm(&query, &results, limit, ollama).await { + match self.rerank_with_llm(&query, &results, limit, local).await { Ok(reordered) => reordered, Err(e) => { log::warn!("rerank failed, using vector order: {}", e); @@ -1744,7 +1739,7 @@ Return ONLY the summary, nothing else."#, query: &str, candidates: &[String], limit: usize, - ollama: &OllamaClient, + local: &dyn LlmClient, ) -> Result> { let query_preview: String = query.chars().take(60).collect(); log::info!( @@ -1785,15 +1780,7 @@ Return ONLY the summary, nothing else."#, let system = Some( "You are a terse relevance ranker. You output only numbers separated by commas.", ); - let response = if crate::ai::local_backend_is_llamacpp() { - if let Some(ref lc) = self.llamacpp { - lc.generate(&prompt, system, None).await? - } else { - ollama.generate_no_think(&prompt, system).await? - } - } else { - ollama.generate_no_think(&prompt, system).await? - }; + let response = local.generate(&prompt, system, None).await?; log::info!( "rerank: finished in {} ms (prompt={} chars)", started.elapsed().as_millis(), @@ -2365,31 +2352,17 @@ Return ONLY the summary, nothing else."#, out } - /// Tool: describe_photo — generate a visual description of the photo. - /// Routes through llama-swap when `LLM_BACKEND=llamacpp`, Ollama otherwise. async fn tool_describe_photo( &self, - ollama: &OllamaClient, + local: &dyn LlmClient, image_base64: &Option, ) -> String { log::info!("tool_describe_photo: generating visual description"); - match image_base64 { - Some(img) => { - let result = if crate::ai::local_backend_is_llamacpp() { - if let Some(ref lc) = self.llamacpp { - lc.describe_image(img).await - } else { - ollama.generate_photo_description(img).await - } - } else { - ollama.generate_photo_description(img).await - }; - match result { - Ok(desc) => desc, - Err(e) => format!("Error describing photo: {}", e), - } - } + Some(img) => match local.describe_image(img).await { + Ok(desc) => desc, + Err(e) => format!("Error describing photo: {}", e), + }, None => "No image available for description.".to_string(), } } @@ -2635,7 +2608,6 @@ Return ONLY the summary, nothing else."#, async fn tool_store_entity( &self, args: &serde_json::Value, - _ollama: &OllamaClient, cx: &opentelemetry::Context, ) -> String { use crate::database::models::InsertEntity; @@ -3775,243 +3747,25 @@ Return ONLY the summary, nothing else."#, span.set_attribute(KeyValue::new("file_path", file_path.clone())); span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64)); - // 1a. Resolve backend label (defaults to "local"). - let backend_label = backend - .as_deref() - .map(|s| s.trim().to_lowercase()) - .filter(|s| !s.is_empty()) - .unwrap_or_else(|| "local".to_string()); - if !matches!(backend_label.as_str(), "local" | "hybrid") { - return Err(anyhow::anyhow!( - "unknown backend '{}'; expected 'local' or 'hybrid'", - backend_label - )); - } - span.set_attribute(KeyValue::new("backend", backend_label.clone())); - let is_hybrid = backend_label == "hybrid"; - // `LLM_BACKEND=llamacpp` swaps Ollama out for llama-swap as the - // "local" stack — chat + embeddings route through llama-swap. - // llamacpp models receive images directly (vision-capable); only - // hybrid mode (OpenRouter chat) uses describe-then-inline. - let local_via_llamacpp = - crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); - let describes_then_inlines = is_hybrid; - let ollama_is_chat = !is_hybrid && !local_via_llamacpp; - - // 1b. Always build an Ollama client. In local mode it owns the chat - // loop; in hybrid/llamacpp mode it still handles tool-local calls - // (e.g. future embedding-backed tools). The chat backend is - // selected separately below. - // Sampling overrides only apply when Ollama is the chat backend. - let apply_sampling_to_ollama = ollama_is_chat; - let mut ollama_client = if let Some(ref model) = custom_model - && ollama_is_chat - { - log::info!("Using custom model for agentic: {}", model); - span.set_attribute(KeyValue::new("custom_model", model.clone())); - OllamaClient::new( - self.ollama.primary_url.clone(), - self.ollama.fallback_url.clone(), - model.clone(), - Some(model.clone()), - ) - } else { - if ollama_is_chat { - span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone())); - } - self.ollama.clone() - }; - - if apply_sampling_to_ollama { - if let Some(ctx) = num_ctx { - log::info!("Using custom context size: {}", ctx); - span.set_attribute(KeyValue::new("num_ctx", ctx as i64)); - ollama_client.set_num_ctx(Some(ctx)); - } - - if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() { - log::info!( - "Using sampling params — temperature: {:?}, top_p: {:?}, top_k: {:?}, min_p: {:?}", - temperature, - top_p, - top_k, - min_p - ); - if let Some(t) = temperature { - span.set_attribute(KeyValue::new("temperature", t as f64)); - } - if let Some(p) = top_p { - span.set_attribute(KeyValue::new("top_p", p as f64)); - } - if let Some(k) = top_k { - span.set_attribute(KeyValue::new("top_k", k as i64)); - } - if let Some(m) = min_p { - span.set_attribute(KeyValue::new("min_p", m as f64)); - } - ollama_client.set_sampling_params(temperature, top_p, top_k, min_p); - } - } - - // 1c. In hybrid mode, clone the configured OpenRouter client and - // apply per-request overrides. - let openrouter_client: Option = if is_hybrid { - let arc = self.openrouter.as_ref().ok_or_else(|| { - anyhow::anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured") - })?; - let mut c: OpenRouterClient = (**arc).clone(); - if let Some(ref m) = custom_model { - c.primary_model = m.clone(); - span.set_attribute(KeyValue::new("custom_model", m.clone())); - } - span.set_attribute(KeyValue::new("openrouter_model", c.primary_model.clone())); - if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() { - if let Some(t) = temperature { - span.set_attribute(KeyValue::new("temperature", t as f64)); - } - if let Some(p) = top_p { - span.set_attribute(KeyValue::new("top_p", p as f64)); - } - if let Some(k) = top_k { - span.set_attribute(KeyValue::new("top_k", k as i64)); - } - if let Some(m) = min_p { - span.set_attribute(KeyValue::new("min_p", m as f64)); - } - c.set_sampling_params(temperature, top_p, top_k, min_p); - } - if let Some(ctx) = num_ctx { - span.set_attribute(KeyValue::new("num_ctx", ctx as i64)); - c.set_num_ctx(Some(ctx)); - } - Some(c) - } else { - None - }; - - // 1d. When `LLM_BACKEND=llamacpp` and we're in local mode (not - // hybrid — hybrid keeps chat on OpenRouter), clone the llamacpp - // client and apply per-request overrides. Same shape as the - // openrouter branch above; describe_image will route through - // the vision slot configured on the client. - let llamacpp_client: Option = if local_via_llamacpp && !is_hybrid { - let arc = self.llamacpp.as_ref().ok_or_else(|| { - anyhow::anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured") - })?; - let mut c: LlamaCppClient = (**arc).clone(); - if let Some(ref m) = custom_model { - c.primary_model = m.clone(); - span.set_attribute(KeyValue::new("custom_model", m.clone())); - } - span.set_attribute(KeyValue::new("llamacpp_model", c.primary_model.clone())); - if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() { - if let Some(t) = temperature { - span.set_attribute(KeyValue::new("temperature", t as f64)); - } - if let Some(p) = top_p { - span.set_attribute(KeyValue::new("top_p", p as f64)); - } - if let Some(k) = top_k { - span.set_attribute(KeyValue::new("top_k", k as i64)); - } - if let Some(m) = min_p { - span.set_attribute(KeyValue::new("min_p", m as f64)); - } - c.set_sampling_params(temperature, top_p, top_k, min_p); - } - if let Some(ctx) = num_ctx { - span.set_attribute(KeyValue::new("num_ctx", ctx as i64)); - c.set_num_ctx(Some(ctx)); - } - Some(c) - } else { - None + // 1. Resolve backend + build clients. + let kind = BackendKind::parse( + backend.as_deref().unwrap_or("local"), + )?; + span.set_attribute(KeyValue::new("backend", kind.as_str())); + let overrides = SamplingOverrides { + model: custom_model, + num_ctx, + temperature, + top_p, + top_k, + min_p, }; + let backend = self.resolve_backend(kind, &overrides).await?; + span.set_attribute(KeyValue::new("model", backend.model().to_string())); + span.set_attribute(KeyValue::new("images_inline", backend.images_inline)); let insight_cx = current_cx.with_span(span); - // 2. Verify chat model supports tool calling. - // - local: existing Ollama model availability + capability check. - // - hybrid: trust the operator's curated allowlist - // (OPENROUTER_ALLOWED_MODELS) — no live precheck. A bad model id - // surfaces as a chat-call error on the next step. - let has_vision = if describes_then_inlines { - // Hybrid: chat model never sees images — describe-then-inject. - true - } else if local_via_llamacpp { - // llama-swap models receive images directly via OpenAI content - // parts. Capability probing isn't available (no `/api/show`), - // so assume vision support; a misconfigured model surfaces as - // a chat-call error. - true - } else { - if let Some(ref model_name) = custom_model { - let available_on_primary = - OllamaClient::is_model_available(&ollama_client.primary_url, model_name) - .await - .unwrap_or(false); - - let available_on_fallback = - if let Some(ref fallback_url) = ollama_client.fallback_url { - OllamaClient::is_model_available(fallback_url, model_name) - .await - .unwrap_or(false) - } else { - false - }; - - if !available_on_primary && !available_on_fallback { - anyhow::bail!( - "model not available: '{}' not found on any configured server", - model_name - ); - } - } - - let model_name_for_caps = &ollama_client.primary_model; - let capabilities = match OllamaClient::check_model_capabilities( - &ollama_client.primary_url, - model_name_for_caps, - ) - .await - { - Ok(caps) => caps, - Err(_) => { - let fallback_url = ollama_client.fallback_url.as_deref().ok_or_else(|| { - anyhow::anyhow!( - "Failed to check model capabilities for '{}': model not found on primary server and no fallback configured", - model_name_for_caps - ) - })?; - OllamaClient::check_model_capabilities(fallback_url, model_name_for_caps) - .await - .map_err(|e| { - anyhow::anyhow!( - "Failed to check model capabilities for '{}': {}", - model_name_for_caps, - e - ) - })? - } - }; - - if !capabilities.has_tool_calling { - return Err(anyhow::anyhow!( - "tool calling not supported by model '{}'", - ollama_client.primary_model - )); - } - - insight_cx - .span() - .set_attribute(KeyValue::new("model_has_vision", capabilities.has_vision)); - insight_cx - .span() - .set_attribute(KeyValue::new("model_has_tool_calling", true)); - - capabilities.has_vision - }; - // 3. Fetch EXIF let exif = { let mut exif_dao = self.exif_dao.lock().expect("Unable to lock ExifDao"); @@ -4103,60 +3857,33 @@ Return ONLY the summary, nothing else."#, } }; - // 7. Load image if vision capable. - // In hybrid mode we ALSO describe it locally now so the - // description can be inlined as text — the OpenRouter chat model - // never receives the base64 image directly. - let image_base64 = if has_vision { - match self.load_image_as_base64(&file_path) { - Ok(b64) => { - log::info!("Loaded image for vision-capable agentic model"); - Some(b64) - } - Err(e) => { - log::warn!("Failed to load image for agentic vision: {}", e); - None - } + // 7. Load image. Always attempted — vision-capable models get the + // base64 inline; hybrid mode describes it locally and injects text. + let image_base64 = match self.load_image_as_base64(&file_path) { + Ok(b64) => { + log::info!("Loaded image for agentic model"); + Some(b64) + } + Err(e) => { + log::warn!("Failed to load image for agentic: {}", e); + None } - } else { - None }; - // describe-then-inline path (hybrid only). Vision describe routes - // through whichever local backend is configured — llama-swap when - // `local_via_llamacpp`, otherwise Ollama. - let inlined_visual_description: Option = if describes_then_inlines { + // Describe-then-inline (hybrid only). Vision describe routes through + // the local backend so non-text work stays off OpenRouter. + let inlined_visual_description: Option = if !backend.images_inline { match image_base64.as_deref() { - Some(b64) => { - let described = if local_via_llamacpp { - self.llamacpp - .as_ref() - .expect("local_via_llamacpp guarantees Some") - .describe_image(b64) - .await - } else { - self.ollama.describe_image(b64).await - }; - - match described { - Ok(desc) => { - log::info!( - "{}: vision describe succeeded ({} chars)", - backend_label, - desc.len() - ); - Some(desc) - } - Err(e) => { - log::warn!( - "{}: vision describe failed, continuing without: {}", - backend_label, - e - ); - None - } + Some(b64) => match backend.local().describe_image(b64).await { + Ok(desc) => { + log::info!("{}: vision describe succeeded ({} chars)", kind, desc.len()); + Some(desc) } - } + Err(e) => { + log::warn!("{}: vision describe failed, continuing without: {}", kind, e); + None + } + }, None => None, } } else { @@ -4228,34 +3955,24 @@ Return ONLY the summary, nothing else."#, date = date_taken.format("%B %d, %Y"), ); - // 10. Define tools. Gate flags computed from current data presence; - // hybrid mode omits describe_photo since the chat model receives - // the visual description inline (so we pass `false` for - // has_vision in that mode regardless of the model's actual - // capability). - let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines); + // 10. Define tools. describe_photo offered only when the chat model + // sees images directly (images_inline); in hybrid mode the visual + // description is already inlined as text. + let gate_opts = self.current_gate_opts(backend.images_inline); let tools = Self::build_tool_definitions(gate_opts); - // 11. Build initial messages. In describe-then-inline modes images - // are never attached to the wire message — the description is part - // of `user_content`. + // 11. Build initial messages. images_inline → attach base64 to the + // user message; describe-then-inline → text was already injected. let system_msg = ChatMessage::system(system_content); let mut user_msg = ChatMessage::user(user_content); - if !describes_then_inlines && let Some(ref img) = image_base64 { - user_msg.images = Some(vec![img.clone()]); + if backend.images_inline { + if let Some(ref img) = image_base64 { + user_msg.images = Some(vec![img.clone()]); + } } let mut messages = vec![system_msg, user_msg]; - // 12. Agentic loop — dispatch through the selected backend. - let chat_backend: &dyn LlmClient = if let Some(ref lc_c) = llamacpp_client { - lc_c - } else if let Some(ref or_c) = openrouter_client { - or_c - } else { - &ollama_client - }; - let loop_span = tracer.start_with_context("ai.agentic.loop", &insight_cx); let loop_cx = insight_cx.with_span(loop_span); @@ -4268,7 +3985,8 @@ Return ONLY the summary, nothing else."#, iterations_used = iteration + 1; log::info!("Agentic iteration {}/{}", iteration + 1, max_iterations); - let (response, prompt_tokens, eval_tokens) = chat_backend + let (response, prompt_tokens, eval_tokens) = backend + .chat() .chat_with_tools(messages.clone(), tools.clone()) .await?; @@ -4308,13 +4026,11 @@ Return ONLY the summary, nothing else."#, .execute_tool( &tool_call.function.name, &tool_call.function.arguments, - &ollama_client, + &backend, &image_base64, &file_path, user_id, &persona_id, - chat_backend.primary_model(), - &backend_label, &loop_cx, ) .await; @@ -4338,7 +4054,8 @@ Return ONLY the summary, nothing else."#, "Based on the context gathered, please write the final photo insight: a title and a detailed personal summary. Write in first person as {}.", user_display_name() ))); - let (final_response, prompt_tokens, eval_tokens) = chat_backend + let (final_response, prompt_tokens, eval_tokens) = backend + .chat() .chat_with_tools(messages.clone(), vec![]) .await?; last_prompt_eval_count = prompt_tokens; @@ -4360,7 +4077,8 @@ Return ONLY the summary, nothing else."#, let title_system = custom_system_prompt.as_deref().unwrap_or( "You are my long term memory assistant. Use only the information provided. Do not invent details.", ); - let title_raw = chat_backend + let title_raw = backend + .chat() .generate(&title_prompt, Some(title_system), None) .await?; let title = title_raw.trim().trim_matches('"').to_string(); @@ -4383,7 +4101,7 @@ Return ONLY the summary, nothing else."#, }; // 15. Store insight (returns the persisted row including its new id) - let model_version = chat_backend.primary_model().to_string(); + let model_version = backend.model().to_string(); let fewshot_source_ids_json = if fewshot_source_ids.is_empty() { None } else { @@ -4398,7 +4116,7 @@ Return ONLY the summary, nothing else."#, model_version, is_current: true, training_messages, - backend: backend_label.clone(), + backend: kind.as_str().to_string(), fewshot_source_ids: fewshot_source_ids_json, content_hash: None, }; diff --git a/src/state.rs b/src/state.rs index c4f810a..8cfccbb 100644 --- a/src/state.rs +++ b/src/state.rs @@ -290,9 +290,6 @@ impl Default for AppState { Arc::new(tokio::sync::Mutex::new(std::collections::HashMap::new())); let insight_chat = Arc::new(InsightChatService::new( Arc::new(insight_generator.clone()), - ollama.clone(), - openrouter.clone(), - llamacpp.clone(), insight_dao.clone(), chat_locks, )); @@ -470,9 +467,6 @@ impl AppState { Arc::new(tokio::sync::Mutex::new(std::collections::HashMap::new())); let insight_chat = Arc::new(InsightChatService::new( Arc::new(insight_generator.clone()), - ollama.clone(), - None, - None, insight_dao.clone(), chat_locks, )); -- 2.49.1 From fb388c29d78152b53b42cd24f29855aacd0cd109 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Sun, 24 May 2026 15:03:12 -0400 Subject: [PATCH 06/11] docs: update env + CLAUDE.md for direct-vision llamacpp + ResolvedBackend llamacpp models now receive images directly instead of describe-then-inline. LLAMA_SWAP_VISION_MODEL defaults to the primary model. Document the ResolvedBackend dispatch pattern. Co-Authored-By: Claude Opus 4.7 (1M context) --- .env.example | 10 ++++++---- CLAUDE.md | 45 +++++++++++++++++++++++++++++---------------- 2 files changed, 35 insertions(+), 20 deletions(-) diff --git a/.env.example b/.env.example index 81fab4e..f7a1004 100644 --- a/.env.example +++ b/.env.example @@ -66,15 +66,17 @@ AGENTIC_CHAT_MAX_ITERATIONS=6 # ── AI Insights — llama.cpp / llama-swap (optional) ───────────────────── # Set LLAMA_SWAP_URL plus LLM_BACKEND=llamacpp to swap the local stack # off Ollama. Talks OpenAI-compatible /v1 to a llama-swap proxy fronting -# per-slot llama-server instances (chat / vision / embed). The chat slot -# is treated as text-only — images are pre-described via the vision slot -# and inlined into the prompt. +# per-slot llama-server instances. Chat models receive images directly +# via content-parts (vision-capable models assumed); a separate vision +# slot is used only by the describe_photo tool and describe-image utility. # LLAMA_SWAP_URL=http://localhost:9292/v1 # LLAMA_SWAP_PRIMARY_MODEL=chat +# Optional dedicated vision slot for describe_image. Defaults to +# PRIMARY_MODEL so describe_photo works without extra config. # LLAMA_SWAP_VISION_MODEL=vision # LLAMA_SWAP_EMBEDDING_MODEL=embed # Comma-separated allowlist surfaced by /insights/models when -# LLM_BACKEND=llamacpp. +# LLM_BACKEND=llamacpp. All report has_vision=true. # LLAMA_SWAP_ALLOWED_MODELS=chat,vision,embed # LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180 diff --git a/CLAUDE.md b/CLAUDE.md index d06b29f..7f1da76 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -640,17 +640,16 @@ OPENROUTER_APP_TITLE=ImageApi # Optional attribution header LLM_BACKEND=ollama # llama.cpp / llama-swap (used when LLM_BACKEND=llamacpp). OpenAI-compatible -# proxy hosting one or more llama-server processes (chat / vision / embed slots). +# proxy hosting one or more llama-server processes. Chat models receive +# images directly via content-parts (all models assumed vision-capable). LLAMA_SWAP_URL=http://localhost:9292/v1 # Required when LLM_BACKEND=llamacpp LLAMA_SWAP_PRIMARY_MODEL=chat # Chat slot id (matches config.yaml) -LLAMA_SWAP_VISION_MODEL=vision # Vision slot id; describe_image routes here. - # The only slot reported as has_vision=true in - # /insights/models — chat slots are treated as - # text-only (images pre-described and inlined). +LLAMA_SWAP_VISION_MODEL= # Dedicated vision slot for describe_image / describe_photo + # tool. Defaults to PRIMARY_MODEL when unset. LLAMA_SWAP_EMBEDDING_MODEL=embed # Embedding slot id LLAMA_SWAP_ALLOWED_MODELS=chat,coder # Curated allowlist surfaced by GET /insights/models - # when LLM_BACKEND=llamacpp. Empty = picker shows - # only the configured primary model. + # when LLM_BACKEND=llamacpp. All report has_vision=true. + # Empty = picker shows only the configured primary model. LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180 # Per-request timeout; bump for slow CPU offload # Insight Chat Continuation @@ -675,21 +674,35 @@ This allows runtime verification of model availability before generating insight **Local backend switch (`LLM_BACKEND`):** One env var decides which "local" stack the server runs against — `ollama` -(default) or `llamacpp`. It's global on purpose: chat, vision describe, and +(default) or `llamacpp`. It's global on purpose: chat, vision, and embeddings all route through the same backend, so the embedding-vector column in SQLite stays in one vector space. Don't flip mid-deploy without re-embedding the affected rows — similarity search will collapse. -- `LLM_BACKEND=ollama`: chat and embeddings use Ollama; vision describe - uses Ollama's multimodal model. -- `LLM_BACKEND=llamacpp`: chat hits llama-swap's `chat` slot (which is - treated as text-only — images are pre-described via the `vision` slot - and inlined), embeddings hit the `embed` slot, vision describe hits the - `vision` slot. Requires `LLAMA_SWAP_URL`. +- `LLM_BACKEND=ollama`: chat, vision, and embeddings use Ollama. Vision + capability is probed per-model via `/api/show`. +- `LLM_BACKEND=llamacpp`: chat models receive images directly via OpenAI + content-parts (all models assumed vision-capable). Embeddings hit the + `embed` slot. A dedicated `LLAMA_SWAP_VISION_MODEL` slot (defaults to + the chat model) handles `describe_image` for the `describe_photo` tool. + Requires `LLAMA_SWAP_URL`. The per-request `backend=hybrid` override is orthogonal: it always sends -chat to OpenRouter, but the describe pass still routes through whichever -`LLM_BACKEND` is configured. +chat to OpenRouter (text-only, images are pre-described and inlined), but +the describe + embed passes still route through whichever `LLM_BACKEND` +is configured. + +**Backend dispatch (`ResolvedBackend`):** + +`InsightGenerator::resolve_backend(kind, overrides)` is the single entry +point that builds clients for a request. Returns a `ResolvedBackend` with +two roles: `.chat()` (the agentic/chat client) and `.local()` (local-only +utility calls: rerank, describe_image, embeddings). `BackendKind` is an +enum (`Local` | `Hybrid`) replacing the stringly-typed `"local"` / +`"hybrid"` labels. `SamplingOverrides` groups model/ctx/temp/top_p/top_k/ +min_p per-request overrides. All downstream code (`execute_tool`, +`run_streaming_agentic_loop`, etc.) takes `&ResolvedBackend` rather than +individual client references. `GET /insights/models` returns the local-backend models with capabilities in the same envelope shape regardless of `LLM_BACKEND`: Ollama servers -- 2.49.1 From 208344ad98c4c3c232a1ff79df15a51ee8f6255e Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Sun, 24 May 2026 19:27:29 -0400 Subject: [PATCH 07/11] ai: mirror chat model on local client to prevent mid-turn model swap When the user selects a model from the picker, the local client's primary_model and vision_model now match the chat model. Prevents llama-swap exclusive mode from swapping models when describe_photo or rerank fires during an agentic turn. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ai/insight_generator.rs | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index f95c6dc..80a7627 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -3635,8 +3635,16 @@ Return ONLY the summary, nothing else."#, }; // ── local client (utility calls: rerank, describe_image, etc.) ─ + // For llamacpp: mirror the chat model selection so rerank / + // describe_image hit the same model that's already loaded — + // avoids a mid-turn model swap in llama-swap exclusive mode. let local: Box = if local_via_llamacpp { - Box::new(self.llamacpp.as_ref().unwrap().as_ref().clone()) + let mut lc = self.llamacpp.as_ref().unwrap().as_ref().clone(); + if let Some(ref m) = overrides.model { + lc.primary_model = m.clone(); + lc.set_vision_model(m.clone()); + } + Box::new(lc) } else { Box::new(self.ollama.clone()) }; -- 2.49.1 From 9dba659d1e35b52f7a5e9f3ee4d09f65dcafcc39 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Sun, 24 May 2026 19:29:51 -0400 Subject: [PATCH 08/11] test: add llamacpp model-slot consistency and content-null tests Cover the properties that prevent mid-turn model swaps in llama-swap exclusive mode: vision_model defaults to primary, cloned local client mirrors the user-selected model, embeddings stay on their own slot. Also test the content:null serialization for tool-calling messages. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ai/llamacpp.rs | 87 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 87 insertions(+) diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs index 2a03aed..77cd05f 100644 --- a/src/ai/llamacpp.rs +++ b/src/ai/llamacpp.rs @@ -985,4 +985,91 @@ mod tests { assert!(vision.has_vision); assert!(other.has_vision); } + + #[test] + fn vision_model_defaults_to_primary() { + let c = LlamaCppClient::new(None, Some("qwen3:32b".into())); + assert_eq!(c.primary_model, "qwen3:32b"); + assert_eq!(c.vision_model, "qwen3:32b"); + } + + #[test] + fn vision_model_explicit_override_diverges_from_primary() { + let mut c = LlamaCppClient::new(None, Some("qwen3:32b".into())); + c.set_vision_model("minicpm-v".into()); + assert_eq!(c.primary_model, "qwen3:32b"); + assert_eq!(c.vision_model, "minicpm-v"); + } + + #[test] + fn cloned_local_with_model_override_keeps_all_slots_consistent() { + // Simulates what resolve_backend does for the `local` client: + // clone the configured client, then override primary + vision + // to match the user-selected chat model. This prevents mid-turn + // model swaps in llama-swap exclusive mode. + let mut base = LlamaCppClient::new(None, Some("chat".into())); + base.set_vision_model("vision".into()); + base.set_embedding_model("embed".into()); + + let mut local = base.clone(); + let user_selected = "qwen3:32b"; + local.primary_model = user_selected.to_string(); + local.set_vision_model(user_selected.to_string()); + + // Chat generation (rerank) routes through primary_model. + assert_eq!(local.primary_model, user_selected); + // describe_image routes through vision_model. + assert_eq!(local.vision_model, user_selected); + // Embeddings stay on the dedicated slot — separate endpoint, + // no model swap conflict. + assert_eq!(local.embedding_model, "embed"); + } + + #[test] + fn assistant_tool_calls_emit_null_content() { + let msg = ChatMessage { + role: "assistant".into(), + content: String::new(), + tool_calls: Some(vec![ToolCall { + id: Some("call_1".into()), + function: ToolCallFunction { + name: "search".into(), + arguments: json!({}), + }, + }]), + images: None, + }; + let wire = LlamaCppClient::messages_to_openai(&[msg]); + assert!(wire[0]["content"].is_null(), "empty content + tool_calls should emit null"); + } + + #[test] + fn assistant_with_content_and_tool_calls_preserves_content() { + let msg = ChatMessage { + role: "assistant".into(), + content: "Let me search for that.".into(), + tool_calls: Some(vec![ToolCall { + id: Some("call_1".into()), + function: ToolCallFunction { + name: "search".into(), + arguments: json!({}), + }, + }]), + images: None, + }; + let wire = LlamaCppClient::messages_to_openai(&[msg]); + assert_eq!(wire[0]["content"], "Let me search for that."); + } + + #[test] + fn assistant_without_tool_calls_keeps_empty_string_content() { + let msg = ChatMessage { + role: "assistant".into(), + content: String::new(), + tool_calls: None, + images: None, + }; + let wire = LlamaCppClient::messages_to_openai(&[msg]); + assert_eq!(wire[0]["content"], ""); + } } -- 2.49.1 From b9175e27189f63de4c19f2341db388c7d8b4493d Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Mon, 25 May 2026 15:33:03 -0400 Subject: [PATCH 09/11] image: add xlarge (4096px) on-demand preview tier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New `PhotoSize::XLarge` variant sits between `Large` (2048px) and `Full` (original). On-demand generated and disk-cached at `_xlarge/.jpg`, same waterfall as `Large` (embedded RAW preview → ffmpeg → image crate). Sources below 4096px serve at native size. Reduces decoded bitmap memory from ~192MB (48MP full) to ~64MB for the mobile viewer's zoom tier. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/content_hash.rs | 9 ++++ src/data/mod.rs | 1 + src/handlers/image.rs | 101 +++++++++++++++++++++++++++++++++++++++--- src/thumbnails.rs | 95 +++++++++++++++++++++++++++++++++++++-- 4 files changed, 196 insertions(+), 10 deletions(-) diff --git a/src/content_hash.rs b/src/content_hash.rs index d68dbef..a2a4632 100644 --- a/src/content_hash.rs +++ b/src/content_hash.rs @@ -62,6 +62,15 @@ pub fn large_preview_path(thumbs_dir: &Path, hash: &str) -> PathBuf { .join(format!("{}.jpg", hash)) } +/// Hash-keyed xlarge-preview path: `/_xlarge//.jpg`. +pub fn xlarge_preview_path(thumbs_dir: &Path, hash: &str) -> PathBuf { + let shard = shard_prefix(hash); + thumbs_dir + .join("_xlarge") + .join(shard) + .join(format!("{}.jpg", hash)) +} + /// Hash-keyed HLS output directory: `///`. /// The playlist lives at `playlist.m3u8` inside this directory and its /// segments are co-located so HLS relative references Just Work. See diff --git a/src/data/mod.rs b/src/data/mod.rs index 4780d6c..931b3c6 100644 --- a/src/data/mod.rs +++ b/src/data/mod.rs @@ -194,6 +194,7 @@ pub enum MediaType { #[serde(rename_all = "lowercase")] pub enum PhotoSize { Full, + XLarge, Large, Thumb, } diff --git a/src/handlers/image.rs b/src/handlers/image.rs index ca0f598..c1dbffb 100644 --- a/src/handlers/image.rs +++ b/src/handlers/image.rs @@ -83,12 +83,14 @@ pub async fn get_image( if let Some((library, path)) = resolved { let image_size = req.size.unwrap_or(PhotoSize::Full); - // `size=large` is only meaningful for stills — there's no useful - // "2048px video preview" tier. Videos fall back to the existing - // thumb pipeline (which already handles gif/static selection). - // `mut` so the Large branch can downgrade itself to `Full` after a - // generation failure (RAW-preview branch below keys off `Full`). - let mut image_size = if image_size == PhotoSize::Large && file_types::is_video_file(&path) { + // `size=large|xlarge` is only meaningful for stills — there's no + // useful "resized video preview" tier. Videos fall back to the + // existing thumb pipeline (which already handles gif/static + // selection). `mut` so preview branches can downgrade to `Full` + // after a generation failure. + let mut image_size = if (image_size == PhotoSize::Large || image_size == PhotoSize::XLarge) + && file_types::is_video_file(&path) + { PhotoSize::Thumb } else { image_size @@ -196,6 +198,93 @@ pub async fn get_image( image_size = PhotoSize::Full; } + if image_size == PhotoSize::XLarge { + let relative_path = path + .strip_prefix(&library.root_path) + .expect("Error stripping library root prefix from xlarge preview"); + let relative_path_str = relative_path.to_string_lossy().replace('\\', "/"); + let thumbs = Path::new(&app_state.thumbnail_path); + let xlarge_dir = thumbs.join("_xlarge"); + + let hash_xlarge_path: Option = { + let mut dao = exif_dao.lock().expect("Unable to lock ExifDao"); + match dao.get_exif(&context, &relative_path_str) { + Ok(Some(row)) => row + .content_hash + .as_deref() + .map(|h| content_hash::xlarge_preview_path(thumbs, h)), + _ => None, + } + }; + let scoped_legacy_xlarge_path = + content_hash::library_scoped_legacy_path(&xlarge_dir, library.id, relative_path); + + let existing = hash_xlarge_path + .as_ref() + .filter(|p| p.exists()) + .cloned() + .or_else(|| { + if scoped_legacy_xlarge_path.exists() { + Some(scoped_legacy_xlarge_path.clone()) + } else { + None + } + }); + + if let Some(found) = existing { + if let Ok(file) = NamedFile::open(&found) { + span.set_status(Status::Ok); + return file + .use_etag(true) + .use_last_modified(true) + .prefer_utf8(true) + .into_response(&request); + } + } + + let dest = hash_xlarge_path + .clone() + .unwrap_or_else(|| scoped_legacy_xlarge_path.clone()); + let src = path.clone(); + let dest_for_block = dest.clone(); + let generated = web::block(move || { + if let Some(parent) = dest_for_block.parent() { + std::fs::create_dir_all(parent)?; + } + let tmp = dest_for_block.with_extension("jpg.tmp"); + crate::thumbnails::generate_xlarge_preview(&src, &tmp)?; + std::fs::rename(&tmp, &dest_for_block)?; + Ok::<(), std::io::Error>(()) + }) + .await; + + match generated { + Ok(Ok(())) => { + if let Ok(file) = NamedFile::open(&dest) { + span.set_status(Status::Ok); + return file + .use_etag(true) + .use_last_modified(true) + .prefer_utf8(true) + .into_response(&request); + } + } + Ok(Err(e)) => { + warn!( + "XLarge preview generation failed for {:?}: {} — falling back to original", + path, e + ); + } + Err(e) => { + warn!( + "XLarge preview blocking-pool error for {:?}: {} — falling back to original", + path, e + ); + } + } + image_size = PhotoSize::Full; + } + if image_size == PhotoSize::Thumb { let relative_path = path .strip_prefix(&library.root_path) diff --git a/src/thumbnails.rs b/src/thumbnails.rs index 51b6200..a7334b0 100644 --- a/src/thumbnails.rs +++ b/src/thumbnails.rs @@ -36,12 +36,19 @@ use crate::video::actors::{generate_image_thumbnail_ffmpeg, generate_video_thumb /// `size=full` and the handler streams the original bytes. pub const LARGE_PREVIEW_MAX_DIM: u32 = 2048; -/// JPEG quality for the large preview tier. 85 is the conventional -/// "indistinguishable from source at viewing size" point — well above the -/// `image` crate's default ~75, but well below quality-90+ territory where -/// file size doubles for no perceptible win. +/// JPEG quality for the large and xlarge preview tiers. 85 is the +/// conventional "indistinguishable from source at viewing size" point — +/// well above the `image` crate's default ~75, but well below quality-90+ +/// territory where file size doubles for no perceptible win. const LARGE_PREVIEW_JPEG_QUALITY: u8 = 85; +/// Maximum long-edge size (px) for the xlarge preview tier. Bridges the +/// gap between `large` (2048px, ~16MB decoded) and the original bytes +/// (potentially 48+ MP / ~192MB decoded). At 4096px the decoded bitmap is +/// ~64MB — enough for 2-3× pinch-zoom on any phone before the viewer +/// needs to stream the true original. +pub const XLARGE_PREVIEW_MAX_DIM: u32 = 4096; + lazy_static! { pub static ref IMAGE_GAUGE: IntGauge = IntGauge::new( "imageserver_image_total", @@ -205,6 +212,86 @@ fn generate_large_preview_ffmpeg(src: &Path, dest: &Path) -> std::io::Result<()> Ok(()) } +/// Generate the on-demand xlarge-preview tier (≈4096 long edge JPEG). +/// +/// Same waterfall as [`generate_large_preview`] but targeting +/// [`XLARGE_PREVIEW_MAX_DIM`]. Sources whose long edge is already below +/// the cap are encoded at native size (no upscale). +pub fn generate_xlarge_preview(src: &Path, dest: &Path) -> std::io::Result<()> { + let orientation = exif::read_orientation(src).unwrap_or(1); + + if let Some(preview) = exif::extract_embedded_jpeg_preview(src) { + let img = image::load_from_memory(&preview).map_err(|e| { + std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("decode embedded preview {:?}: {}", src, e), + ) + })?; + let img = exif::apply_orientation(img, orientation); + return encode_xlarge_jpeg(img, dest); + } + + if file_types::needs_ffmpeg_thumbnail(src) { + return generate_xlarge_preview_ffmpeg(src, dest); + } + + let img = image::open(src).map_err(|e| { + std::io::Error::new(std::io::ErrorKind::InvalidData, format!("{:?}: {}", src, e)) + })?; + let img = exif::apply_orientation(img, orientation); + encode_xlarge_jpeg(img, dest) +} + +fn encode_xlarge_jpeg(img: image::DynamicImage, dest: &Path) -> std::io::Result<()> { + let (w, h) = img.dimensions(); + let max_dim = w.max(h); + let scaled = if max_dim > XLARGE_PREVIEW_MAX_DIM { + img.thumbnail(XLARGE_PREVIEW_MAX_DIM, XLARGE_PREVIEW_MAX_DIM) + } else { + img + }; + let file = std::fs::File::create(dest) + .map_err(|e| std::io::Error::other(format!("create {:?}: {}", dest, e)))?; + let mut writer = std::io::BufWriter::new(file); + let mut encoder = JpegEncoder::new_with_quality(&mut writer, LARGE_PREVIEW_JPEG_QUALITY); + encoder + .encode_image(&scaled) + .map_err(|e| std::io::Error::other(format!("encode {:?}: {}", dest, e)))?; + Ok(()) +} + +fn generate_xlarge_preview_ffmpeg(src: &Path, dest: &Path) -> std::io::Result<()> { + let vf = format!( + "scale='if(gt(iw,ih),min(iw,{cap}),-1)':'if(gt(iw,ih),-1,min(ih,{cap}))'", + cap = XLARGE_PREVIEW_MAX_DIM + ); + let output = Command::new("ffmpeg") + .arg("-y") + .arg("-i") + .arg(src) + .arg("-vframes") + .arg("1") + .arg("-vf") + .arg(&vf) + .arg("-q:v") + .arg("5") + .arg("-f") + .arg("image2") + .arg("-c:v") + .arg("mjpeg") + .arg(dest) + .output()?; + + if !output.status.success() { + return Err(std::io::Error::other(format!( + "ffmpeg failed ({}): {}", + output.status, + String::from_utf8_lossy(&output.stderr).trim() + ))); + } + Ok(()) +} + pub fn create_thumbnails(libs: &[libraries::Library], excluded_dirs: &[String]) { let tracer = global_tracer(); let span = tracer.start("creating thumbnails"); -- 2.49.1 From 0a627f48806732629e6cf4535f4dd904cd755eac Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Mon, 25 May 2026 21:46:18 -0400 Subject: [PATCH 10/11] Add contact name filter to SMS search tool + misc improvements - sms search tool: accept contact name, trim/validate, skip when contact_id is set, pass to API client - sms_client: new contact field in SmsSearchParams, URL-encode on wire - Tool description clarifies contact_id takes precedence when both given - Add parse_title_body helper for LLM response parsing - llamacpp backend improvements --- src/ai/insight_chat.rs | 33 +------- src/ai/insight_generator.rs | 147 ++++++++++++++++++++++++++---------- src/ai/llamacpp.rs | 46 +++++++++++ src/ai/sms_client.rs | 6 ++ 4 files changed, 165 insertions(+), 67 deletions(-) diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index 6d52f8b..86671af 100644 --- a/src/ai/insight_chat.rs +++ b/src/ai/insight_chat.rs @@ -817,7 +817,8 @@ impl InsightChatService { let mut amended_insight_id: Option = None; if req.amend { - let title = self.generate_title(&backend, &final_content).await?; + let (title, body) = crate::ai::insight_generator::parse_title_body(&final_content); + let final_content = body; // Amended rows intentionally do not inherit the parent's // `fewshot_source_ids`. The parent's few-shot influence is still @@ -1001,7 +1002,7 @@ impl InsightChatService { final_content, } = outcome; - let title = self.generate_title(&backend, &final_content).await?; + let (title, body) = crate::ai::insight_generator::parse_title_body(&final_content); let json = serde_json::to_string(&messages) .map_err(|e| anyhow!("failed to serialize chat history: {}", e))?; @@ -1009,7 +1010,7 @@ impl InsightChatService { library_id: req.library_id, file_path: normalized.clone(), title, - summary: final_content, + summary: body, generated_at: Utc::now().timestamp(), model_version: model_used.clone(), is_current: true, @@ -1045,32 +1046,6 @@ impl InsightChatService { Ok(()) } - /// Generate a short title via the same chat backend so voice stays - /// consistent with the body. Mirrors generate_agentic_insight_for_photo's - /// titling pass. - async fn generate_title( - &self, - backend: &ResolvedBackend, - final_content: &str, - ) -> Result { - let title_prompt = format!( - "Create a short title (maximum 8 words) for the following journal entry:\n\n{}\n\n\ - Capture the key moment or theme. Return ONLY the title, nothing else.", - final_content - ); - let title_raw = backend - .chat() - .generate( - &title_prompt, - Some( - "You are my long term memory assistant. Use only the information provided. Do not invent details.", - ), - None, - ) - .await?; - Ok(title_raw.trim().trim_matches('"').to_string()) - } - /// Drive the agentic loop with streaming SSE events. Shared between /// bootstrap and continuation. Mutates `messages` in place (response /// turns + tool results are appended) and returns counters + the diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index 80a7627..b0cd54c 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -28,6 +28,39 @@ use crate::otel::global_tracer; use crate::tags::TagDao; use crate::utils::{earliest_fs_time, normalize_path}; +/// Parse a "Title: ...\n\n" response into (title, body). +/// Falls back to the first sentence as the title if the model didn't +/// follow the format. +pub(crate) fn parse_title_body(raw: &str) -> (String, String) { + let trimmed = raw.trim(); + + // Try "Title: \n\n<body>" or "Title: <title>\n<body>" + if let Some(rest) = trimmed.strip_prefix("Title:").or_else(|| trimmed.strip_prefix("title:")) { + let rest = rest.trim_start(); + if let Some(split_pos) = rest.find("\n\n").or_else(|| rest.find('\n')) { + let title = rest[..split_pos].trim(); + let body = rest[split_pos..].trim(); + if !title.is_empty() && !body.is_empty() { + return (title.to_string(), body.to_string()); + } + } + } + + // Fallback: first sentence (up to first `. ` or `.\n`) becomes the title + if let Some(pos) = trimmed.find(". ").or_else(|| trimmed.find(".\n")) { + let title = &trimmed[..pos]; + let body = trimmed[pos + 1..].trim(); + if title.len() <= 100 && !body.is_empty() { + return (title.to_string(), body.to_string()); + } + } + + // Last resort: truncate to 60 chars for title, full text as body + let title: String = trimmed.chars().take(60).collect(); + let title = title.trim_end().to_string(); + (title, trimmed.to_string()) +} + /// Combine an optional personal Apollo Place with an optional Nominatim /// reverse-geocoded city, falling back to bare coordinates when neither /// resolves. Free function so we can test it cheaply without spinning up @@ -1927,6 +1960,11 @@ Return ONLY the summary, nothing else."#, .unwrap_or(20) .clamp(1, 50) as usize; let contact_id = args.get("contact_id").and_then(|v| v.as_i64()); + let contact = args.get("contact") + .and_then(|v| v.as_str()) + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .filter(|_| contact_id.is_none()); let start_ts = args.get("start_ts").and_then(|v| v.as_i64()); let end_ts = args.get("end_ts").and_then(|v| v.as_i64()); let is_mms = args.get("is_mms").and_then(|v| v.as_bool()); @@ -1934,10 +1972,11 @@ Return ONLY the summary, nothing else."#, let has_date_filter = start_ts.is_some() || end_ts.is_some(); log::info!( - "tool_search_messages: query='{}', mode={}, contact_id={:?}, range=[{:?}, {:?}], is_mms={:?}, has_media={:?}, limit={}", + "tool_search_messages: query='{}', mode={}, contact_id={:?}, contact={:?}, range=[{:?}, {:?}], is_mms={:?}, has_media={:?}, limit={}", query, mode, contact_id, + contact, start_ts, end_ts, is_mms, @@ -1952,6 +1991,7 @@ Return ONLY the summary, nothing else."#, mode: mode.as_str(), limit: user_limit, contact_id, + contact, date_from: start_ts, date_to: end_ts, is_mms, @@ -3033,28 +3073,31 @@ Return ONLY the summary, nothing else."#, } tools.push(Tool::function( - "search_messages", +"search_messages", "Search SMS/MMS messages — bodies and (for MMS) attachment text + filenames. \ - Modes: `fts5` (keyword + phrase + prefix + AND/OR/NOT + NEAR proximity), \ - `semantic` (embedding similarity, requires generated embeddings), `hybrid` (RRF merge, recommended; \ - degrades to fts5 when embeddings absent). Optional filters: `start_ts` / `end_ts` (real-UTC unix \ - seconds), `contact_id`, `is_mms` (true = MMS only, false = SMS only), `has_media` (true = messages \ - with image/video/audio attachments only). For pure date / contact browsing without keywords, prefer \ - `get_sms_messages`. \ - \n\nFTS5 query syntax (works in fts5 + hybrid modes):\n\ - - Phrase: `\"trader joe's\"` — exact word sequence (use double quotes).\n\ - - Prefix: `restaur*` — matches restaurant, restaurants, restauranteur, ….\n\ - - Boolean: `dinner AND tahoe`, `wedding OR reception OR ceremony`, `vacation NOT work` (operators must be UPPERCASE).\n\ - - Proximity: `NEAR(meeting work, 5)` — both terms within 5 tokens of each other.\n\ - - Combine: `(reception OR ceremony) AND tahoe*` — group with parens.\n\ - Unquoted multi-word queries are treated as implicit AND. Apostrophes / hyphens / colons are safe — they no longer downgrade to a slow LIKE scan. Use `mode: \"fts5\"` when you want the operators above to be authoritative; `hybrid` still respects them but may surface semantically-similar non-keyword hits alongside.\n\n\ - Examples:\n\ - - `{query: \"trader joe's\"}` — phrase across all time.\n\ - - `{query: \"dinner\", contact_id: 42, start_ts: 1700000000, end_ts: 1700604800}` — keyword within a contact and a week.\n\ - - `{query: \"vacation\", has_media: true}` — only matches that include photos / videos.\n\ - - `{query: \"wedding OR reception OR ceremony\", mode: \"fts5\"}` — any of several synonyms.\n\ - - `{query: \"restaur*\", mode: \"fts5\"}` — prefix expansion for varying word forms.\n\ - - `{query: \"NEAR(birthday cake, 5)\", mode: \"fts5\"}` — terms close together but in any order.", + Modes: `fts5` (keyword + phrase + prefix + AND/OR/NOT + NEAR proximity), \ + `semantic` (embedding similarity, requires generated embeddings), `hybrid` (RRF merge, recommended; \ + degrades to fts5 when embeddings absent). Optional filters: `start_ts` / `end_ts` (real-UTC unix \ + seconds), `contact` (contact name, case-insensitive), `contact_id` (numeric), `is_mms` \ + (true = MMS only, false = SMS only), `has_media` (true = messages with image/video/audio \ + attachments only). Prefer `contact` over `contact_id` — the name is resolved server-side. \ + If both are provided, `contact_id` takes precedence. \ + For pure date / contact browsing without keywords, prefer `get_sms_messages`. \ + \n\nFTS5 query syntax (works in fts5 + hybrid modes):\n\ + - Phrase: `\"trader joe's\"` — exact word sequence (use double quotes).\n\ + - Prefix: `restaur*` — matches restaurant, restaurants, restauranteur, ….\n\ + - Boolean: `dinner AND tahoe`, `wedding OR reception OR ceremony`, `vacation NOT work` (operators must be UPPERCASE).\n\ + - Proximity: `NEAR(meeting work, 5)` — both terms within 5 tokens of each other.\n\ + - Combine: `(reception OR ceremony) AND tahoe*` — group with parens.\n\ + Unquoted multi-word queries are treated as implicit AND. Apostrophes / hyphens / colons are safe — they no longer downgrade to a slow LIKE scan. Use `mode: \"fts5\"` when you want the operators above to be authoritative; `hybrid` still respects them but may surface semantically-similar non-keyword hits alongside.\n\n\ + Examples:\n\ + - `{query: \"trader joe's\"}` — phrase across all time.\n\ + - `{query: \"dinner\", contact: \"Mom\"}` — keyword scoped to Mom's messages.\n\ + - `{query: \"dinner\", contact_id: 42, start_ts: 1700000000, end_ts: 1700604800}` — keyword within a contact and a week.\n\ + - `{query: \"vacation\", has_media: true}` — only matches that include photos / videos.\n\ + - `{query: \"wedding OR reception OR ceremony\", mode: \"fts5\"}` — any of several synonyms.\n\ + - `{query: \"restaur*\", mode: \"fts5\"}` — prefix expansion for varying word forms.\n\ + - `{query: \"NEAR(birthday cake, 5)\", mode: \"fts5\"}` — terms close together but in any order.", serde_json::json!({ "type": "object", "required": ["query"], @@ -3063,6 +3106,7 @@ Return ONLY the summary, nothing else."#, "mode": { "type": "string", "enum": ["fts5", "semantic", "hybrid"], "description": "Search strategy. Default: hybrid." }, "limit": { "type": "integer", "description": "Max results (default 20, max 50)." }, "contact_id": { "type": "integer", "description": "Optional numeric contact id to scope the search." }, + "contact": { "type": "string", "description": "Optional contact name (case-insensitive). Resolved to contact_id server-side. Use this when you know the name but not the ID." }, "start_ts": { "type": "integer", "description": "Optional inclusive lower bound, real-UTC unix seconds." }, "end_ts": { "type": "integer", "description": "Optional inclusive upper bound, real-UTC unix seconds." }, "is_mms": { "type": "boolean", "description": "Optional: true to restrict to MMS, false to restrict to SMS." }, @@ -3534,7 +3578,8 @@ Return ONLY the summary, nothing else."#, - When you identify people / places / events / things, use store_entity + store_fact to grow the persistent memory.\n\ - Before store_entity, call recall_entities to check whether a similar name already exists; reuse the existing entity_id rather than creating a near-duplicate (e.g. \"Sara\" vs \"Sarah J.\"). The DAO will collapse obvious cosine matches, but choosing the existing id keeps facts and photo links consolidated.\n\ - Predicates should be relationship-shaped verbs that encode a queryable claim — `lives_in`, `works_at`, `attended`, `is_friend_of`, `is_parent_of`, `interested_in`, `married_to`, `owns`. DO NOT use vague speech-act predicates like `expressed`, `said`, `mentioned`, `stated`, `quoted`, `noted`, `discussed`, `thought`, `wondered`. DO NOT store quotations or sentence fragments as `object_value` — paraphrase into a structured claim. Bad: `(Cameron, expressed, \"I'm tempted to get a part-time job there\")`. Good: `(Cameron, considered_employment_at, <Place>)` or `(Cameron, interested_in, \"part-time work\")`.\n\ - - A tool returning no results is informative; continue with the others.", + - A tool returning no results is informative; continue with the others.\n\ + - When writing your final answer, start with \"Title: <short title>\" (max 8 words) on the first line, then a blank line, then the body.", ); let mut out = identity; @@ -4059,7 +4104,7 @@ Return ONLY the summary, nothing else."#, iterations_used ); messages.push(ChatMessage::user(format!( - "Based on the context gathered, please write the final photo insight: a title and a detailed personal summary. Write in first person as {}.", + "Based on the context gathered, please write the final photo insight. Start with \"Title: <short title>\" on the first line (max 8 words), then a blank line, then the detailed personal summary. Write in first person as {}.", user_display_name() ))); let (final_response, prompt_tokens, eval_tokens) = backend @@ -4077,21 +4122,11 @@ Return ONLY the summary, nothing else."#, .set_attribute(KeyValue::new("iterations_used", iterations_used as i64)); loop_cx.span().set_status(Status::Ok); - // 13. Generate title via the same backend so voice stays consistent. - let title_prompt = format!( - "Create a short title (maximum 8 words) for the following journal entry:\n\n{}\n\nCapture the key moment or theme. Return ONLY the title, nothing else.", - final_content - ); - let title_system = custom_system_prompt.as_deref().unwrap_or( - "You are my long term memory assistant. Use only the information provided. Do not invent details.", - ); - let title_raw = backend - .chat() - .generate(&title_prompt, Some(title_system), None) - .await?; - let title = title_raw.trim().trim_matches('"').to_string(); + // 13. Parse title from the model's inline response. + let (title, body) = parse_title_body(&final_content); + final_content = body; - log::info!("Agentic generated title: {}", title); + log::info!("Agentic parsed title: {}", title); let summary_preview: String = final_content.chars().take(200).collect(); log::info!( "Agentic generated summary ({} chars): {}", @@ -4742,4 +4777,40 @@ mod tests { assert!(out.contains("-> empty (pivoted)")); assert!(out.contains("Final insight: Final title")); } + + #[test] + fn parse_title_body_standard_format() { + let (t, b) = parse_title_body("Title: Summer at the Lake\n\nWe spent the afternoon..."); + assert_eq!(t, "Summer at the Lake"); + assert_eq!(b, "We spent the afternoon..."); + } + + #[test] + fn parse_title_body_single_newline() { + let (t, b) = parse_title_body("Title: Morning Walk\nThe sun was rising..."); + assert_eq!(t, "Morning Walk"); + assert_eq!(b, "The sun was rising..."); + } + + #[test] + fn parse_title_body_lowercase_prefix() { + let (t, b) = parse_title_body("title: Garden Party\n\nEveryone gathered..."); + assert_eq!(t, "Garden Party"); + assert_eq!(b, "Everyone gathered..."); + } + + #[test] + fn parse_title_body_fallback_first_sentence() { + let (t, b) = parse_title_body("A warm summer day. We gathered at the park for a picnic."); + assert_eq!(t, "A warm summer day"); + assert_eq!(b, "We gathered at the park for a picnic."); + } + + #[test] + fn parse_title_body_fallback_truncate() { + let input = "A single long paragraph with no periods or title prefix that just keeps going on and on"; + let (t, b) = parse_title_body(input); + assert!(t.len() <= 60); + assert_eq!(b, input); + } } diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs index 77cd05f..8ea1063 100644 --- a/src/ai/llamacpp.rs +++ b/src/ai/llamacpp.rs @@ -354,6 +354,8 @@ impl LlamaCppClient { .and_then(|v| v.as_i64()) .map(|n| n as i32); + log_timings(&parsed, prompt_tokens, completion_tokens); + Ok((chat_msg, prompt_tokens, completion_tokens)) } } @@ -456,6 +458,7 @@ impl LlmClient for LlamaCppClient { let mut role = "assistant".to_string(); let mut prompt_tokens: Option<i32> = None; let mut completion_tokens: Option<i32> = None; + let mut last_frame: Option<Value> = None; let mut done_seen = false; while let Some(chunk) = byte_stream.next().await { @@ -505,6 +508,7 @@ impl LlmClient for LlamaCppClient { .get("completion_tokens") .and_then(|n| n.as_i64()) .map(|n| n as i32); + last_frame = Some(v.clone()); } let Some(choices) = v.get("choices").and_then(|c| c.as_array()) @@ -587,6 +591,10 @@ impl LlmClient for LlamaCppClient { Some(v) }; + if let Some(ref frame) = last_frame { + log_timings(frame, prompt_tokens, completion_tokens); + } + let message = ChatMessage { role, content: accumulated_content, @@ -720,6 +728,44 @@ impl LlamaCppClient { /// Extract a diagnostic fragment from a llama-swap / llama-server response /// that doesn't match the expected `{choices: [...]}` shape. llama-server /// returns errors as `{"error": {"message": "...", "code": N, "type": "..."}}`; +fn log_timings(parsed: &Value, prompt_tokens: Option<i32>, completion_tokens: Option<i32>) { + let timings = match parsed.get("timings") { + Some(t) => t, + None => return, + }; + let prompt_tps = timings.get("prompt_per_second").and_then(|v| v.as_f64()); + let gen_tps = timings.get("predicted_per_second").and_then(|v| v.as_f64()); + let prompt_ms = timings.get("prompt_ms").and_then(|v| v.as_f64()); + let gen_ms = timings.get("predicted_ms").and_then(|v| v.as_f64()); + + let mut parts: Vec<String> = Vec::new(); + if let Some(c) = prompt_tokens { + let mut s = format!("prompt={} tok", c); + if let Some(ms) = prompt_ms { + s.push_str(&format!(" ({:.0} ms", ms)); + if let Some(tps) = prompt_tps { + s.push_str(&format!(", {:.1} tok/s", tps)); + } + s.push(')'); + } + parts.push(s); + } + if let Some(c) = completion_tokens { + let mut s = format!("gen={} tok", c); + if let Some(ms) = gen_ms { + s.push_str(&format!(" ({:.0} ms", ms)); + if let Some(tps) = gen_tps { + s.push_str(&format!(", {:.1} tok/s", tps)); + } + s.push(')'); + } + parts.push(s); + } + if !parts.is_empty() { + log::info!("llama-swap chat metrics — {}", parts.join(", ")); + } +} + /// llama-swap itself sometimes wraps subprocess failures with its own /// `{"error": "..."}` flat shape. Surface either when present, otherwise fall /// back to a truncated raw-JSON view. diff --git a/src/ai/sms_client.rs b/src/ai/sms_client.rs index 6661bac..d5e175f 100644 --- a/src/ai/sms_client.rs +++ b/src/ai/sms_client.rs @@ -281,6 +281,9 @@ impl SmsApiClient { if let Some(cid) = params.contact_id { url.push_str(&format!("&contact_id={}", cid)); } + if let Some(ref c) = params.contact { + url.push_str(&format!("&contact={}", urlencoding::encode(c))); + } if let Some(off) = params.offset { url.push_str(&format!("&offset={}", off)); } @@ -413,6 +416,9 @@ pub struct SmsSearchParams<'a> { pub mode: &'a str, pub limit: usize, pub contact_id: Option<i64>, + /// Contact name (case-insensitive). Resolved to a numeric ID by the + /// SMS-API server when `contact_id` is not set. + pub contact: Option<String>, /// Unix-seconds inclusive lower bound on `date`. pub date_from: Option<i64>, /// Unix-seconds inclusive upper bound on `date`. -- 2.49.1 From b03ee6034237e99248c8f4f4d7f5cb91b972ef2e Mon Sep 17 00:00:00 2001 From: Cameron Cordes <cameronc.dev@gmail.com> Date: Tue, 26 May 2026 09:55:16 -0400 Subject: [PATCH 11/11] fix: prevent hybrid mode from leaking OpenRouter model to local llamacpp client When backend=hybrid with LLM_BACKEND=llamacpp, the user-selected model (an OpenRouter id like "google/gemini-3-flash-preview") was being applied to the local LlamaCppClient's primary_model and vision_model. This caused describe_image to send the OpenRouter model name to llama-swap, which returned 400 because it has no such slot. Guard the local-client model override with !is_hybrid so it only applies in local-only mode (where the user is selecting a different local model). Bump to v1.2.0. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com> --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/ai/backend.rs | 38 +++++++--- src/ai/insight_chat.rs | 37 +++++----- src/ai/insight_generator.rs | 138 ++++++++++++++++++++++++------------ src/ai/llamacpp.rs | 32 +++++++-- src/ai/mod.rs | 2 +- 7 files changed, 172 insertions(+), 79 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3f6d0c2..5d3e4ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2051,7 +2051,7 @@ dependencies = [ [[package]] name = "image-api" -version = "1.1.0" +version = "1.2.0" dependencies = [ "actix", "actix-cors", diff --git a/Cargo.toml b/Cargo.toml index 7713970..7324001 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "image-api" -version = "1.1.0" +version = "1.2.0" authors = ["Cameron Cordes <cameronc.dev@gmail.com>"] edition = "2024" diff --git a/src/ai/backend.rs b/src/ai/backend.rs index 0fe7747..0515f1c 100644 --- a/src/ai/backend.rs +++ b/src/ai/backend.rs @@ -13,7 +13,10 @@ impl BackendKind { match s.trim().to_lowercase().as_str() { "local" | "" => Ok(Self::Local), "hybrid" => Ok(Self::Hybrid), - other => Err(anyhow!("unknown backend '{}'; expected 'local' or 'hybrid'", other)), + other => Err(anyhow!( + "unknown backend '{}'; expected 'local' or 'hybrid'", + other + )), } } @@ -65,7 +68,12 @@ impl ResolvedBackend { kind: BackendKind, images_inline: bool, ) -> Self { - Self { chat, local, kind, images_inline } + Self { + chat, + local, + kind, + images_inline, + } } pub fn chat(&self) -> &dyn LlmClient { @@ -97,21 +105,35 @@ mod tests { #[test] fn backend_kind_as_str_roundtrips() { - assert_eq!(BackendKind::parse(BackendKind::Local.as_str()).unwrap(), BackendKind::Local); - assert_eq!(BackendKind::parse(BackendKind::Hybrid.as_str()).unwrap(), BackendKind::Hybrid); + assert_eq!( + BackendKind::parse(BackendKind::Local.as_str()).unwrap(), + BackendKind::Local + ); + assert_eq!( + BackendKind::parse(BackendKind::Hybrid.as_str()).unwrap(), + BackendKind::Hybrid + ); } #[test] fn sampling_overrides_has_sampling() { let empty = SamplingOverrides { - model: None, num_ctx: None, temperature: None, - top_p: None, top_k: None, min_p: None, + model: None, + num_ctx: None, + temperature: None, + top_p: None, + top_k: None, + min_p: None, }; assert!(!empty.has_sampling()); let with_temp = SamplingOverrides { - model: None, num_ctx: Some(4096), temperature: Some(0.7), - top_p: None, top_k: None, min_p: None, + model: None, + num_ctx: Some(4096), + temperature: Some(0.7), + top_p: None, + top_k: None, + min_p: None, }; assert!(with_temp.has_sampling()); } diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index 86671af..9837733 100644 --- a/src/ai/insight_chat.rs +++ b/src/ai/insight_chat.rs @@ -308,7 +308,9 @@ impl InsightChatService { let stored_model = insight.model_version.clone(); let overrides = SamplingOverrides { - model: req.model.clone() + model: req + .model + .clone() .or_else(|| Some(stored_model.clone())) .filter(|m| !m.is_empty()), num_ctx: req.num_ctx, @@ -731,7 +733,9 @@ impl InsightChatService { let stored_model = insight.model_version.clone(); let overrides = SamplingOverrides { - model: req.model.clone() + model: req + .model + .clone() .or_else(|| Some(stored_model.clone())) .filter(|m| !m.is_empty()), num_ctx: req.num_ctx, @@ -928,17 +932,15 @@ impl InsightChatService { // images_inline backends send images directly to the chat model. let visual_block = if !backend.images_inline { match image_base64.as_deref() { - Some(b64) => { - match backend.local().describe_image(b64).await { - Ok(desc) => { - format!("Visual description (from local vision model):\n{}\n", desc) - } - Err(e) => { - log::warn!("{} bootstrap: describe_image failed: {}", kind.as_str(), e); - String::new() - } + Some(b64) => match backend.local().describe_image(b64).await { + Ok(desc) => { + format!("Visual description (from local vision model):\n{}\n", desc) } - } + Err(e) => { + log::warn!("{} bootstrap: describe_image failed: {}", kind.as_str(), e); + String::new() + } + }, None => String::new(), } } else { @@ -1328,10 +1330,7 @@ fn resolve_bootstrap_backend(supplied: Option<&str>) -> Result<String> { .filter(|s| !s.is_empty()) .unwrap_or_else(|| "local".to_string()); if !matches!(lower.as_str(), "local" | "hybrid") { - bail!( - "unknown backend '{}'; expected 'local' or 'hybrid'", - lower - ); + bail!("unknown backend '{}'; expected 'local' or 'hybrid'", lower); } Ok(lower) } @@ -1971,7 +1970,11 @@ mod tests { // Both "openrouter" and the former "llamacpp" value are unknown now. for label in &["openrouter", "llamacpp"] { let err = validate_cross_replay("local", label).unwrap_err(); - assert!(format!("{}", err).contains("unknown backend"), "label={}", label); + assert!( + format!("{}", err).contains("unknown backend"), + "label={}", + label + ); } } diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index b0cd54c..8e39c59 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -11,9 +11,9 @@ use std::sync::{Arc, Mutex}; use crate::ai::apollo_client::{ApolloClient, ApolloPlace}; use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides}; +use crate::ai::llamacpp::LlamaCppClient; use crate::ai::llm_client::LlmClient; use crate::ai::ollama::{ChatMessage, OllamaClient, Tool}; -use crate::ai::llamacpp::LlamaCppClient; use crate::ai::openrouter::OpenRouterClient; use crate::ai::sms_client::{SmsApiClient, SmsSearchHit, SmsSearchParams}; use crate::ai::user_display_name; @@ -35,7 +35,10 @@ pub(crate) fn parse_title_body(raw: &str) -> (String, String) { let trimmed = raw.trim(); // Try "Title: <title>\n\n<body>" or "Title: <title>\n<body>" - if let Some(rest) = trimmed.strip_prefix("Title:").or_else(|| trimmed.strip_prefix("title:")) { + if let Some(rest) = trimmed + .strip_prefix("Title:") + .or_else(|| trimmed.strip_prefix("title:")) + { let rest = rest.trim_start(); if let Some(split_pos) = rest.find("\n\n").or_else(|| rest.find('\n')) { let title = rest[..split_pos].trim(); @@ -1644,7 +1647,10 @@ Return ONLY the summary, nothing else."#, "get_location_history" => self.tool_get_location_history(arguments, cx).await, "get_file_tags" => self.tool_get_file_tags(arguments, cx).await, "get_faces_in_photo" => self.tool_get_faces_in_photo(arguments, cx).await, - "describe_photo" => self.tool_describe_photo(backend.local(), image_base64).await, + "describe_photo" => { + self.tool_describe_photo(backend.local(), image_base64) + .await + } "reverse_geocode" => self.tool_reverse_geocode(arguments).await, "get_personal_place_at" => self.tool_get_personal_place_at(arguments).await, "recall_entities" => self.tool_recall_entities(arguments, cx).await, @@ -1655,7 +1661,13 @@ Return ONLY the summary, nothing else."#, "store_entity" => self.tool_store_entity(arguments, cx).await, "store_fact" => { self.tool_store_fact( - arguments, file_path, user_id, persona_id, model, backend_label, cx, + arguments, + file_path, + user_id, + persona_id, + model, + backend_label, + cx, ) .await } @@ -1810,9 +1822,8 @@ Return ONLY the summary, nothing else."#, ); let started = std::time::Instant::now(); - let system = Some( - "You are a terse relevance ranker. You output only numbers separated by commas.", - ); + let system = + Some("You are a terse relevance ranker. You output only numbers separated by commas."); let response = local.generate(&prompt, system, None).await?; log::info!( "rerank: finished in {} ms (prompt={} chars)", @@ -1960,7 +1971,8 @@ Return ONLY the summary, nothing else."#, .unwrap_or(20) .clamp(1, 50) as usize; let contact_id = args.get("contact_id").and_then(|v| v.as_i64()); - let contact = args.get("contact") + let contact = args + .get("contact") .and_then(|v| v.as_str()) .map(|s| s.trim().to_string()) .filter(|s| !s.is_empty()) @@ -2710,22 +2722,17 @@ Return ONLY the summary, nothing else."#, // Generate embedding for name + description (best-effort) via the // configured local backend. let embed_text = format!("{} {}", name, description); - let embedding: Option<Vec<u8>> = match crate::ai::embed_one( - &self.ollama, - self.llamacpp.as_deref(), - &embed_text, - ) - .await - { - Ok(vec) => { - let bytes: Vec<u8> = vec.iter().flat_map(|f| f.to_le_bytes()).collect(); - Some(bytes) - } - Err(e) => { - log::warn!("Embedding generation failed for entity '{}': {}", name, e); - None - } - }; + let embedding: Option<Vec<u8>> = + match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &embed_text).await { + Ok(vec) => { + let bytes: Vec<u8> = vec.iter().flat_map(|f| f.to_le_bytes()).collect(); + Some(bytes) + } + Err(e) => { + log::warn!("Embedding generation failed for entity '{}': {}", name, e); + None + } + }; let now = chrono::Utc::now().timestamp(); let insert = InsertEntity { @@ -3606,8 +3613,7 @@ Return ONLY the summary, nothing else."#, kind: BackendKind, overrides: &SamplingOverrides, ) -> Result<ResolvedBackend> { - let local_via_llamacpp = - crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); + let local_via_llamacpp = crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); let is_hybrid = kind == BackendKind::Hybrid; // ── chat client ──────────────────────────────────────────────── @@ -3680,12 +3686,15 @@ Return ONLY the summary, nothing else."#, }; // ── local client (utility calls: rerank, describe_image, etc.) ─ - // For llamacpp: mirror the chat model selection so rerank / - // describe_image hit the same model that's already loaded — - // avoids a mid-turn model swap in llama-swap exclusive mode. + // For llamacpp in local mode: mirror the chat model selection so + // rerank / describe_image hit the same model that's already + // loaded — avoids a mid-turn model swap in llama-swap exclusive + // mode. In hybrid mode the override is an OpenRouter model id + // (e.g. "google/gemini-3-flash-preview") which llama-swap can't + // serve — keep the configured local slots. let local: Box<dyn LlmClient> = if local_via_llamacpp { let mut lc = self.llamacpp.as_ref().unwrap().as_ref().clone(); - if let Some(ref m) = overrides.model { + if !is_hybrid && let Some(ref m) = overrides.model { lc.primary_model = m.clone(); lc.set_vision_model(m.clone()); } @@ -3713,14 +3722,14 @@ Return ONLY the summary, nothing else."#, .await .unwrap_or(false); - let available_on_fallback = - if let Some(ref fallback_url) = self.ollama.fallback_url { - OllamaClient::is_model_available(fallback_url, model) - .await - .unwrap_or(false) - } else { - false - }; + let available_on_fallback = if let Some(ref fallback_url) = self.ollama.fallback_url + { + OllamaClient::is_model_available(fallback_url, model) + .await + .unwrap_or(false) + } else { + false + }; if !available_on_primary && !available_on_fallback { anyhow::bail!( @@ -3761,10 +3770,7 @@ Return ONLY the summary, nothing else."#, }; if !capabilities.has_tool_calling { - anyhow::bail!( - "tool calling not supported by model '{}'", - ollama_for_caps - ); + anyhow::bail!("tool calling not supported by model '{}'", ollama_for_caps); } capabilities.has_vision @@ -3801,9 +3807,7 @@ Return ONLY the summary, nothing else."#, span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64)); // 1. Resolve backend + build clients. - let kind = BackendKind::parse( - backend.as_deref().unwrap_or("local"), - )?; + let kind = BackendKind::parse(backend.as_deref().unwrap_or("local"))?; span.set_attribute(KeyValue::new("backend", kind.as_str())); let overrides = SamplingOverrides { model: custom_model, @@ -3933,7 +3937,11 @@ Return ONLY the summary, nothing else."#, Some(desc) } Err(e) => { - log::warn!("{}: vision describe failed, continuing without: {}", kind, e); + log::warn!( + "{}: vision describe failed, continuing without: {}", + kind, + e + ); None } }, @@ -4813,4 +4821,42 @@ mod tests { assert!(t.len() <= 60); assert_eq!(b, input); } + + /// Regression: hybrid mode was leaking the OpenRouter model override + /// into the local llamacpp client, causing describe_image to send + /// e.g. "google/gemini-3-flash-preview" to llama-swap (which 400s). + #[test] + fn resolve_backend_hybrid_does_not_leak_model_to_local_llamacpp() { + use crate::ai::llamacpp::LlamaCppClient; + + let mut base = + LlamaCppClient::new(Some("http://localhost:9292/v1".into()), Some("chat".into())); + base.set_vision_model("vision".into()); + base.set_embedding_model("embed".into()); + + let openrouter_model = "google/gemini-3-flash-preview"; + let overrides_model: Option<String> = Some(openrouter_model.into()); + let is_hybrid = true; + + // Replicate the resolve_backend local-client construction + // (lines ~3686-3695 of this file). + let mut lc = base.clone(); + if let Some(ref m) = overrides_model { + if !is_hybrid { + lc.primary_model = m.clone(); + lc.set_vision_model(m.clone()); + } + } + + // In hybrid mode the local client must keep its configured slots. + assert_eq!( + lc.vision_model, "vision", + "hybrid mode must not override vision_model with OpenRouter model" + ); + assert_eq!( + lc.primary_model, "chat", + "hybrid mode must not override primary_model with OpenRouter model" + ); + assert_eq!(lc.embedding_model, "embed"); + } } diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs index 8ea1063..e2ba00d 100644 --- a/src/ai/llamacpp.rs +++ b/src/ai/llamacpp.rs @@ -409,10 +409,7 @@ impl LlmClient for LlamaCppClient { tools.len() ); let mut body = serde_json::Map::new(); - body.insert( - "model".into(), - Value::String(self.primary_model.clone()), - ); + body.insert("model".into(), Value::String(self.primary_model.clone())); body.insert( "messages".into(), Value::Array(Self::messages_to_openai(&messages)), @@ -1071,6 +1068,28 @@ mod tests { assert_eq!(local.embedding_model, "embed"); } + #[test] + fn hybrid_mode_local_client_preserves_vision_model() { + // In hybrid mode, overrides.model is an OpenRouter model id + // (e.g. "google/gemini-3-flash-preview"). The local llamacpp + // client must NOT adopt that as its vision_model — it should + // keep the configured LLAMA_SWAP_VISION_MODEL so describe_image + // hits the correct local slot instead of sending an unknown + // model name to llama-swap. + let mut base = LlamaCppClient::new(None, Some("chat".into())); + base.set_vision_model("vision".into()); + base.set_embedding_model("embed".into()); + + // Simulate what resolve_backend SHOULD do for hybrid mode: + // clone but do NOT override primary_model / vision_model. + let local = base.clone(); + + // The local client keeps its configured slots. + assert_eq!(local.primary_model, "chat"); + assert_eq!(local.vision_model, "vision"); + assert_eq!(local.embedding_model, "embed"); + } + #[test] fn assistant_tool_calls_emit_null_content() { let msg = ChatMessage { @@ -1086,7 +1105,10 @@ mod tests { images: None, }; let wire = LlamaCppClient::messages_to_openai(&[msg]); - assert!(wire[0]["content"].is_null(), "empty content + tool_calls should emit null"); + assert!( + wire[0]["content"].is_null(), + "empty content + tool_calls should emit null" + ); } #[test] diff --git a/src/ai/mod.rs b/src/ai/mod.rs index 09cbfda..c991c71 100644 --- a/src/ai/mod.rs +++ b/src/ai/mod.rs @@ -25,11 +25,11 @@ pub use handlers::{ get_insight_handler, get_openrouter_models_handler, rate_insight_handler, }; pub use insight_generator::InsightGenerator; +pub use llamacpp::LlamaCppClient; #[allow(unused_imports)] pub use llm_client::{ ChatMessage, LlmClient, ModelCapabilities, Tool, ToolCall, ToolCallFunction, ToolFunction, }; -pub use llamacpp::LlamaCppClient; pub use ollama::{EMBEDDING_MODEL, OllamaClient}; pub use sms_client::{SmsApiClient, SmsMessage}; -- 2.49.1