From f0927f535510ccfecf91c1896041b93c2c156603 Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Wed, 20 May 2026 17:52:33 -0400
Subject: [PATCH 01/11] ai: add llamacpp backend (llama-swap) as third LLM
 client

Wires a new LlamaCppClient (OpenAI-compatible /v1 wire format) alongside
OllamaClient and OpenRouterClient. Per-slot routing for chat/vision/embed
via env (LLAMA_SWAP_URL + *_MODEL vars); capability inference uses an
env allowlist since /v1/models doesn't report modality.

InsightGenerator + InsightChatService gain three-way dispatch on
chat_backend = "local" | "hybrid" | "llamacpp". Hybrid and llamacpp
share the describe-then-inline path (text-only chat after a separate
vision describe). HYBRID_VISION_BACKEND=llamacpp lets hybrid route its
describe pass through llama-swap's vision slot while chat still goes
to OpenRouter.

Cross-replay matrix added (validate_cross_replay): local<->llamacpp
and hybrid<->llamacpp allowed; local->hybrid and llamacpp->hybrid
rejected. New /insights/llamacpp/models handler mirrors the OpenRouter
shape.
---
 CLAUDE.md                     |  49 +-
 src/ai/handlers.rs            |  30 ++
 src/ai/insight_chat.rs        | 266 ++++++---
 src/ai/insight_generator.rs   | 171 ++++--
 src/ai/llamacpp.rs            | 978 ++++++++++++++++++++++++++++++++++
 src/ai/mod.rs                 |   4 +-
 src/bin/populate_knowledge.rs |   1 +
 src/main.rs                   |   1 +
 src/state.rs                  |  70 +++
 9 files changed, 1468 insertions(+), 102 deletions(-)
 create mode 100644 src/ai/llamacpp.rs

diff --git a/CLAUDE.md b/CLAUDE.md
index 7e605cc..d3419e6 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -475,6 +475,7 @@ POST /insights/generate/agentic      (tool-calling loop; body: { file_path, back
 GET  /insights?path=...&library=...
 GET  /insights/models                (local Ollama models + capabilities)
 GET  /insights/openrouter/models     (curated OpenRouter allowlist)
+GET  /insights/llamacpp/models       (curated llama-swap slot allowlist)
 POST /insights/rate                  (thumbs up/down for training data)
 
 // Insight Chat Continuation
@@ -631,6 +632,23 @@ OPENROUTER_EMBEDDING_MODEL=openai/text-embedding-3-small  # Optional, embeddings
 OPENROUTER_HTTP_REFERER=https://your-site.example    # Optional attribution header
 OPENROUTER_APP_TITLE=ImageApi                  # Optional attribution header
 
+# llama.cpp / llama-swap (Llamacpp Backend) - sibling to Ollama; OpenAI-compatible
+# proxy hosting one or more llama-server processes (chat / vision / embed slots).
+LLAMA_SWAP_URL=http://localhost:9292/v1         # Required to enable llamacpp backend
+LLAMA_SWAP_PRIMARY_MODEL=chat                   # Chat slot id (matches config.yaml)
+LLAMA_SWAP_VISION_MODEL=vision                  # Vision slot id; describe_image routes here
+LLAMA_SWAP_EMBEDDING_MODEL=embed                # Embedding slot id (when local embeddings via llamacpp)
+LLAMA_SWAP_VISION_MODELS=qwen-vl,llava          # Comma-separated slot ids known to have vision.
+                                                # Drives `has_vision` in /insights/llamacpp/models.
+                                                # `LLAMA_SWAP_VISION_MODEL` is auto-included.
+LLAMA_SWAP_ALLOWED_MODELS=chat,coder            # Curated allowlist exposed to clients via
+                                                # GET /insights/llamacpp/models. Empty = no picker.
+LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180          # Per-request timeout; bump for slow CPU offload
+HYBRID_VISION_BACKEND=llamacpp                  # Optional override for hybrid mode's describe_image:
+                                                # `ollama` (default) or `llamacpp`. When `llamacpp`,
+                                                # hybrid still routes chat to OpenRouter but uses
+                                                # llama-swap's vision slot to describe images.
+
 # Insight Chat Continuation
 AGENTIC_CHAT_MAX_ITERATIONS=6                  # Cap on tool-calling iterations per chat turn (default 6)
 ```
@@ -652,8 +670,11 @@ This allows runtime verification of model availability before generating insight
 
 **Hybrid Backend (OpenRouter):**
 - Per-request opt-in via `backend=hybrid` on `POST /insights/generate/agentic`.
-- Local Ollama still describes the image (vision); the description is inlined
-  into the chat prompt and the agentic loop runs on OpenRouter.
+- Vision describe happens before the agentic loop; the description is inlined
+  into the chat prompt and the agentic loop runs on OpenRouter. By default
+  vision uses local Ollama, but `HYBRID_VISION_BACKEND=llamacpp` flips it to
+  llama-swap's vision slot (useful when you want chat on a frontier model and
+  vision on a local-but-not-Ollama path).
 - `request.model` (if provided) overrides `OPENROUTER_DEFAULT_MODEL` for that
   call. The mobile picker reads from `OPENROUTER_ALLOWED_MODELS`.
 - No live capability precheck — the operator-curated allowlist is trusted.
@@ -661,6 +682,30 @@ This allows runtime verification of model availability before generating insight
 - `GET /insights/openrouter/models` returns `{ models, default_model, configured }`
   for client picker UIs.
 
+**Llamacpp Backend (llama-swap):**
+- Per-request opt-in via `backend=llamacpp` on `POST /insights/generate/agentic`.
+- Sibling to Ollama: a local OpenAI-compatible proxy (mostlygeek/llama-swap)
+  fronting one or more `llama-server` processes. The chat slot is text-only
+  by default; vision and embeddings have their own slots (`LLAMA_SWAP_VISION_MODEL`,
+  `LLAMA_SWAP_EMBEDDING_MODEL`) that llama-swap routes to by model id. The
+  bundled `docker-compose.yml` + `llama-swap/config.yaml` in the opencode root
+  is the reference deploy.
+- Operates in the same describe-then-inline shape as hybrid: the chat model
+  never sees raw images. Vision describe routes through llama-swap's vision
+  slot (`describe_image` on `LlamaCppClient`).
+- `request.model` (if provided) overrides `LLAMA_SWAP_PRIMARY_MODEL` for that
+  call (must match a slot id in llama-swap's `config.yaml`). The mobile picker
+  reads from `LLAMA_SWAP_ALLOWED_MODELS`.
+- No live capability precheck — slot ids are trusted. Tool calling is assumed
+  for every slot (llama-swap entries typically launch with `--jinja`).
+- `GET /insights/llamacpp/models` returns `{ models, default_model, configured }`.
+- Cross-replay matrix (chat continuation): `local ↔ llamacpp` allowed (the
+  LlamaCppClient passes images through to the chat slot — you're responsible
+  for a vision-capable slot if the stored transcript carries images);
+  `hybrid ↔ llamacpp` allowed (both transcripts are text-only); `local →
+  hybrid` and `llamacpp → hybrid` rejected (mid-conversation description
+  source change isn't supported).
+
 **Insight Chat Continuation:**
 
 After an agentic insight is generated, the full `Vec<ChatMessage>` transcript is
diff --git a/src/ai/handlers.rs b/src/ai/handlers.rs
index 0e46057..4809b25 100644
--- a/src/ai/handlers.rs
+++ b/src/ai/handlers.rs
@@ -549,6 +549,36 @@ pub async fn get_openrouter_models_handler(
     HttpResponse::Ok().json(response)
 }
 
+#[derive(serde::Serialize)]
+pub struct LlamaCppModelsResponse {
+    pub models: Vec<String>,
+    pub default_model: Option<String>,
+    pub configured: bool,
+}
+
+/// GET /insights/llamacpp/models - Curated llama-swap model ids exposed
+/// to clients for the llamacpp backend. Returned verbatim from
+/// `LLAMA_SWAP_ALLOWED_MODELS`; no live call to llama-swap. Use
+/// `LLAMA_SWAP_URL` plus `LLAMA_SWAP_PRIMARY_MODEL` on the server side to
+/// pick the actual chat slot.
+#[get("/insights/llamacpp/models")]
+pub async fn get_llamacpp_models_handler(
+    _claims: Claims,
+    app_state: web::Data<crate::state::AppState>,
+) -> impl Responder {
+    let configured = app_state.llamacpp.is_some();
+    let default_model = app_state
+        .llamacpp
+        .as_ref()
+        .map(|c| c.primary_model.clone());
+    let response = LlamaCppModelsResponse {
+        models: app_state.llamacpp_allowed_models.clone(),
+        default_model,
+        configured,
+    };
+    HttpResponse::Ok().json(response)
+}
+
 /// POST /insights/rate - Rate an insight (thumbs up/down for training data)
 #[post("/insights/rate")]
 pub async fn rate_insight_handler(
diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs
index b2a7af8..4e87d52 100644
--- a/src/ai/insight_chat.rs
+++ b/src/ai/insight_chat.rs
@@ -9,6 +9,7 @@ use tokio::sync::Mutex as TokioMutex;
 use crate::ai::insight_generator::InsightGenerator;
 use crate::ai::llm_client::{ChatMessage, LlmClient, LlmStreamEvent, Tool};
 use crate::ai::ollama::OllamaClient;
+use crate::ai::llamacpp::LlamaCppClient;
 use crate::ai::openrouter::OpenRouterClient;
 use crate::database::InsightDao;
 use crate::database::models::InsertPhotoInsight;
@@ -93,6 +94,7 @@ pub struct InsightChatService {
     generator: Arc<InsightGenerator>,
     ollama: OllamaClient,
     openrouter: Option<Arc<OpenRouterClient>>,
+    llamacpp: Option<Arc<LlamaCppClient>>,
     insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
     chat_locks: ChatLockMap,
 }
@@ -102,6 +104,7 @@ impl InsightChatService {
         generator: Arc<InsightGenerator>,
         ollama: OllamaClient,
         openrouter: Option<Arc<OpenRouterClient>>,
+        llamacpp: Option<Arc<LlamaCppClient>>,
         insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
         chat_locks: ChatLockMap,
     ) -> Self {
@@ -109,6 +112,7 @@ impl InsightChatService {
             generator,
             ollama,
             openrouter,
+            llamacpp,
             insight_dao,
             chat_locks,
         }
@@ -303,23 +307,15 @@ impl InsightChatService {
             .map(|s| s.trim().to_lowercase())
             .filter(|s| !s.is_empty())
             .unwrap_or_else(|| stored_backend.clone());
-        if !matches!(effective_backend.as_str(), "local" | "hybrid") {
-            bail!(
-                "unknown backend '{}'; expected 'local' or 'hybrid'",
-                effective_backend
-            );
-        }
-        if stored_backend == "local" && effective_backend == "hybrid" {
-            bail!(
-                "switching from local to hybrid mid-chat isn't supported yet; \
-                 regenerate the insight in hybrid mode if you want OpenRouter chat"
-            );
-        }
+        validate_cross_replay(&stored_backend, &effective_backend)?;
         let is_hybrid = effective_backend == "hybrid";
+        let is_llamacpp = effective_backend == "llamacpp";
+        let describes_then_inlines = is_hybrid || is_llamacpp;
         span.set_attribute(KeyValue::new("backend", effective_backend.clone()));
 
         // 4. Build the chat backend client. Ollama in local mode, a freshly
-        //    cloned OpenRouter client in hybrid mode (clone so per-request
+        //    cloned OpenRouter client in hybrid mode, a freshly cloned
+        //    LlamaCppClient in llamacpp mode (clone so per-request
         //    sampling/model overrides don't leak into shared state).
         let max_iterations = req
             .max_iterations
@@ -336,6 +332,7 @@ impl InsightChatService {
 
         let mut ollama_client = self.ollama.clone();
         let mut openrouter_client: Option<OpenRouterClient> = None;
+        let mut llamacpp_client: Option<LlamaCppClient> = None;
 
         if is_hybrid {
             let arc = self.openrouter.as_ref().ok_or_else(|| {
@@ -356,6 +353,25 @@ impl InsightChatService {
                 c.set_num_ctx(Some(ctx));
             }
             openrouter_client = Some(c);
+        } else if is_llamacpp {
+            let arc = self.llamacpp.as_ref().ok_or_else(|| {
+                anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
+            })?;
+            let mut c: LlamaCppClient = (**arc).clone();
+            if let Some(ref m) = custom_model {
+                c.primary_model = m.clone();
+            }
+            if req.temperature.is_some()
+                || req.top_p.is_some()
+                || req.top_k.is_some()
+                || req.min_p.is_some()
+            {
+                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
+            }
+            if let Some(ctx) = req.num_ctx {
+                c.set_num_ctx(Some(ctx));
+            }
+            llamacpp_client = Some(c);
         } else {
             // Local-mode model swap. Build a new client when the chat model
             // differs from the configured one (mirrors the agentic pattern).
@@ -381,7 +397,9 @@ impl InsightChatService {
             }
         }
 
-        let chat_backend: &dyn LlmClient = if let Some(ref c) = openrouter_client {
+        let chat_backend: &dyn LlmClient = if let Some(ref c) = llamacpp_client {
+            c
+        } else if let Some(ref c) = openrouter_client {
             c
         } else {
             &ollama_client
@@ -389,18 +407,19 @@ impl InsightChatService {
         let model_used = chat_backend.primary_model().to_string();
         span.set_attribute(KeyValue::new("model", model_used.clone()));
 
-        // 5. Decide vision + tool set. In hybrid we always omit
-        //    `describe_photo` (matches the original generation flow). In
-        //    local we trust the stored history's first-user shape: if it
-        //    carries `images`, the original model was vision-capable, and
-        //    we keep `describe_photo` available.
+        // 5. Decide vision + tool set. In describe-then-inline modes
+        //    (hybrid, llamacpp) we always omit `describe_photo` (matches the
+        //    original generation flow). In local we trust the stored
+        //    history's first-user shape: if it carries `images`, the
+        //    original model was vision-capable, and we keep `describe_photo`
+        //    available.
         let local_first_user_has_image = messages
             .iter()
             .find(|m| m.role == "user")
             .and_then(|m| m.images.as_ref())
             .map(|imgs| !imgs.is_empty())
             .unwrap_or(false);
-        let offer_describe_tool = !is_hybrid && local_first_user_has_image;
+        let offer_describe_tool = !describes_then_inlines && local_first_user_has_image;
         // current_gate_opts(has_vision) sets gate_opts.has_vision = has_vision
         // and probes the per-table presence flags. Pass `offer_describe_tool`
         // directly — the `!is_hybrid && local_first_user_has_image` decision
@@ -799,19 +818,10 @@ impl InsightChatService {
             .map(|s| s.trim().to_lowercase())
             .filter(|s| !s.is_empty())
             .unwrap_or_else(|| stored_backend.clone());
-        if !matches!(effective_backend.as_str(), "local" | "hybrid") {
-            bail!(
-                "unknown backend '{}'; expected 'local' or 'hybrid'",
-                effective_backend
-            );
-        }
-        if stored_backend == "local" && effective_backend == "hybrid" {
-            bail!(
-                "switching from local to hybrid mid-chat isn't supported yet; \
-                 regenerate the insight in hybrid mode if you want OpenRouter chat"
-            );
-        }
+        validate_cross_replay(&stored_backend, &effective_backend)?;
         let is_hybrid = effective_backend == "hybrid";
+        let is_llamacpp = effective_backend == "llamacpp";
+        let describes_then_inlines = is_hybrid || is_llamacpp;
 
         let max_iterations = req
             .max_iterations
@@ -826,20 +836,21 @@ impl InsightChatService {
             .filter(|m| !m.is_empty());
 
         let (chat_backend_holder, ollama_client) =
-            self.build_chat_clients(is_hybrid, custom_model.as_deref(), &req)?;
+            self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?;
         let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
         let model_used = chat_backend.primary_model().to_string();
 
         // Tool set — local mode + first user turn carries an image →
-        // offer describe_photo. Hybrid: visual description was inlined
-        // when the insight was bootstrapped, no describe tool needed.
+        // offer describe_photo. Describe-then-inline modes (hybrid /
+        // llamacpp): visual description was inlined when the insight was
+        // bootstrapped, no describe tool needed.
         let local_first_user_has_image = messages
             .iter()
             .find(|m| m.role == "user")
             .and_then(|m| m.images.as_ref())
             .map(|imgs| !imgs.is_empty())
             .unwrap_or(false);
-        let offer_describe_tool = !is_hybrid && local_first_user_has_image;
+        let offer_describe_tool = !describes_then_inlines && local_first_user_has_image;
         let gate_opts = self.generator.current_gate_opts_for_persona(
             offer_describe_tool,
             Some((req.user_id, &active_persona)),
@@ -976,6 +987,8 @@ impl InsightChatService {
             .unwrap_or_else(|| "default".to_string());
         let effective_backend = resolve_bootstrap_backend(req.backend.as_deref())?;
         let is_hybrid = effective_backend == "hybrid";
+        let is_llamacpp = effective_backend == "llamacpp";
+        let describes_then_inlines = is_hybrid || is_llamacpp;
 
         let max_iterations = req
             .max_iterations
@@ -984,7 +997,7 @@ impl InsightChatService {
 
         let custom_model = req.model.clone().filter(|m| !m.is_empty());
         let (chat_backend_holder, ollama_client) =
-            self.build_chat_clients(is_hybrid, custom_model.as_deref(), &req)?;
+            self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?;
         let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
         let model_used = chat_backend.primary_model().to_string();
 
@@ -1007,21 +1020,48 @@ impl InsightChatService {
                 _ => None,
             });
 
-        // Hybrid backend: pre-describe the image via local Ollama vision
-        // so OpenRouter chat models (which can't see images directly) get
-        // the visual description as text. Mirrors the same pre-describe
-        // pass that `generate_agentic_insight_for_photo` does for hybrid.
-        let visual_block = if is_hybrid {
+        // Describe-then-inline backends (hybrid, llamacpp): pre-describe the
+        // image so a text-only chat model gets the visual description inline.
+        // Vision source: llamacpp's vision slot in llamacpp mode; in hybrid
+        // mode Ollama by default, llamacpp via `HYBRID_VISION_BACKEND=llamacpp`.
+        let visual_block = if describes_then_inlines {
             match image_base64.as_deref() {
-                Some(b64) => match self.ollama.describe_image(b64).await {
-                    Ok(desc) => {
-                        format!("Visual description (from local vision model):\n{}\n", desc)
+                Some(b64) => {
+                    let use_llamacpp_vision = if is_llamacpp {
+                        true
+                    } else {
+                        matches!(
+                            std::env::var("HYBRID_VISION_BACKEND")
+                                .ok()
+                                .as_deref()
+                                .map(|s| s.trim().to_lowercase())
+                                .as_deref(),
+                            Some("llamacpp")
+                        )
+                    };
+                    let described = if use_llamacpp_vision {
+                        match self.llamacpp.as_ref() {
+                            Some(c) => c.describe_image(b64).await,
+                            None => {
+                                log::warn!(
+                                    "bootstrap: requested llamacpp vision but LLAMA_SWAP_URL unset; falling back to Ollama"
+                                );
+                                self.ollama.describe_image(b64).await
+                            }
+                        }
+                    } else {
+                        self.ollama.describe_image(b64).await
+                    };
+                    match described {
+                        Ok(desc) => {
+                            format!("Visual description (from local vision model):\n{}\n", desc)
+                        }
+                        Err(e) => {
+                            log::warn!("{} bootstrap: describe_image failed: {}", effective_backend, e);
+                            String::new()
+                        }
                     }
-                    Err(e) => {
-                        log::warn!("hybrid bootstrap: local describe_image failed: {}", e);
-                        String::new()
-                    }
-                },
+                }
                 None => String::new(),
             }
         } else {
@@ -1031,7 +1071,7 @@ impl InsightChatService {
         // Tool gates. Local + image present → expose describe_photo so
         // the chat model can re-look at the photo on demand. Hybrid:
         // already inlined, no tool needed.
-        let offer_describe_tool = !is_hybrid && image_base64.is_some();
+        let offer_describe_tool = !describes_then_inlines && image_base64.is_some();
         let gate_opts = self.generator.current_gate_opts_for_persona(
             offer_describe_tool,
             Some((req.user_id, &active_persona)),
@@ -1057,7 +1097,7 @@ impl InsightChatService {
         );
         let system_msg = ChatMessage::system(system_content);
         let mut user_msg = ChatMessage::user(req.user_message.clone());
-        if !is_hybrid && let Some(ref img) = image_base64 {
+        if !describes_then_inlines && let Some(ref img) = image_base64 {
             user_msg.images = Some(vec![img.clone()]);
         }
         let mut messages = vec![system_msg, user_msg];
@@ -1130,19 +1170,22 @@ impl InsightChatService {
         Ok(())
     }
 
-    /// Set up chat clients (Ollama + optional OpenRouter) shared by
-    /// bootstrap and continuation. Returns the chat-side backend client
-    /// (boxed because hybrid and local return different concrete types)
-    /// and the Ollama client used for describe-image / local tool calls.
+    /// Set up chat clients (Ollama + optional OpenRouter / LlamaCpp) shared
+    /// by bootstrap and continuation. Returns the chat-side backend client
+    /// (boxed because each backend has a different concrete type) and the
+    /// Ollama client used for describe-image / local tool calls.
+    ///
+    /// `effective_backend` must be one of `"local"`, `"hybrid"`, `"llamacpp"`
+    /// (validated upstream).
     fn build_chat_clients(
         &self,
-        is_hybrid: bool,
+        effective_backend: &str,
         custom_model: Option<&str>,
         req: &ChatTurnRequest,
     ) -> Result<(Box<dyn LlmClient>, OllamaClient)> {
         let mut ollama_client = self.ollama.clone();
 
-        if is_hybrid {
+        if effective_backend == "hybrid" {
             let arc = self.openrouter.as_ref().ok_or_else(|| {
                 anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
             })?;
@@ -1163,6 +1206,27 @@ impl InsightChatService {
             return Ok((Box::new(c), ollama_client));
         }
 
+        if effective_backend == "llamacpp" {
+            let arc = self.llamacpp.as_ref().ok_or_else(|| {
+                anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
+            })?;
+            let mut c: LlamaCppClient = (**arc).clone();
+            if let Some(m) = custom_model {
+                c.primary_model = m.to_string();
+            }
+            if req.temperature.is_some()
+                || req.top_p.is_some()
+                || req.top_k.is_some()
+                || req.min_p.is_some()
+            {
+                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
+            }
+            if let Some(ctx) = req.num_ctx {
+                c.set_num_ctx(Some(ctx));
+            }
+            return Ok((Box::new(c), ollama_client));
+        }
+
         if let Some(m) = custom_model
             && m != self.ollama.primary_model
         {
@@ -1459,6 +1523,49 @@ fn resolve_date_taken_for_context(
         .map(|dt| dt.format("%Y-%m-%d").to_string())
 }
 
+/// Validate a stored→effective backend transition for a chat continuation.
+/// Continuation runs against a transcript that was generated with a specific
+/// backend; some transitions break the conversation shape:
+///
+/// - `local → hybrid` — the stored transcript has images embedded in the
+///   first user message; the openrouter chat client surfaces them through
+///   the wire, but vision-only models routed via the hybrid path may not
+///   accept that shape consistently across providers. Reject to keep the
+///   `regenerate-in-hybrid-mode` workflow as the supported answer.
+/// - `llamacpp → hybrid` — the stored transcript already has an inlined
+///   visual description produced by llama-swap's vision slot. Switching
+///   to hybrid mid-conversation would mix description sources across
+///   subsequent turns (any new image in the chat continuation would be
+///   described by ollama-vision while the original was described by
+///   llama-vision). Reject for consistency.
+///
+/// All other transitions are allowed. `local ↔ llamacpp` works because
+/// LlamaCppClient passes image content-parts through to the chat slot —
+/// the user is responsible for picking a vision-capable chat model in
+/// that case. `hybrid ↔ llamacpp` works because both transcripts are
+/// text-only (visual description inlined at bootstrap).
+fn validate_cross_replay(stored: &str, effective: &str) -> Result<()> {
+    if !matches!(effective, "local" | "hybrid" | "llamacpp") {
+        bail!(
+            "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
+            effective
+        );
+    }
+    if stored == "local" && effective == "hybrid" {
+        bail!(
+            "switching from local to hybrid mid-chat isn't supported yet; \
+             regenerate the insight in hybrid mode if you want OpenRouter chat"
+        );
+    }
+    if stored == "llamacpp" && effective == "hybrid" {
+        bail!(
+            "switching from llamacpp to hybrid mid-chat isn't supported yet; \
+             regenerate the insight in hybrid mode if you want OpenRouter chat"
+        );
+    }
+    Ok(())
+}
+
 /// Pick the backend label for bootstrap. Bootstrap has no stored insight
 /// to defer to (that's continuation's behaviour), so the default is
 /// `"local"`. Returns an error if the supplied label is non-empty but
@@ -1469,8 +1576,11 @@ fn resolve_bootstrap_backend(supplied: Option<&str>) -> Result<String> {
         .map(|s| s.trim().to_lowercase())
         .filter(|s| !s.is_empty())
         .unwrap_or_else(|| "local".to_string());
-    if !matches!(lower.as_str(), "local" | "hybrid") {
-        bail!("unknown backend '{}'; expected 'local' or 'hybrid'", lower);
+    if !matches!(lower.as_str(), "local" | "hybrid" | "llamacpp") {
+        bail!(
+            "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
+            lower
+        );
     }
     Ok(lower)
 }
@@ -2074,6 +2184,10 @@ mod tests {
     fn bootstrap_backend_accepts_local_and_hybrid_case_insensitively() {
         assert_eq!(resolve_bootstrap_backend(Some("LOCAL")).unwrap(), "local");
         assert_eq!(resolve_bootstrap_backend(Some("Hybrid")).unwrap(), "hybrid");
+        assert_eq!(
+            resolve_bootstrap_backend(Some("Llamacpp")).unwrap(),
+            "llamacpp"
+        );
         assert_eq!(
             resolve_bootstrap_backend(Some("  local  ")).unwrap(),
             "local"
@@ -2088,6 +2202,38 @@ mod tests {
         assert!(msg.contains("openrouter"));
     }
 
+    #[test]
+    fn cross_replay_rejects_local_to_hybrid() {
+        let err = validate_cross_replay("local", "hybrid").unwrap_err();
+        assert!(format!("{}", err).contains("local to hybrid"));
+    }
+
+    #[test]
+    fn cross_replay_rejects_llamacpp_to_hybrid() {
+        let err = validate_cross_replay("llamacpp", "hybrid").unwrap_err();
+        assert!(format!("{}", err).contains("llamacpp to hybrid"));
+    }
+
+    #[test]
+    fn cross_replay_allows_local_llamacpp_and_hybrid_llamacpp_transitions() {
+        // Local ↔ llamacpp: user is responsible for picking a vision-capable
+        // chat slot when the transcript has images.
+        assert!(validate_cross_replay("local", "llamacpp").is_ok());
+        assert!(validate_cross_replay("llamacpp", "local").is_ok());
+        // Hybrid ↔ llamacpp: both transcripts are text-only.
+        assert!(validate_cross_replay("hybrid", "llamacpp").is_ok());
+        // Same-backend replays are always fine.
+        assert!(validate_cross_replay("local", "local").is_ok());
+        assert!(validate_cross_replay("hybrid", "hybrid").is_ok());
+        assert!(validate_cross_replay("llamacpp", "llamacpp").is_ok());
+    }
+
+    #[test]
+    fn cross_replay_rejects_unknown_effective() {
+        let err = validate_cross_replay("local", "openrouter").unwrap_err();
+        assert!(format!("{}", err).contains("unknown backend"));
+    }
+
     #[test]
     fn bootstrap_system_message_includes_path_and_persona() {
         let out = build_bootstrap_system_message("you are helpful", "pics/IMG.jpg", None, None, "");
diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs
index 2e2da33..2a11e29 100644
--- a/src/ai/insight_generator.rs
+++ b/src/ai/insight_generator.rs
@@ -12,6 +12,7 @@ use std::sync::{Arc, Mutex};
 use crate::ai::apollo_client::{ApolloClient, ApolloPlace};
 use crate::ai::llm_client::LlmClient;
 use crate::ai::ollama::{ChatMessage, OllamaClient, Tool};
+use crate::ai::llamacpp::LlamaCppClient;
 use crate::ai::openrouter::OpenRouterClient;
 use crate::ai::sms_client::{SmsApiClient, SmsSearchHit, SmsSearchParams};
 use crate::ai::user_display_name;
@@ -68,6 +69,9 @@ pub struct InsightGenerator {
     /// Optional OpenRouter client, used when `backend=hybrid` is requested.
     /// `None` when `OPENROUTER_API_KEY` is not configured.
     openrouter: Option<Arc<OpenRouterClient>>,
+    /// Optional llama-swap client, used when `backend=llamacpp` is requested.
+    /// `None` when `LLAMA_SWAP_URL` is not configured.
+    llamacpp: Option<Arc<LlamaCppClient>>,
     sms_client: SmsApiClient,
     /// Optional integration with Apollo's user-defined Places. When the
     /// integration is disabled (`APOLLO_API_BASE_URL` unset), every
@@ -120,6 +124,7 @@ impl InsightGenerator {
     pub fn new(
         ollama: OllamaClient,
         openrouter: Option<Arc<OpenRouterClient>>,
+        llamacpp: Option<Arc<LlamaCppClient>>,
         sms_client: SmsApiClient,
         apollo_client: ApolloClient,
         insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
@@ -137,6 +142,7 @@ impl InsightGenerator {
         Self {
             ollama,
             openrouter,
+            llamacpp,
             sms_client,
             apollo_client,
             insight_dao,
@@ -3574,23 +3580,31 @@ Return ONLY the summary, nothing else."#,
             .map(|s| s.trim().to_lowercase())
             .filter(|s| !s.is_empty())
             .unwrap_or_else(|| "local".to_string());
-        if !matches!(backend_label.as_str(), "local" | "hybrid") {
+        if !matches!(backend_label.as_str(), "local" | "hybrid" | "llamacpp") {
             return Err(anyhow::anyhow!(
-                "unknown backend '{}'; expected 'local' or 'hybrid'",
+                "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
                 backend_label
             ));
         }
         span.set_attribute(KeyValue::new("backend", backend_label.clone()));
         let is_hybrid = backend_label == "hybrid";
+        let is_llamacpp = backend_label == "llamacpp";
+        // In hybrid + llamacpp modes the chat model never sees the image
+        // directly; we describe-then-inline locally before the agentic loop
+        // starts. Tracked as a single flag so vision/tool-gate logic doesn't
+        // have to branch twice.
+        let describes_then_inlines = is_hybrid || is_llamacpp;
 
         // 1b. Always build an Ollama client. In local mode it owns the chat
-        //     loop; in hybrid mode it still handles describe_image + any
-        //     tool-local calls (e.g. if a future tool needs embeddings).
-        //     Sampling overrides only apply in local mode — in hybrid the
-        //     user's params belong to the OpenRouter chat client.
-        let apply_sampling_to_ollama = !is_hybrid;
+        //     loop; in hybrid/llamacpp mode it still handles tool-local calls
+        //     (e.g. future embedding-backed tools). The chat backend is
+        //     selected separately below.
+        //     Sampling overrides only apply in local mode — in
+        //     hybrid/llamacpp the user's params belong to the alternate chat
+        //     client.
+        let apply_sampling_to_ollama = !describes_then_inlines;
         let mut ollama_client = if let Some(ref model) = custom_model
-            && !is_hybrid
+            && !describes_then_inlines
         {
             log::info!("Using custom model for agentic: {}", model);
             span.set_attribute(KeyValue::new("custom_model", model.clone()));
@@ -3601,7 +3615,7 @@ Return ONLY the summary, nothing else."#,
                 Some(model.clone()),
             )
         } else {
-            if !is_hybrid {
+            if !describes_then_inlines {
                 span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone()));
             }
             self.ollama.clone()
@@ -3674,6 +3688,44 @@ Return ONLY the summary, nothing else."#,
             None
         };
 
+        // 1d. In llamacpp mode, clone the configured LlamaCpp client and
+        //     apply per-request overrides. Same shape as the openrouter
+        //     branch above; describe_image will route through the vision
+        //     slot configured on the client.
+        let llamacpp_client: Option<LlamaCppClient> = if is_llamacpp {
+            let arc = self.llamacpp.as_ref().ok_or_else(|| {
+                anyhow::anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
+            })?;
+            let mut c: LlamaCppClient = (**arc).clone();
+            if let Some(ref m) = custom_model {
+                c.primary_model = m.clone();
+                span.set_attribute(KeyValue::new("custom_model", m.clone()));
+            }
+            span.set_attribute(KeyValue::new("llamacpp_model", c.primary_model.clone()));
+            if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() {
+                if let Some(t) = temperature {
+                    span.set_attribute(KeyValue::new("temperature", t as f64));
+                }
+                if let Some(p) = top_p {
+                    span.set_attribute(KeyValue::new("top_p", p as f64));
+                }
+                if let Some(k) = top_k {
+                    span.set_attribute(KeyValue::new("top_k", k as i64));
+                }
+                if let Some(m) = min_p {
+                    span.set_attribute(KeyValue::new("min_p", m as f64));
+                }
+                c.set_sampling_params(temperature, top_p, top_k, min_p);
+            }
+            if let Some(ctx) = num_ctx {
+                span.set_attribute(KeyValue::new("num_ctx", ctx as i64));
+                c.set_num_ctx(Some(ctx));
+            }
+            Some(c)
+        } else {
+            None
+        };
+
         let insight_cx = current_cx.with_span(span);
 
         // 2. Verify chat model supports tool calling.
@@ -3681,10 +3733,11 @@ Return ONLY the summary, nothing else."#,
         //    - hybrid: trust the operator's curated allowlist
         //      (OPENROUTER_ALLOWED_MODELS) — no live precheck. A bad model id
         //      surfaces as a chat-call error on the next step.
-        let has_vision = if is_hybrid {
-            // In hybrid mode the chat model never sees images directly — we
-            // describe-then-inject, so `has_vision` drives only whether we
-            // bother loading the image to describe it, which we always do.
+        let has_vision = if describes_then_inlines {
+            // In hybrid + llamacpp modes the chat model never sees images
+            // directly — we describe-then-inject, so `has_vision` drives only
+            // whether we bother loading the image to describe it, which we
+            // always do.
             true
         } else {
             if let Some(ref model_name) = custom_model {
@@ -3864,24 +3917,61 @@ Return ONLY the summary, nothing else."#,
             None
         };
 
-        let hybrid_visual_description: Option<String> = if is_hybrid {
+        // describe-then-inline path. In hybrid mode the vision backend
+        // defaults to Ollama but can be flipped to llamacpp via
+        // `HYBRID_VISION_BACKEND=llamacpp` (so chat goes to OpenRouter while
+        // vision/audio routes through llama-swap). In llamacpp mode we always
+        // use the llamacpp client's configured vision slot.
+        let inlined_visual_description: Option<String> = if describes_then_inlines {
             match image_base64.as_deref() {
-                Some(b64) => match self.ollama.describe_image(b64).await {
-                    Ok(desc) => {
-                        log::info!(
-                            "Hybrid: local vision describe succeeded ({} chars)",
-                            desc.len()
-                        );
-                        Some(desc)
+                Some(b64) => {
+                    let use_llamacpp_vision = if is_llamacpp {
+                        true
+                    } else {
+                        // is_hybrid branch — consult env switch
+                        matches!(
+                            std::env::var("HYBRID_VISION_BACKEND")
+                                .ok()
+                                .as_deref()
+                                .map(|s| s.trim().to_lowercase())
+                                .as_deref(),
+                            Some("llamacpp")
+                        )
+                    };
+
+                    let described = if use_llamacpp_vision {
+                        match self.llamacpp.as_ref() {
+                            Some(c) => c.describe_image(b64).await,
+                            None => {
+                                log::warn!(
+                                    "describe-then-inline: requested llamacpp vision but LLAMA_SWAP_URL is unset, falling back to Ollama"
+                                );
+                                self.ollama.describe_image(b64).await
+                            }
+                        }
+                    } else {
+                        self.ollama.describe_image(b64).await
+                    };
+
+                    match described {
+                        Ok(desc) => {
+                            log::info!(
+                                "{}: vision describe succeeded ({} chars)",
+                                backend_label,
+                                desc.len()
+                            );
+                            Some(desc)
+                        }
+                        Err(e) => {
+                            log::warn!(
+                                "{}: vision describe failed, continuing without: {}",
+                                backend_label,
+                                e
+                            );
+                            None
+                        }
                     }
-                    Err(e) => {
-                        log::warn!(
-                            "Hybrid: local vision describe failed, continuing without: {}",
-                            e
-                        );
-                        None
-                    }
-                },
+                }
                 None => None,
             }
         } else {
@@ -3934,7 +4024,7 @@ Return ONLY the summary, nothing else."#,
             .map(|c| format!("Contact/Person: {}", c))
             .unwrap_or_else(|| "Contact/Person: unknown".to_string());
 
-        let visual_block = hybrid_visual_description
+        let visual_block = inlined_visual_description
             .as_deref()
             .map(|d| format!("Visual description (from local vision model):\n{}\n\n", d))
             .unwrap_or_default();
@@ -3954,25 +4044,28 @@ Return ONLY the summary, nothing else."#,
         );
 
         // 10. Define tools. Gate flags computed from current data presence;
-        //     hybrid mode omits describe_photo since the chat model receives
-        //     the visual description inline (so we pass `false` for has_vision
-        //     in hybrid mode regardless of the model's actual capability).
-        let gate_opts = self.current_gate_opts(has_vision && !is_hybrid);
+        //     describe-then-inline modes (hybrid, llamacpp) omit describe_photo
+        //     since the chat model receives the visual description inline (so
+        //     we pass `false` for has_vision in those modes regardless of the
+        //     model's actual capability).
+        let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines);
         let tools = Self::build_tool_definitions(gate_opts);
 
-        // 11. Build initial messages. In hybrid mode images are never
-        //     attached to the wire message — the description is part of
-        //     `user_content`.
+        // 11. Build initial messages. In describe-then-inline modes images
+        //     are never attached to the wire message — the description is part
+        //     of `user_content`.
         let system_msg = ChatMessage::system(system_content);
         let mut user_msg = ChatMessage::user(user_content);
-        if !is_hybrid && let Some(ref img) = image_base64 {
+        if !describes_then_inlines && let Some(ref img) = image_base64 {
             user_msg.images = Some(vec![img.clone()]);
         }
 
         let mut messages = vec![system_msg, user_msg];
 
         // 12. Agentic loop — dispatch through the selected backend.
-        let chat_backend: &dyn LlmClient = if let Some(ref or_c) = openrouter_client {
+        let chat_backend: &dyn LlmClient = if let Some(ref lc_c) = llamacpp_client {
+            lc_c
+        } else if let Some(ref or_c) = openrouter_client {
             or_c
         } else {
             &ollama_client
diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs
new file mode 100644
index 0000000..100020c
--- /dev/null
+++ b/src/ai/llamacpp.rs
@@ -0,0 +1,978 @@
+// LlamaCppClient — talks to a llama-swap proxy that fronts one or more
+// llama-server processes. llama-swap exposes an OpenAI-compatible HTTP
+// surface (`/v1/chat/completions`, `/v1/embeddings`, `/v1/models`), so the
+// wire translation mirrors `OpenRouterClient` almost exactly.
+//
+// Differences from OpenRouter:
+// - No bearer auth or attribution headers; llama-swap is LAN-only.
+// - Three model slots (`primary_model` = chat, `vision_model`, `embedding_model`)
+//   each map to a model id in the llama-swap config. `describe_image` and
+//   `generate_embeddings` issue requests with the appropriate slot id in the
+//   `model` field, which is how llama-swap selects which backend process to
+//   run.
+// - `/v1/models` returns only the configured slot ids — capabilities aren't
+//   reported by the API, so `vision_models` is a config-time allowlist (env
+//   `LLAMA_SWAP_VISION_MODELS`) used to set `has_vision` on responses.
+//   `has_tool_calling` is assumed true for every slot, since llama-swap entries
+//   default to launching llama-server with `--jinja`.
+//
+// First consumer lands alongside the three-way backend dispatch in
+// insight_generator / insight_chat.
+#![allow(dead_code)]
+
+use anyhow::{Context, Result, anyhow, bail};
+use async_trait::async_trait;
+use reqwest::Client;
+use serde::Deserialize;
+use serde_json::{Value, json};
+use std::time::Duration;
+
+use crate::ai::llm_client::{
+    ChatMessage, LlmClient, LlmStreamEvent, ModelCapabilities, Tool, ToolCall, ToolCallFunction,
+};
+use futures::stream::{BoxStream, StreamExt};
+
+const DEFAULT_BASE_URL: &str = "http://localhost:9292/v1";
+const DEFAULT_PRIMARY_MODEL: &str = "chat";
+const DEFAULT_VISION_MODEL: &str = "vision";
+const DEFAULT_EMBEDDING_MODEL: &str = "embed";
+const DEFAULT_REQUEST_TIMEOUT_SECS: u64 = 180;
+
+/// OpenAI-compatible client targeting a llama-swap proxy in front of one or
+/// more llama-server processes. See the module doc-comment for the slot model.
+#[derive(Clone)]
+pub struct LlamaCppClient {
+    client: Client,
+    pub base_url: String,
+    /// Chat model slot id (e.g. `"chat"`). Used for `generate` /
+    /// `chat_with_tools` / `chat_with_tools_stream`.
+    pub primary_model: String,
+    /// Embedding model slot id (e.g. `"embed"`). Used for
+    /// `generate_embeddings`.
+    pub embedding_model: String,
+    /// Vision model slot id (e.g. `"vision"`). Used for `describe_image` and
+    /// included in `vision_models` automatically so capability lookups for
+    /// the default vision slot report `has_vision = true` even when the env
+    /// allowlist is empty.
+    pub vision_model: String,
+    /// Operator-curated set of slot ids known to be multimodal. Drives the
+    /// `has_vision` field in `list_models` / `model_capabilities`, since
+    /// llama-swap's `/v1/models` doesn't report modality. Empty allowlist
+    /// still marks `vision_model` as vision-capable.
+    pub vision_models: Vec<String>,
+    num_ctx: Option<i32>,
+    temperature: Option<f32>,
+    top_p: Option<f32>,
+    top_k: Option<i32>,
+    min_p: Option<f32>,
+}
+
+impl LlamaCppClient {
+    pub fn new(base_url: Option<String>, primary_model: Option<String>) -> Self {
+        let timeout_secs = std::env::var("LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS")
+            .ok()
+            .and_then(|v| v.parse::<u64>().ok())
+            .unwrap_or(DEFAULT_REQUEST_TIMEOUT_SECS);
+        Self {
+            client: Client::builder()
+                .connect_timeout(Duration::from_secs(10))
+                .timeout(Duration::from_secs(timeout_secs))
+                .build()
+                .unwrap_or_else(|_| Client::new()),
+            base_url: base_url.unwrap_or_else(|| DEFAULT_BASE_URL.to_string()),
+            primary_model: primary_model.unwrap_or_else(|| DEFAULT_PRIMARY_MODEL.to_string()),
+            embedding_model: DEFAULT_EMBEDDING_MODEL.to_string(),
+            vision_model: DEFAULT_VISION_MODEL.to_string(),
+            vision_models: Vec::new(),
+            num_ctx: None,
+            temperature: None,
+            top_p: None,
+            top_k: None,
+            min_p: None,
+        }
+    }
+
+    pub fn set_embedding_model(&mut self, model: String) {
+        self.embedding_model = model;
+    }
+
+    pub fn set_vision_model(&mut self, model: String) {
+        self.vision_model = model;
+    }
+
+    pub fn set_vision_models(&mut self, models: Vec<String>) {
+        self.vision_models = models;
+    }
+
+    pub fn set_num_ctx(&mut self, num_ctx: Option<i32>) {
+        self.num_ctx = num_ctx;
+    }
+
+    pub fn set_sampling_params(
+        &mut self,
+        temperature: Option<f32>,
+        top_p: Option<f32>,
+        top_k: Option<i32>,
+        min_p: Option<f32>,
+    ) {
+        self.temperature = temperature;
+        self.top_p = top_p;
+        self.top_k = top_k;
+        self.min_p = min_p;
+    }
+
+    /// Translate canonical messages to the OpenAI-compatible wire shape.
+    /// Behaviorally identical to `OpenRouterClient::messages_to_openai` —
+    /// stringify tool-call arguments, rewrite images into content-parts, attach
+    /// `tool_call_id` to `role=tool` messages based on the preceding assistant
+    /// turn's tool calls.
+    fn messages_to_openai(messages: &[ChatMessage]) -> Vec<Value> {
+        let mut out = Vec::with_capacity(messages.len());
+        let mut last_tool_call_ids: Vec<String> = Vec::new();
+        let mut next_tool_result_idx: usize = 0;
+
+        for msg in messages {
+            let mut obj = serde_json::Map::new();
+            obj.insert("role".into(), Value::String(msg.role.clone()));
+
+            match &msg.images {
+                Some(images) if !images.is_empty() => {
+                    let mut parts: Vec<Value> = Vec::new();
+                    if !msg.content.is_empty() {
+                        parts.push(json!({"type": "text", "text": msg.content}));
+                    }
+                    for img in images {
+                        let url = image_to_data_url(img);
+                        parts.push(json!({
+                            "type": "image_url",
+                            "image_url": { "url": url }
+                        }));
+                    }
+                    obj.insert("content".into(), Value::Array(parts));
+                }
+                _ => {
+                    obj.insert("content".into(), Value::String(msg.content.clone()));
+                }
+            }
+
+            if let Some(tcs) = &msg.tool_calls
+                && msg.role == "assistant"
+            {
+                let converted: Vec<Value> = tcs
+                    .iter()
+                    .enumerate()
+                    .map(|(i, call)| {
+                        let id = call.id.clone().unwrap_or_else(|| format!("call_{}", i));
+                        let args_str = serde_json::to_string(&call.function.arguments)
+                            .unwrap_or_else(|_| "{}".to_string());
+                        json!({
+                            "id": id,
+                            "type": "function",
+                            "function": {
+                                "name": call.function.name,
+                                "arguments": args_str,
+                            }
+                        })
+                    })
+                    .collect();
+                last_tool_call_ids = converted
+                    .iter()
+                    .filter_map(|v| v.get("id").and_then(|x| x.as_str()).map(String::from))
+                    .collect();
+                next_tool_result_idx = 0;
+                obj.insert("tool_calls".into(), Value::Array(converted));
+            }
+
+            if msg.role == "tool" {
+                let id = last_tool_call_ids
+                    .get(next_tool_result_idx)
+                    .cloned()
+                    .unwrap_or_else(|| "call_0".to_string());
+                obj.insert("tool_call_id".into(), Value::String(id));
+                next_tool_result_idx += 1;
+            }
+
+            out.push(Value::Object(obj));
+        }
+
+        out
+    }
+
+    /// Parse an OpenAI-compatible assistant message back into canonical shape.
+    /// llama.cpp emits `reasoning_content` on thinking models; we drop it for
+    /// parity with OpenRouter (which also strips upstream reasoning fields).
+    fn openai_message_to_chat(msg: &Value) -> Result<ChatMessage> {
+        let obj = msg
+            .as_object()
+            .ok_or_else(|| anyhow!("response message is not an object"))?;
+        let role = obj
+            .get("role")
+            .and_then(|v| v.as_str())
+            .unwrap_or("assistant")
+            .to_string();
+        let content = obj
+            .get("content")
+            .and_then(|v| v.as_str())
+            .unwrap_or("")
+            .to_string();
+
+        let tool_calls = if let Some(tcs) = obj.get("tool_calls").and_then(|v| v.as_array()) {
+            let mut parsed = Vec::with_capacity(tcs.len());
+            for tc in tcs {
+                let id = tc.get("id").and_then(|v| v.as_str()).map(String::from);
+                let function = tc
+                    .get("function")
+                    .ok_or_else(|| anyhow!("tool_call missing function field"))?;
+                let name = function
+                    .get("name")
+                    .and_then(|v| v.as_str())
+                    .unwrap_or_default()
+                    .to_string();
+                let args_value = match function.get("arguments") {
+                    Some(Value::String(s)) => {
+                        serde_json::from_str::<Value>(s).unwrap_or_else(|_| json!({}))
+                    }
+                    Some(v @ Value::Object(_)) => v.clone(),
+                    _ => json!({}),
+                };
+                parsed.push(ToolCall {
+                    id,
+                    function: ToolCallFunction {
+                        name,
+                        arguments: args_value,
+                    },
+                });
+            }
+            Some(parsed)
+        } else {
+            None
+        };
+
+        Ok(ChatMessage {
+            role,
+            content,
+            tool_calls,
+            images: None,
+        })
+    }
+
+    fn build_options(&self) -> Vec<(&'static str, Value)> {
+        let mut v = Vec::new();
+        if let Some(t) = self.temperature {
+            v.push(("temperature", json!(t)));
+        }
+        if let Some(p) = self.top_p {
+            v.push(("top_p", json!(p)));
+        }
+        if let Some(k) = self.top_k {
+            v.push(("top_k", json!(k)));
+        }
+        if let Some(m) = self.min_p {
+            v.push(("min_p", json!(m)));
+        }
+        // num_ctx isn't an OpenAI param; llama-server bakes ctx in at launch
+        // via -c, so we silently drop the override here. The config.yaml
+        // entry is the source of truth for context size.
+        let _ = self.num_ctx;
+        v
+    }
+
+    /// Issue a chat request with an explicit model id override. Used by
+    /// `describe_image` to route through the vision slot without mutating
+    /// `self.primary_model`.
+    async fn chat_completion_with_model(
+        &self,
+        model: &str,
+        messages: Vec<ChatMessage>,
+        tools: Vec<Tool>,
+    ) -> Result<(ChatMessage, Option<i32>, Option<i32>)> {
+        let url = format!("{}/chat/completions", self.base_url);
+        let mut body = serde_json::Map::new();
+        body.insert("model".into(), Value::String(model.to_string()));
+        body.insert(
+            "messages".into(),
+            Value::Array(Self::messages_to_openai(&messages)),
+        );
+        body.insert("stream".into(), Value::Bool(false));
+        if !tools.is_empty() {
+            body.insert(
+                "tools".into(),
+                serde_json::to_value(&tools).context("serializing tools")?,
+            );
+        }
+        for (k, v) in self.build_options() {
+            body.insert(k.into(), v);
+        }
+
+        let resp = self
+            .client
+            .post(&url)
+            .json(&Value::Object(body))
+            .send()
+            .await
+            .with_context(|| format!("POST {} failed", url))?;
+
+        if !resp.status().is_success() {
+            let status = resp.status();
+            let body = resp.text().await.unwrap_or_default();
+            bail!("llama-swap chat request failed: {} — {}", status, body);
+        }
+
+        let parsed: Value = resp.json().await.context("parsing chat response")?;
+        let choice = parsed
+            .get("choices")
+            .and_then(|v| v.as_array())
+            .and_then(|a| a.first())
+            .ok_or_else(|| {
+                anyhow!(
+                    "response missing choices[0]: {}",
+                    extract_error_detail(&parsed)
+                )
+            })?;
+        let msg = choice.get("message").ok_or_else(|| {
+            anyhow!(
+                "choices[0] missing message: {}",
+                extract_error_detail(&parsed)
+            )
+        })?;
+        let chat_msg = Self::openai_message_to_chat(msg)?;
+
+        let usage = parsed.get("usage");
+        let prompt_tokens = usage
+            .and_then(|u| u.get("prompt_tokens"))
+            .and_then(|v| v.as_i64())
+            .map(|n| n as i32);
+        let completion_tokens = usage
+            .and_then(|u| u.get("completion_tokens"))
+            .and_then(|v| v.as_i64())
+            .map(|n| n as i32);
+
+        Ok((chat_msg, prompt_tokens, completion_tokens))
+    }
+}
+
+#[async_trait]
+impl LlmClient for LlamaCppClient {
+    async fn generate(
+        &self,
+        prompt: &str,
+        system: Option<&str>,
+        images: Option<Vec<String>>,
+    ) -> Result<String> {
+        let mut messages: Vec<ChatMessage> = Vec::new();
+        if let Some(sys) = system {
+            messages.push(ChatMessage::system(sys));
+        }
+        let mut user = ChatMessage::user(prompt);
+        user.images = images;
+        messages.push(user);
+
+        let (reply, _, _) = self.chat_with_tools(messages, Vec::new()).await?;
+        Ok(reply.content)
+    }
+
+    async fn chat_with_tools(
+        &self,
+        messages: Vec<ChatMessage>,
+        tools: Vec<Tool>,
+    ) -> Result<(ChatMessage, Option<i32>, Option<i32>)> {
+        log::info!(
+            "llama-swap chat_with_tools: model={} messages={} tools={}",
+            self.primary_model,
+            messages.len(),
+            tools.len()
+        );
+        self.chat_completion_with_model(&self.primary_model.clone(), messages, tools)
+            .await
+    }
+
+    async fn chat_with_tools_stream(
+        &self,
+        messages: Vec<ChatMessage>,
+        tools: Vec<Tool>,
+    ) -> Result<BoxStream<'static, Result<LlmStreamEvent>>> {
+        let url = format!("{}/chat/completions", self.base_url);
+        let mut body = serde_json::Map::new();
+        body.insert(
+            "model".into(),
+            Value::String(self.primary_model.clone()),
+        );
+        body.insert(
+            "messages".into(),
+            Value::Array(Self::messages_to_openai(&messages)),
+        );
+        body.insert("stream".into(), Value::Bool(true));
+        body.insert(
+            "stream_options".into(),
+            serde_json::json!({ "include_usage": true }),
+        );
+        if !tools.is_empty() {
+            body.insert(
+                "tools".into(),
+                serde_json::to_value(&tools).context("serializing tools")?,
+            );
+        }
+        for (k, v) in self.build_options() {
+            body.insert(k.into(), v);
+        }
+
+        let resp = self
+            .client
+            .post(&url)
+            .json(&Value::Object(body))
+            .send()
+            .await
+            .with_context(|| format!("POST {} failed", url))?;
+
+        if !resp.status().is_success() {
+            let status = resp.status();
+            let body = resp.text().await.unwrap_or_default();
+            bail!("llama-swap stream request failed: {} — {}", status, body);
+        }
+
+        let byte_stream = resp.bytes_stream();
+        let stream = async_stream::stream! {
+            let mut byte_stream = byte_stream;
+            let mut buf: Vec<u8> = Vec::new();
+            let mut accumulated_content = String::new();
+            let mut tool_state: std::collections::BTreeMap<
+                usize,
+                (Option<String>, Option<String>, String),
+            > = std::collections::BTreeMap::new();
+            let mut role = "assistant".to_string();
+            let mut prompt_tokens: Option<i32> = None;
+            let mut completion_tokens: Option<i32> = None;
+            let mut done_seen = false;
+
+            while let Some(chunk) = byte_stream.next().await {
+                let chunk = match chunk {
+                    Ok(b) => b,
+                    Err(e) => {
+                        yield Err(anyhow!("stream read failed: {}", e));
+                        return;
+                    }
+                };
+                buf.extend_from_slice(&chunk);
+
+                while let Some(sep) = find_double_newline(&buf) {
+                    let frame = buf.drain(..sep + 2).collect::<Vec<_>>();
+                    let frame_str = match std::str::from_utf8(&frame) {
+                        Ok(s) => s,
+                        Err(_) => continue,
+                    };
+                    for line in frame_str.lines() {
+                        let line = line.trim_end_matches('\r');
+                        let payload = match line.strip_prefix("data: ") {
+                            Some(p) => p,
+                            None => continue,
+                        };
+                        if payload == "[DONE]" {
+                            done_seen = true;
+                            break;
+                        }
+                        let v: Value = match serde_json::from_str(payload) {
+                            Ok(v) => v,
+                            Err(e) => {
+                                log::warn!(
+                                    "malformed llama-swap SSE frame: {} ({})",
+                                    payload,
+                                    e
+                                );
+                                continue;
+                            }
+                        };
+
+                        if let Some(usage) = v.get("usage") {
+                            prompt_tokens = usage
+                                .get("prompt_tokens")
+                                .and_then(|n| n.as_i64())
+                                .map(|n| n as i32);
+                            completion_tokens = usage
+                                .get("completion_tokens")
+                                .and_then(|n| n.as_i64())
+                                .map(|n| n as i32);
+                        }
+
+                        let Some(choices) = v.get("choices").and_then(|c| c.as_array())
+                        else {
+                            continue;
+                        };
+                        let Some(choice) = choices.first() else { continue };
+                        let delta = match choice.get("delta") {
+                            Some(d) => d,
+                            None => continue,
+                        };
+                        if let Some(r) = delta.get("role").and_then(|v| v.as_str()) {
+                            role = r.to_string();
+                        }
+                        if let Some(content) =
+                            delta.get("content").and_then(|v| v.as_str())
+                            && !content.is_empty()
+                        {
+                            accumulated_content.push_str(content);
+                            yield Ok(LlmStreamEvent::TextDelta(content.to_string()));
+                        }
+                        if let Some(tcs) = delta.get("tool_calls").and_then(|v| v.as_array()) {
+                            for tc_delta in tcs {
+                                let idx = tc_delta
+                                    .get("index")
+                                    .and_then(|n| n.as_u64())
+                                    .unwrap_or(0) as usize;
+                                let entry = tool_state
+                                    .entry(idx)
+                                    .or_insert((None, None, String::new()));
+                                if let Some(id) =
+                                    tc_delta.get("id").and_then(|v| v.as_str())
+                                {
+                                    entry.0 = Some(id.to_string());
+                                }
+                                if let Some(func) = tc_delta.get("function") {
+                                    if let Some(name) =
+                                        func.get("name").and_then(|v| v.as_str())
+                                    {
+                                        entry.1 = Some(name.to_string());
+                                    }
+                                    if let Some(args) =
+                                        func.get("arguments").and_then(|v| v.as_str())
+                                    {
+                                        entry.2.push_str(args);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    if done_seen {
+                        break;
+                    }
+                }
+                if done_seen {
+                    break;
+                }
+            }
+
+            let tool_calls: Option<Vec<ToolCall>> = if tool_state.is_empty() {
+                None
+            } else {
+                let mut v = Vec::with_capacity(tool_state.len());
+                for (_idx, (id, name, args)) in tool_state {
+                    let arguments: Value = if args.trim().is_empty() {
+                        Value::Object(Default::default())
+                    } else {
+                        serde_json::from_str(&args).unwrap_or_else(|_| {
+                            Value::Object(Default::default())
+                        })
+                    };
+                    v.push(ToolCall {
+                        id,
+                        function: ToolCallFunction {
+                            name: name.unwrap_or_default(),
+                            arguments,
+                        },
+                    });
+                }
+                Some(v)
+            };
+
+            let message = ChatMessage {
+                role,
+                content: accumulated_content,
+                tool_calls,
+                images: None,
+            };
+            yield Ok(LlmStreamEvent::Done {
+                message,
+                prompt_eval_count: prompt_tokens,
+                eval_count: completion_tokens,
+            });
+        };
+
+        Ok(Box::pin(stream))
+    }
+
+    async fn generate_embeddings(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>> {
+        let url = format!("{}/embeddings", self.base_url);
+        let body = json!({
+            "model": self.embedding_model,
+            "input": texts,
+        });
+
+        let resp = self
+            .client
+            .post(&url)
+            .json(&body)
+            .send()
+            .await
+            .with_context(|| format!("POST {} failed", url))?;
+
+        if !resp.status().is_success() {
+            let status = resp.status();
+            let body = resp.text().await.unwrap_or_default();
+            bail!("llama-swap embedding request failed: {} — {}", status, body);
+        }
+
+        #[derive(Deserialize)]
+        struct EmbedResponse {
+            data: Vec<EmbedItem>,
+        }
+        #[derive(Deserialize)]
+        struct EmbedItem {
+            embedding: Vec<f32>,
+        }
+
+        let parsed: EmbedResponse = resp.json().await.context("parsing embed response")?;
+        Ok(parsed.data.into_iter().map(|i| i.embedding).collect())
+    }
+
+    async fn describe_image(&self, image_base64: &str) -> Result<String> {
+        let prompt = "Briefly describe what you see in this image in 1-2 sentences. \
+                      Focus on the people, location, and activity.";
+        let system = "You are a scene description assistant. Be concise and factual.";
+
+        let messages = vec![
+            ChatMessage::system(system),
+            ChatMessage {
+                role: "user".to_string(),
+                content: prompt.to_string(),
+                tool_calls: None,
+                images: Some(vec![image_base64.to_string()]),
+            },
+        ];
+
+        let (reply, _, _) = self
+            .chat_completion_with_model(&self.vision_model.clone(), messages, Vec::new())
+            .await?;
+        Ok(reply.content)
+    }
+
+    async fn list_models(&self) -> Result<Vec<ModelCapabilities>> {
+        let url = format!("{}/models", self.base_url);
+        let resp = self
+            .client
+            .get(&url)
+            .send()
+            .await
+            .with_context(|| format!("GET {} failed", url))?;
+
+        if !resp.status().is_success() {
+            let status = resp.status();
+            let body = resp.text().await.unwrap_or_default();
+            bail!("llama-swap list_models failed: {} — {}", status, body);
+        }
+
+        let parsed: Value = resp.json().await.context("parsing models response")?;
+        let data = parsed
+            .get("data")
+            .and_then(|v| v.as_array())
+            .ok_or_else(|| anyhow!("models response missing data[]"))?;
+
+        let caps: Vec<ModelCapabilities> = data
+            .iter()
+            .map(|m| self.parse_model_capabilities(m))
+            .collect();
+
+        Ok(caps)
+    }
+
+    async fn model_capabilities(&self, model: &str) -> Result<ModelCapabilities> {
+        let all = self.list_models().await?;
+        all.into_iter()
+            .find(|m| m.name == model)
+            .ok_or_else(|| anyhow!("model '{}' not found on llama-swap", model))
+    }
+
+    fn primary_model(&self) -> &str {
+        &self.primary_model
+    }
+}
+
+impl LlamaCppClient {
+    fn parse_model_capabilities(&self, m: &Value) -> ModelCapabilities {
+        let name = m
+            .get("id")
+            .and_then(|v| v.as_str())
+            .unwrap_or_default()
+            .to_string();
+        let has_vision = name == self.vision_model || self.vision_models.iter().any(|v| v == &name);
+        // Tool calling is the default for llama-swap entries we configure
+        // (--jinja flag); no negative-list mechanism yet, so report true.
+        ModelCapabilities {
+            name,
+            has_vision,
+            has_tool_calling: true,
+        }
+    }
+}
+
+/// Extract a diagnostic fragment from a llama-swap / llama-server response
+/// that doesn't match the expected `{choices: [...]}` shape. llama-server
+/// returns errors as `{"error": {"message": "...", "code": N, "type": "..."}}`;
+/// llama-swap itself sometimes wraps subprocess failures with its own
+/// `{"error": "..."}` flat shape. Surface either when present, otherwise fall
+/// back to a truncated raw-JSON view.
+fn extract_error_detail(parsed: &Value) -> String {
+    if let Some(err) = parsed.get("error") {
+        match err {
+            Value::Object(_) => {
+                let message = err
+                    .get("message")
+                    .and_then(|v| v.as_str())
+                    .unwrap_or("(no message)");
+                let code = err
+                    .get("code")
+                    .map(|v| match v {
+                        Value::String(s) => s.clone(),
+                        other => other.to_string(),
+                    })
+                    .unwrap_or_else(|| "?".to_string());
+                let short_message: String = message.chars().take(240).collect();
+                return format!("error code={} message=\"{}\"", code, short_message);
+            }
+            Value::String(s) => {
+                let short: String = s.chars().take(240).collect();
+                return format!("error=\"{}\"", short);
+            }
+            _ => {}
+        }
+    }
+    let raw = parsed.to_string();
+    raw.chars().take(300).collect()
+}
+
+fn find_double_newline(buf: &[u8]) -> Option<usize> {
+    for i in 0..buf.len().saturating_sub(1) {
+        if buf[i] == b'\n' && buf[i + 1] == b'\n' {
+            return Some(i);
+        }
+        if i + 3 < buf.len()
+            && buf[i] == b'\r'
+            && buf[i + 1] == b'\n'
+            && buf[i + 2] == b'\r'
+            && buf[i + 3] == b'\n'
+        {
+            return Some(i + 1);
+        }
+    }
+    None
+}
+
+fn image_to_data_url(img: &str) -> String {
+    if img.starts_with("data:") {
+        img.to_string()
+    } else {
+        format!("data:image/jpeg;base64,{}", img)
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn tool_call_arguments_stringified_on_send() {
+        let msg = ChatMessage {
+            role: "assistant".into(),
+            content: String::new(),
+            tool_calls: Some(vec![ToolCall {
+                id: Some("call_abc".into()),
+                function: ToolCallFunction {
+                    name: "search_sms".into(),
+                    arguments: json!({"query": "hello", "limit": 5}),
+                },
+            }]),
+            images: None,
+        };
+
+        let wire = LlamaCppClient::messages_to_openai(&[msg]);
+        let tcs = wire[0]
+            .get("tool_calls")
+            .and_then(|v| v.as_array())
+            .expect("tool_calls present");
+        let args = tcs[0]
+            .get("function")
+            .and_then(|f| f.get("arguments"))
+            .and_then(|a| a.as_str())
+            .expect("arguments stringified");
+        let parsed: Value = serde_json::from_str(args).unwrap();
+        assert_eq!(parsed["query"], "hello");
+        assert_eq!(parsed["limit"], 5);
+    }
+
+    #[test]
+    fn tool_call_arguments_parsed_on_receive() {
+        let response_msg = json!({
+            "role": "assistant",
+            "content": "",
+            "tool_calls": [{
+                "id": "call_xyz",
+                "type": "function",
+                "function": {
+                    "name": "get_weather",
+                    "arguments": "{\"city\":\"Boston\",\"units\":\"celsius\"}"
+                }
+            }]
+        });
+        let parsed = LlamaCppClient::openai_message_to_chat(&response_msg).unwrap();
+        let tcs = parsed.tool_calls.unwrap();
+        assert_eq!(tcs.len(), 1);
+        assert_eq!(tcs[0].function.name, "get_weather");
+        assert_eq!(tcs[0].function.arguments["city"], "Boston");
+        assert_eq!(tcs[0].function.arguments["units"], "celsius");
+        assert_eq!(tcs[0].id.as_deref(), Some("call_xyz"));
+    }
+
+    #[test]
+    fn tool_call_arguments_accept_native_json_on_receive() {
+        // Some llama.cpp builds emit arguments as a JSON object directly when
+        // jinja's tool-output strict-string rule isn't applied — accept both.
+        let response_msg = json!({
+            "role": "assistant",
+            "content": "",
+            "tool_calls": [{
+                "id": "call_1",
+                "type": "function",
+                "function": {
+                    "name": "foo",
+                    "arguments": {"nested": {"k": 1}}
+                }
+            }]
+        });
+        let parsed = LlamaCppClient::openai_message_to_chat(&response_msg).unwrap();
+        let tc = &parsed.tool_calls.unwrap()[0];
+        assert_eq!(tc.function.arguments["nested"]["k"], 1);
+    }
+
+    #[test]
+    fn images_become_content_parts() {
+        let mut msg = ChatMessage::user("What is in this photo?");
+        msg.images = Some(vec!["BASE64DATA".into()]);
+
+        let wire = LlamaCppClient::messages_to_openai(&[msg]);
+        let content = wire[0].get("content").and_then(|v| v.as_array()).unwrap();
+        assert_eq!(content.len(), 2);
+        assert_eq!(content[0]["type"], "text");
+        assert_eq!(content[0]["text"], "What is in this photo?");
+        assert_eq!(content[1]["type"], "image_url");
+        assert_eq!(
+            content[1]["image_url"]["url"],
+            "data:image/jpeg;base64,BASE64DATA"
+        );
+    }
+
+    #[test]
+    fn data_url_images_pass_through_unchanged() {
+        let mut msg = ChatMessage::user("");
+        msg.images = Some(vec!["data:image/png;base64,ABCDEF".into()]);
+        let wire = LlamaCppClient::messages_to_openai(&[msg]);
+        let content = wire[0].get("content").and_then(|v| v.as_array()).unwrap();
+        assert_eq!(content.len(), 1);
+        assert_eq!(
+            content[0]["image_url"]["url"],
+            "data:image/png;base64,ABCDEF"
+        );
+    }
+
+    #[test]
+    fn text_only_message_stays_string() {
+        let msg = ChatMessage::user("hello");
+        let wire = LlamaCppClient::messages_to_openai(&[msg]);
+        assert_eq!(wire[0]["content"], "hello");
+        assert!(wire[0]["content"].as_str().is_some());
+    }
+
+    #[test]
+    fn tool_result_inherits_tool_call_id_from_prior_assistant() {
+        let assistant = ChatMessage {
+            role: "assistant".into(),
+            content: String::new(),
+            tool_calls: Some(vec![ToolCall {
+                id: Some("call_42".into()),
+                function: ToolCallFunction {
+                    name: "lookup".into(),
+                    arguments: json!({}),
+                },
+            }]),
+            images: None,
+        };
+        let tool_result = ChatMessage::tool_result("found it");
+
+        let wire = LlamaCppClient::messages_to_openai(&[assistant, tool_result]);
+        assert_eq!(wire[1]["role"], "tool");
+        assert_eq!(wire[1]["tool_call_id"], "call_42");
+    }
+
+    #[test]
+    fn multiple_tool_results_map_to_sequential_call_ids() {
+        let assistant = ChatMessage {
+            role: "assistant".into(),
+            content: String::new(),
+            tool_calls: Some(vec![
+                ToolCall {
+                    id: Some("call_A".into()),
+                    function: ToolCallFunction {
+                        name: "a".into(),
+                        arguments: json!({}),
+                    },
+                },
+                ToolCall {
+                    id: Some("call_B".into()),
+                    function: ToolCallFunction {
+                        name: "b".into(),
+                        arguments: json!({}),
+                    },
+                },
+            ]),
+            images: None,
+        };
+        let r1 = ChatMessage::tool_result("a result");
+        let r2 = ChatMessage::tool_result("b result");
+
+        let wire = LlamaCppClient::messages_to_openai(&[assistant, r1, r2]);
+        assert_eq!(wire[1]["tool_call_id"], "call_A");
+        assert_eq!(wire[2]["tool_call_id"], "call_B");
+    }
+
+    #[test]
+    fn missing_tool_call_id_gets_synthetic_fallback() {
+        let assistant = ChatMessage {
+            role: "assistant".into(),
+            content: String::new(),
+            tool_calls: Some(vec![ToolCall {
+                id: None,
+                function: ToolCallFunction {
+                    name: "noid".into(),
+                    arguments: json!({}),
+                },
+            }]),
+            images: None,
+        };
+        let wire = LlamaCppClient::messages_to_openai(&[assistant]);
+        let tcs = wire[0]
+            .get("tool_calls")
+            .and_then(|v| v.as_array())
+            .unwrap();
+        assert_eq!(tcs[0]["id"], "call_0");
+    }
+
+    #[test]
+    fn capability_inference_uses_vision_model_and_allowlist() {
+        let mut c = LlamaCppClient::new(None, Some("chat".into()));
+        c.set_vision_model("vision".into());
+        c.set_vision_models(vec!["qwen-vl".into()]);
+
+        let m_chat = json!({ "id": "chat" });
+        let m_vision = json!({ "id": "vision" });
+        let m_qwen = json!({ "id": "qwen-vl" });
+        let m_other = json!({ "id": "embed" });
+
+        let chat = c.parse_model_capabilities(&m_chat);
+        let vision = c.parse_model_capabilities(&m_vision);
+        let qwen = c.parse_model_capabilities(&m_qwen);
+        let other = c.parse_model_capabilities(&m_other);
+
+        assert!(!chat.has_vision);
+        assert!(chat.has_tool_calling);
+        assert!(vision.has_vision);
+        assert!(qwen.has_vision);
+        assert!(!other.has_vision);
+    }
+}
diff --git a/src/ai/mod.rs b/src/ai/mod.rs
index 3468325..204da04 100644
--- a/src/ai/mod.rs
+++ b/src/ai/mod.rs
@@ -5,6 +5,7 @@ pub mod face_client;
 pub mod handlers;
 pub mod insight_chat;
 pub mod insight_generator;
+pub mod llamacpp;
 pub mod llm_client;
 pub mod ollama;
 pub mod openrouter;
@@ -20,7 +21,8 @@ pub use handlers::{
     chat_history_handler, chat_rewind_handler, chat_stream_handler, chat_turn_handler,
     delete_insight_handler, export_training_data_handler, generate_agentic_insight_handler,
     generate_insight_handler, get_all_insights_handler, get_available_models_handler,
-    get_insight_handler, get_openrouter_models_handler, rate_insight_handler,
+    get_insight_handler, get_llamacpp_models_handler, get_openrouter_models_handler,
+    rate_insight_handler,
 };
 pub use insight_generator::InsightGenerator;
 #[allow(unused_imports)]
diff --git a/src/bin/populate_knowledge.rs b/src/bin/populate_knowledge.rs
index 29945d7..71f2f8a 100644
--- a/src/bin/populate_knowledge.rs
+++ b/src/bin/populate_knowledge.rs
@@ -195,6 +195,7 @@ async fn main() -> anyhow::Result<()> {
     let generator = InsightGenerator::new(
         ollama,
         None,
+        None,
         sms_client,
         apollo_client,
         insight_dao.clone(),
diff --git a/src/main.rs b/src/main.rs
index 63013ce..3bd3656 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -313,6 +313,7 @@ fn main() -> std::io::Result<()> {
                 .service(ai::get_all_insights_handler)
                 .service(ai::get_available_models_handler)
                 .service(ai::get_openrouter_models_handler)
+                .service(ai::get_llamacpp_models_handler)
                 .service(ai::chat_turn_handler)
                 .service(ai::chat_stream_handler)
                 .service(ai::chat_history_handler)
diff --git a/src/state.rs b/src/state.rs
index 8f1bd4e..96d1c22 100644
--- a/src/state.rs
+++ b/src/state.rs
@@ -2,6 +2,7 @@ use crate::ai::apollo_client::ApolloClient;
 use crate::ai::clip_client::ClipClient;
 use crate::ai::face_client::FaceClient;
 use crate::ai::insight_chat::{ChatLockMap, InsightChatService};
+use crate::ai::llamacpp::LlamaCppClient;
 use crate::ai::openrouter::OpenRouterClient;
 use crate::ai::{InsightGenerator, OllamaClient, SmsApiClient};
 use crate::database::{
@@ -62,6 +63,16 @@ pub struct AppState {
     /// Curated list of OpenRouter model ids exposed to clients. Sourced from
     /// `OPENROUTER_ALLOWED_MODELS` (comma-separated). Empty when unset.
     pub openrouter_allowed_models: Vec<String>,
+    /// `None` when `LLAMA_SWAP_URL` is not configured. Consulted only when a
+    /// request explicitly opts into `backend=llamacpp`. Same shape as the
+    /// `openrouter` slot — present here so handlers can route to it without
+    /// threading through the generator.
+    #[allow(dead_code)]
+    pub llamacpp: Option<Arc<LlamaCppClient>>,
+    /// Curated list of llama-swap model ids exposed to clients. Sourced from
+    /// `LLAMA_SWAP_ALLOWED_MODELS` (comma-separated). Empty when unset; the
+    /// server then falls back to `LLAMA_SWAP_PRIMARY_MODEL`.
+    pub llamacpp_allowed_models: Vec<String>,
     pub sms_client: SmsApiClient,
     pub insight_generator: InsightGenerator,
     /// Chat continuation service. Hold an Arc so handlers can clone cheaply.
@@ -105,6 +116,8 @@ impl AppState {
         ollama: OllamaClient,
         openrouter: Option<Arc<OpenRouterClient>>,
         openrouter_allowed_models: Vec<String>,
+        llamacpp: Option<Arc<LlamaCppClient>>,
+        llamacpp_allowed_models: Vec<String>,
         sms_client: SmsApiClient,
         insight_generator: InsightGenerator,
         insight_chat: Arc<InsightChatService>,
@@ -145,6 +158,8 @@ impl AppState {
             ollama,
             openrouter,
             openrouter_allowed_models,
+            llamacpp,
+            llamacpp_allowed_models,
             sms_client,
             insight_generator,
             insight_chat,
@@ -186,6 +201,9 @@ impl Default for AppState {
         let openrouter = build_openrouter_from_env();
         let openrouter_allowed_models = parse_openrouter_allowed_models();
 
+        let llamacpp = build_llamacpp_from_env();
+        let llamacpp_allowed_models = parse_llamacpp_allowed_models();
+
         let sms_api_url =
             env::var("SMS_API_URL").unwrap_or_else(|_| "http://localhost:8000".to_string());
         let sms_api_token = env::var("SMS_API_TOKEN").ok();
@@ -250,6 +268,7 @@ impl Default for AppState {
         let insight_generator = InsightGenerator::new(
             ollama.clone(),
             openrouter.clone(),
+            llamacpp.clone(),
             sms_client.clone(),
             apollo_client.clone(),
             insight_dao.clone(),
@@ -273,6 +292,7 @@ impl Default for AppState {
             Arc::new(insight_generator.clone()),
             ollama.clone(),
             openrouter.clone(),
+            llamacpp.clone(),
             insight_dao.clone(),
             chat_locks,
         ));
@@ -294,6 +314,8 @@ impl Default for AppState {
             ollama,
             openrouter,
             openrouter_allowed_models,
+            llamacpp,
+            llamacpp_allowed_models,
             sms_client,
             insight_generator,
             insight_chat,
@@ -335,6 +357,50 @@ fn parse_openrouter_allowed_models() -> Vec<String> {
         .collect()
 }
 
+/// Build a `LlamaCppClient` from environment variables. Returns `None` when
+/// `LLAMA_SWAP_URL` is unset (the llamacpp backend is then unavailable and
+/// requests for it return a clear error). The slot ids default to the
+/// names the bundled `llama-swap/config.yaml` uses — `chat` / `vision` /
+/// `embed` — so a minimal deploy only needs to set `LLAMA_SWAP_URL`.
+fn build_llamacpp_from_env() -> Option<Arc<LlamaCppClient>> {
+    let base_url = env::var("LLAMA_SWAP_URL").ok()?;
+    let primary_model = env::var("LLAMA_SWAP_PRIMARY_MODEL").ok();
+    let mut client = LlamaCppClient::new(Some(base_url), primary_model);
+    if let Ok(model) = env::var("LLAMA_SWAP_EMBEDDING_MODEL") {
+        client.set_embedding_model(model);
+    }
+    if let Ok(model) = env::var("LLAMA_SWAP_VISION_MODEL") {
+        client.set_vision_model(model);
+    }
+    client.set_vision_models(parse_llamacpp_vision_models());
+    Some(Arc::new(client))
+}
+
+/// Parse `LLAMA_SWAP_ALLOWED_MODELS` (comma-separated) into a vec. Used to
+/// drive `/insights/llamacpp/models`; empty when unset.
+fn parse_llamacpp_allowed_models() -> Vec<String> {
+    env::var("LLAMA_SWAP_ALLOWED_MODELS")
+        .unwrap_or_default()
+        .split(',')
+        .map(|s| s.trim().to_string())
+        .filter(|s| !s.is_empty())
+        .collect()
+}
+
+/// Parse `LLAMA_SWAP_VISION_MODELS` (comma-separated) — slot ids that report
+/// `has_vision = true` in capability lookups. The configured `vision_model`
+/// (default `vision`) is always considered vision-capable regardless of this
+/// list, so a deploy that only uses the default vision slot can leave it
+/// unset.
+fn parse_llamacpp_vision_models() -> Vec<String> {
+    env::var("LLAMA_SWAP_VISION_MODELS")
+        .unwrap_or_default()
+        .split(',')
+        .map(|s| s.trim().to_string())
+        .filter(|s| !s.is_empty())
+        .collect()
+}
+
 #[cfg(test)]
 impl AppState {
     /// Creates an AppState instance for testing with temporary directories
@@ -397,6 +463,7 @@ impl AppState {
         let insight_generator = InsightGenerator::new(
             ollama.clone(),
             None,
+            None,
             sms_client.clone(),
             apollo_client.clone(),
             insight_dao.clone(),
@@ -418,6 +485,7 @@ impl AppState {
             Arc::new(insight_generator.clone()),
             ollama.clone(),
             None,
+            None,
             insight_dao.clone(),
             chat_locks,
         ));
@@ -445,6 +513,8 @@ impl AppState {
             ollama,
             None,
             Vec::new(),
+            None,
+            Vec::new(),
             sms_client,
             insight_generator,
             insight_chat,
-- 
2.49.1


From d14df63f196d80b73c9135621ab842af7b2e45f7 Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Wed, 20 May 2026 17:54:08 -0400
Subject: [PATCH 02/11] env.example: document LLAMA_SWAP_* +
 HYBRID_VISION_BACKEND vars

Mirrors the section added to CLAUDE.md so deploys can opt into the
llamacpp backend from the template alone.
---
 .env.example | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)

diff --git a/.env.example b/.env.example
index 718f6bd..63d8436 100644
--- a/.env.example
+++ b/.env.example
@@ -53,6 +53,27 @@ AGENTIC_CHAT_MAX_ITERATIONS=6
 # OPENROUTER_HTTP_REFERER=https://your-site.example
 # OPENROUTER_APP_TITLE=ImageApi
 
+# ── AI Insights — llama.cpp / llama-swap (optional) ─────────────────────
+# Set LLAMA_SWAP_URL to enable the `llamacpp` chat_backend. Talks
+# OpenAI-compatible /v1 to a llama-swap proxy that fronts per-slot
+# llama-server instances (chat / vision / embed). Like hybrid, the
+# agentic loop describes images via the vision slot then inlines the
+# text into the chat slot — so the chat slot itself can be text-only.
+# LLAMA_SWAP_URL=http://localhost:9292/v1
+# LLAMA_SWAP_PRIMARY_MODEL=chat
+# LLAMA_SWAP_VISION_MODEL=vision
+# LLAMA_SWAP_EMBEDDING_MODEL=embed
+# Comma-separated allowlist of model ids the /v1/models endpoint should
+# advertise as vision-capable (llama-swap doesn't report modality).
+# LLAMA_SWAP_VISION_MODELS=vision
+# Comma-separated allowlist surfaced by /insights/llamacpp/models.
+# LLAMA_SWAP_ALLOWED_MODELS=chat,vision,embed
+# LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=120
+# Routes hybrid mode's vision-describe pass through llama-swap's vision
+# slot instead of Ollama (chat still goes to OpenRouter). Values:
+# `ollama` (default) | `llamacpp`.
+# HYBRID_VISION_BACKEND=ollama
+
 # ── AI Insights — sibling services (optional) ───────────────────────────
 # Apollo (places, face inference, CLIP encoders). Single-Apollo deploys
 # typically set only APOLLO_API_BASE_URL and let the face + CLIP
-- 
2.49.1


From be51421b3879253bea74e27bc48ce659f419ba2c Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Thu, 21 May 2026 11:36:58 -0400
Subject: [PATCH 03/11] ai: collapse llamacpp into LLM_BACKEND env switch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Reverts the per-request backend="llamacpp" value. Chat/vision/embedding
backend is now a deploy-time decision (LLM_BACKEND=ollama|llamacpp),
applied globally across chat, vision describe, and embeddings — so
embedding vectors stay in one space across the index.

- Per-request backend whitelist back to "local"|"hybrid". A request
  arriving with backend="llamacpp" is rejected.
- LLM_BACKEND=llamacpp swaps the entire local stack to llama-swap:
  chat hits the chat slot, describe hits the vision slot, embeddings
  hit the embed slot. Hybrid mode still routes chat to OpenRouter
  but uses LLM_BACKEND for the describe pass.
- Drops env vars HYBRID_VISION_BACKEND, LLAMA_SWAP_VISION_MODELS,
  EMBEDDING_BACKEND (the last never shipped). Drops the
  LlamaCppClient.vision_models allowlist — capability inference now
  reports has_vision only for the configured vision_model slot.
- Drops the /insights/llamacpp/models handler. /insights/models is
  the single endpoint; returns Ollama servers under LLM_BACKEND=ollama
  and llama-swap slots (from LLAMA_SWAP_ALLOWED_MODELS) under
  LLM_BACKEND=llamacpp. Same envelope shape either way.
- New ai::embed_one helper routes embeddings through llama-swap when
  LLM_BACKEND=llamacpp (else Ollama). Wires it into the four
  insight_generator embedding sites.
- Cross-replay matrix simplifies to pre-llamacpp shape (local↔local,
  hybrid↔hybrid, hybrid→local allowed; local→hybrid rejected).
---
 .env.example                |  32 ++++---
 CLAUDE.md                   |  96 +++++++++++---------
 src/ai/handlers.rs          |  64 +++++++-------
 src/ai/insight_chat.rs      | 171 ++++++++++++++----------------------
 src/ai/insight_generator.rs | 127 +++++++++++++-------------
 src/ai/llamacpp.rs          |  33 ++-----
 src/ai/mod.rs               |  88 ++++++++++++++++++-
 src/main.rs                 |   1 -
 src/state.rs                |  27 ++----
 9 files changed, 338 insertions(+), 301 deletions(-)

diff --git a/.env.example b/.env.example
index 63d8436..81fab4e 100644
--- a/.env.example
+++ b/.env.example
@@ -53,26 +53,30 @@ AGENTIC_CHAT_MAX_ITERATIONS=6
 # OPENROUTER_HTTP_REFERER=https://your-site.example
 # OPENROUTER_APP_TITLE=ImageApi
 
+# ── AI Insights — local backend switch ──────────────────────────────────
+# Picks which local LLM stack the server uses for chat, vision describe,
+# and embeddings. `ollama` (default) uses the OLLAMA_* settings above;
+# `llamacpp` uses the LLAMA_SWAP_* settings below. The switch is global
+# and applies to both `backend=local` and `backend=hybrid` (hybrid keeps
+# chat on OpenRouter but still uses this stack for the describe pass).
+# Don't flip mid-deploy without re-embedding existing index rows —
+# mixed vector spaces break similarity search.
+# LLM_BACKEND=ollama
+
 # ── AI Insights — llama.cpp / llama-swap (optional) ─────────────────────
-# Set LLAMA_SWAP_URL to enable the `llamacpp` chat_backend. Talks
-# OpenAI-compatible /v1 to a llama-swap proxy that fronts per-slot
-# llama-server instances (chat / vision / embed). Like hybrid, the
-# agentic loop describes images via the vision slot then inlines the
-# text into the chat slot — so the chat slot itself can be text-only.
+# Set LLAMA_SWAP_URL plus LLM_BACKEND=llamacpp to swap the local stack
+# off Ollama. Talks OpenAI-compatible /v1 to a llama-swap proxy fronting
+# per-slot llama-server instances (chat / vision / embed). The chat slot
+# is treated as text-only — images are pre-described via the vision slot
+# and inlined into the prompt.
 # LLAMA_SWAP_URL=http://localhost:9292/v1
 # LLAMA_SWAP_PRIMARY_MODEL=chat
 # LLAMA_SWAP_VISION_MODEL=vision
 # LLAMA_SWAP_EMBEDDING_MODEL=embed
-# Comma-separated allowlist of model ids the /v1/models endpoint should
-# advertise as vision-capable (llama-swap doesn't report modality).
-# LLAMA_SWAP_VISION_MODELS=vision
-# Comma-separated allowlist surfaced by /insights/llamacpp/models.
+# Comma-separated allowlist surfaced by /insights/models when
+# LLM_BACKEND=llamacpp.
 # LLAMA_SWAP_ALLOWED_MODELS=chat,vision,embed
-# LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=120
-# Routes hybrid mode's vision-describe pass through llama-swap's vision
-# slot instead of Ollama (chat still goes to OpenRouter). Values:
-# `ollama` (default) | `llamacpp`.
-# HYBRID_VISION_BACKEND=ollama
+# LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180
 
 # ── AI Insights — sibling services (optional) ───────────────────────────
 # Apollo (places, face inference, CLIP encoders). Single-Apollo deploys
diff --git a/CLAUDE.md b/CLAUDE.md
index d3419e6..d06b29f 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -473,9 +473,8 @@ GET /memories?path=...&recursive=true
 POST /insights/generate              (non-agentic single-shot)
 POST /insights/generate/agentic      (tool-calling loop; body: { file_path, backend?, model?, ... })
 GET  /insights?path=...&library=...
-GET  /insights/models                (local Ollama models + capabilities)
+GET  /insights/models                (local-backend models + capabilities; Ollama OR llama-swap based on LLM_BACKEND)
 GET  /insights/openrouter/models     (curated OpenRouter allowlist)
-GET  /insights/llamacpp/models       (curated llama-swap slot allowlist)
 POST /insights/rate                  (thumbs up/down for training data)
 
 // Insight Chat Continuation
@@ -632,22 +631,27 @@ OPENROUTER_EMBEDDING_MODEL=openai/text-embedding-3-small  # Optional, embeddings
 OPENROUTER_HTTP_REFERER=https://your-site.example    # Optional attribution header
 OPENROUTER_APP_TITLE=ImageApi                  # Optional attribution header
 
-# llama.cpp / llama-swap (Llamacpp Backend) - sibling to Ollama; OpenAI-compatible
+# Local LLM backend switch. `ollama` (default) keeps the OLLAMA_* settings
+# above; `llamacpp` swaps the entire local stack (chat + vision describe +
+# embeddings) over to llama-swap. The switch is global and applies to
+# `backend=local` requests and to `backend=hybrid`'s describe pass (hybrid
+# chat still goes to OpenRouter). Don't flip mid-deploy without
+# re-embedding — mixed vector spaces break similarity search.
+LLM_BACKEND=ollama
+
+# llama.cpp / llama-swap (used when LLM_BACKEND=llamacpp). OpenAI-compatible
 # proxy hosting one or more llama-server processes (chat / vision / embed slots).
-LLAMA_SWAP_URL=http://localhost:9292/v1         # Required to enable llamacpp backend
+LLAMA_SWAP_URL=http://localhost:9292/v1         # Required when LLM_BACKEND=llamacpp
 LLAMA_SWAP_PRIMARY_MODEL=chat                   # Chat slot id (matches config.yaml)
-LLAMA_SWAP_VISION_MODEL=vision                  # Vision slot id; describe_image routes here
-LLAMA_SWAP_EMBEDDING_MODEL=embed                # Embedding slot id (when local embeddings via llamacpp)
-LLAMA_SWAP_VISION_MODELS=qwen-vl,llava          # Comma-separated slot ids known to have vision.
-                                                # Drives `has_vision` in /insights/llamacpp/models.
-                                                # `LLAMA_SWAP_VISION_MODEL` is auto-included.
-LLAMA_SWAP_ALLOWED_MODELS=chat,coder            # Curated allowlist exposed to clients via
-                                                # GET /insights/llamacpp/models. Empty = no picker.
+LLAMA_SWAP_VISION_MODEL=vision                  # Vision slot id; describe_image routes here.
+                                                # The only slot reported as has_vision=true in
+                                                # /insights/models — chat slots are treated as
+                                                # text-only (images pre-described and inlined).
+LLAMA_SWAP_EMBEDDING_MODEL=embed                # Embedding slot id
+LLAMA_SWAP_ALLOWED_MODELS=chat,coder            # Curated allowlist surfaced by GET /insights/models
+                                                # when LLM_BACKEND=llamacpp. Empty = picker shows
+                                                # only the configured primary model.
 LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180          # Per-request timeout; bump for slow CPU offload
-HYBRID_VISION_BACKEND=llamacpp                  # Optional override for hybrid mode's describe_image:
-                                                # `ollama` (default) or `llamacpp`. When `llamacpp`,
-                                                # hybrid still routes chat to OpenRouter but uses
-                                                # llama-swap's vision slot to describe images.
 
 # Insight Chat Continuation
 AGENTIC_CHAT_MAX_ITERATIONS=6                  # Cap on tool-calling iterations per chat turn (default 6)
@@ -668,13 +672,36 @@ The `OllamaClient` provides methods to query available models:
 
 This allows runtime verification of model availability before generating insights.
 
+**Local backend switch (`LLM_BACKEND`):**
+
+One env var decides which "local" stack the server runs against — `ollama`
+(default) or `llamacpp`. It's global on purpose: chat, vision describe, and
+embeddings all route through the same backend, so the embedding-vector
+column in SQLite stays in one vector space. Don't flip mid-deploy without
+re-embedding the affected rows — similarity search will collapse.
+
+- `LLM_BACKEND=ollama`: chat and embeddings use Ollama; vision describe
+  uses Ollama's multimodal model.
+- `LLM_BACKEND=llamacpp`: chat hits llama-swap's `chat` slot (which is
+  treated as text-only — images are pre-described via the `vision` slot
+  and inlined), embeddings hit the `embed` slot, vision describe hits the
+  `vision` slot. Requires `LLAMA_SWAP_URL`.
+
+The per-request `backend=hybrid` override is orthogonal: it always sends
+chat to OpenRouter, but the describe pass still routes through whichever
+`LLM_BACKEND` is configured.
+
+`GET /insights/models` returns the local-backend models with capabilities
+in the same envelope shape regardless of `LLM_BACKEND`: Ollama servers
+when `ollama`, llama-swap slots (from `LLAMA_SWAP_ALLOWED_MODELS`) when
+`llamacpp`. No `/insights/llamacpp/models` — the picker reads a single
+endpoint.
+
 **Hybrid Backend (OpenRouter):**
 - Per-request opt-in via `backend=hybrid` on `POST /insights/generate/agentic`.
 - Vision describe happens before the agentic loop; the description is inlined
-  into the chat prompt and the agentic loop runs on OpenRouter. By default
-  vision uses local Ollama, but `HYBRID_VISION_BACKEND=llamacpp` flips it to
-  llama-swap's vision slot (useful when you want chat on a frontier model and
-  vision on a local-but-not-Ollama path).
+  into the chat prompt and the agentic loop runs on OpenRouter. Vision
+  routes through whichever `LLM_BACKEND` is configured.
 - `request.model` (if provided) overrides `OPENROUTER_DEFAULT_MODEL` for that
   call. The mobile picker reads from `OPENROUTER_ALLOWED_MODELS`.
 - No live capability precheck — the operator-curated allowlist is trusted.
@@ -682,29 +709,14 @@ This allows runtime verification of model availability before generating insight
 - `GET /insights/openrouter/models` returns `{ models, default_model, configured }`
   for client picker UIs.
 
-**Llamacpp Backend (llama-swap):**
-- Per-request opt-in via `backend=llamacpp` on `POST /insights/generate/agentic`.
-- Sibling to Ollama: a local OpenAI-compatible proxy (mostlygeek/llama-swap)
-  fronting one or more `llama-server` processes. The chat slot is text-only
-  by default; vision and embeddings have their own slots (`LLAMA_SWAP_VISION_MODEL`,
-  `LLAMA_SWAP_EMBEDDING_MODEL`) that llama-swap routes to by model id. The
-  bundled `docker-compose.yml` + `llama-swap/config.yaml` in the opencode root
-  is the reference deploy.
-- Operates in the same describe-then-inline shape as hybrid: the chat model
-  never sees raw images. Vision describe routes through llama-swap's vision
-  slot (`describe_image` on `LlamaCppClient`).
-- `request.model` (if provided) overrides `LLAMA_SWAP_PRIMARY_MODEL` for that
-  call (must match a slot id in llama-swap's `config.yaml`). The mobile picker
-  reads from `LLAMA_SWAP_ALLOWED_MODELS`.
-- No live capability precheck — slot ids are trusted. Tool calling is assumed
-  for every slot (llama-swap entries typically launch with `--jinja`).
-- `GET /insights/llamacpp/models` returns `{ models, default_model, configured }`.
-- Cross-replay matrix (chat continuation): `local ↔ llamacpp` allowed (the
-  LlamaCppClient passes images through to the chat slot — you're responsible
-  for a vision-capable slot if the stored transcript carries images);
-  `hybrid ↔ llamacpp` allowed (both transcripts are text-only); `local →
-  hybrid` and `llamacpp → hybrid` rejected (mid-conversation description
-  source change isn't supported).
+**Cross-replay matrix (chat continuation):**
+- `local → local` allowed (whether served by Ollama or llama-swap; that's
+  a deploy-time decision, not a request-time one).
+- `hybrid → hybrid` allowed.
+- `hybrid → local` allowed (the inlined description replays as text).
+- `local → hybrid` rejected — the stored transcript has raw images in the
+  first user message and OpenRouter providers don't accept that shape
+  consistently. Regenerate the insight in hybrid mode instead.
 
 **Insight Chat Continuation:**
 
diff --git a/src/ai/handlers.rs b/src/ai/handlers.rs
index 4809b25..c86a5c8 100644
--- a/src/ai/handlers.rs
+++ b/src/ai/handlers.rs
@@ -470,7 +470,16 @@ pub async fn generate_agentic_insight_handler(
     }
 }
 
-/// GET /insights/models - List available models from both servers with capabilities
+/// GET /insights/models - Local-backend models with capabilities. Returns
+/// Ollama servers when `LLM_BACKEND=ollama` (default), or llama-swap slots
+/// when `LLM_BACKEND=llamacpp`. Same envelope shape either way so the
+/// client picker doesn't have to branch on backend kind.
+///
+/// For llama-swap: `models` comes verbatim from `LLAMA_SWAP_ALLOWED_MODELS`
+/// (no live `/v1/models` probe), `has_vision` is true only for the
+/// configured `LLAMA_SWAP_VISION_MODEL` slot id, and `has_tool_calling` is
+/// reported as true for every slot (llama-server is launched with `--jinja`
+/// by convention — a misconfigured slot surfaces as a chat-call error).
 #[get("/insights/models")]
 pub async fn get_available_models_handler(
     _claims: Claims,
@@ -478,6 +487,29 @@ pub async fn get_available_models_handler(
 ) -> impl Responder {
     log::debug!("Fetching available models with capabilities");
 
+    if crate::ai::local_backend_is_llamacpp()
+        && let Some(lc) = app_state.llamacpp.as_ref()
+    {
+        let models: Vec<ModelCapabilities> = app_state
+            .llamacpp_allowed_models
+            .iter()
+            .map(|name| ModelCapabilities {
+                name: name.clone(),
+                has_vision: name == &lc.vision_model,
+                has_tool_calling: true,
+            })
+            .collect();
+        let primary = ServerModels {
+            url: lc.base_url.clone(),
+            models,
+            default_model: lc.primary_model.clone(),
+        };
+        return HttpResponse::Ok().json(AvailableModelsResponse {
+            primary,
+            fallback: None,
+        });
+    }
+
     let ollama_client = &app_state.ollama;
 
     // Fetch models with capabilities from primary server
@@ -549,36 +581,6 @@ pub async fn get_openrouter_models_handler(
     HttpResponse::Ok().json(response)
 }
 
-#[derive(serde::Serialize)]
-pub struct LlamaCppModelsResponse {
-    pub models: Vec<String>,
-    pub default_model: Option<String>,
-    pub configured: bool,
-}
-
-/// GET /insights/llamacpp/models - Curated llama-swap model ids exposed
-/// to clients for the llamacpp backend. Returned verbatim from
-/// `LLAMA_SWAP_ALLOWED_MODELS`; no live call to llama-swap. Use
-/// `LLAMA_SWAP_URL` plus `LLAMA_SWAP_PRIMARY_MODEL` on the server side to
-/// pick the actual chat slot.
-#[get("/insights/llamacpp/models")]
-pub async fn get_llamacpp_models_handler(
-    _claims: Claims,
-    app_state: web::Data<crate::state::AppState>,
-) -> impl Responder {
-    let configured = app_state.llamacpp.is_some();
-    let default_model = app_state
-        .llamacpp
-        .as_ref()
-        .map(|c| c.primary_model.clone());
-    let response = LlamaCppModelsResponse {
-        models: app_state.llamacpp_allowed_models.clone(),
-        default_model,
-        configured,
-    };
-    HttpResponse::Ok().json(response)
-}
-
 /// POST /insights/rate - Rate an insight (thumbs up/down for training data)
 #[post("/insights/rate")]
 pub async fn rate_insight_handler(
diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs
index 4e87d52..b549dab 100644
--- a/src/ai/insight_chat.rs
+++ b/src/ai/insight_chat.rs
@@ -309,14 +309,15 @@ impl InsightChatService {
             .unwrap_or_else(|| stored_backend.clone());
         validate_cross_replay(&stored_backend, &effective_backend)?;
         let is_hybrid = effective_backend == "hybrid";
-        let is_llamacpp = effective_backend == "llamacpp";
-        let describes_then_inlines = is_hybrid || is_llamacpp;
+        let local_via_llamacpp =
+            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
+        let describes_then_inlines = is_hybrid || local_via_llamacpp;
         span.set_attribute(KeyValue::new("backend", effective_backend.clone()));
 
-        // 4. Build the chat backend client. Ollama in local mode, a freshly
-        //    cloned OpenRouter client in hybrid mode, a freshly cloned
-        //    LlamaCppClient in llamacpp mode (clone so per-request
-        //    sampling/model overrides don't leak into shared state).
+        // 4. Build the chat backend client. Hybrid → OpenRouter; local with
+        //    `LLM_BACKEND=llamacpp` → llama-swap; otherwise Ollama. Clones
+        //    so per-request sampling/model overrides don't leak into shared
+        //    state.
         let max_iterations = req
             .max_iterations
             .unwrap_or(DEFAULT_MAX_ITERATIONS)
@@ -353,9 +354,9 @@ impl InsightChatService {
                 c.set_num_ctx(Some(ctx));
             }
             openrouter_client = Some(c);
-        } else if is_llamacpp {
+        } else if local_via_llamacpp {
             let arc = self.llamacpp.as_ref().ok_or_else(|| {
-                anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
+                anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured")
             })?;
             let mut c: LlamaCppClient = (**arc).clone();
             if let Some(ref m) = custom_model {
@@ -373,8 +374,8 @@ impl InsightChatService {
             }
             llamacpp_client = Some(c);
         } else {
-            // Local-mode model swap. Build a new client when the chat model
-            // differs from the configured one (mirrors the agentic pattern).
+            // Pure local (Ollama): model swap. Build a new client when the
+            // chat model differs from the configured one.
             if let Some(ref m) = custom_model
                 && m != &self.ollama.primary_model
             {
@@ -820,8 +821,9 @@ impl InsightChatService {
             .unwrap_or_else(|| stored_backend.clone());
         validate_cross_replay(&stored_backend, &effective_backend)?;
         let is_hybrid = effective_backend == "hybrid";
-        let is_llamacpp = effective_backend == "llamacpp";
-        let describes_then_inlines = is_hybrid || is_llamacpp;
+        let local_via_llamacpp =
+            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
+        let describes_then_inlines = is_hybrid || local_via_llamacpp;
 
         let max_iterations = req
             .max_iterations
@@ -841,9 +843,9 @@ impl InsightChatService {
         let model_used = chat_backend.primary_model().to_string();
 
         // Tool set — local mode + first user turn carries an image →
-        // offer describe_photo. Describe-then-inline modes (hybrid /
-        // llamacpp): visual description was inlined when the insight was
-        // bootstrapped, no describe tool needed.
+        // offer describe_photo. Describe-then-inline modes (hybrid OR
+        // local_via_llamacpp): visual description was inlined when the
+        // insight was bootstrapped, no describe tool needed.
         let local_first_user_has_image = messages
             .iter()
             .find(|m| m.role == "user")
@@ -987,8 +989,9 @@ impl InsightChatService {
             .unwrap_or_else(|| "default".to_string());
         let effective_backend = resolve_bootstrap_backend(req.backend.as_deref())?;
         let is_hybrid = effective_backend == "hybrid";
-        let is_llamacpp = effective_backend == "llamacpp";
-        let describes_then_inlines = is_hybrid || is_llamacpp;
+        let local_via_llamacpp =
+            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
+        let describes_then_inlines = is_hybrid || local_via_llamacpp;
 
         let max_iterations = req
             .max_iterations
@@ -1020,35 +1023,19 @@ impl InsightChatService {
                 _ => None,
             });
 
-        // Describe-then-inline backends (hybrid, llamacpp): pre-describe the
-        // image so a text-only chat model gets the visual description inline.
-        // Vision source: llamacpp's vision slot in llamacpp mode; in hybrid
-        // mode Ollama by default, llamacpp via `HYBRID_VISION_BACKEND=llamacpp`.
+        // Describe-then-inline (hybrid OR local_via_llamacpp): pre-describe
+        // the image so a text-only chat model gets the visual description
+        // inline. Vision source follows `LLM_BACKEND`: llama-swap when
+        // `local_via_llamacpp`, else Ollama.
         let visual_block = if describes_then_inlines {
             match image_base64.as_deref() {
                 Some(b64) => {
-                    let use_llamacpp_vision = if is_llamacpp {
-                        true
-                    } else {
-                        matches!(
-                            std::env::var("HYBRID_VISION_BACKEND")
-                                .ok()
-                                .as_deref()
-                                .map(|s| s.trim().to_lowercase())
-                                .as_deref(),
-                            Some("llamacpp")
-                        )
-                    };
-                    let described = if use_llamacpp_vision {
-                        match self.llamacpp.as_ref() {
-                            Some(c) => c.describe_image(b64).await,
-                            None => {
-                                log::warn!(
-                                    "bootstrap: requested llamacpp vision but LLAMA_SWAP_URL unset; falling back to Ollama"
-                                );
-                                self.ollama.describe_image(b64).await
-                            }
-                        }
+                    let described = if local_via_llamacpp {
+                        self.llamacpp
+                            .as_ref()
+                            .expect("local_via_llamacpp guarantees Some")
+                            .describe_image(b64)
+                            .await
                     } else {
                         self.ollama.describe_image(b64).await
                     };
@@ -1175,8 +1162,11 @@ impl InsightChatService {
     /// (boxed because each backend has a different concrete type) and the
     /// Ollama client used for describe-image / local tool calls.
     ///
-    /// `effective_backend` must be one of `"local"`, `"hybrid"`, `"llamacpp"`
-    /// (validated upstream).
+    /// `effective_backend` must be one of `"local"` or `"hybrid"` (validated
+    /// upstream). Hybrid → OpenRouter; local with `LLM_BACKEND=llamacpp` →
+    /// llama-swap; pure local → Ollama. Returns the dispatched chat client
+    /// plus the (possibly per-request) Ollama client that the caller uses
+    /// for non-chat helpers (image describe in non-llamacpp mode, tool ops).
     fn build_chat_clients(
         &self,
         effective_backend: &str,
@@ -1206,10 +1196,10 @@ impl InsightChatService {
             return Ok((Box::new(c), ollama_client));
         }
 
-        if effective_backend == "llamacpp" {
-            let arc = self.llamacpp.as_ref().ok_or_else(|| {
-                anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
-            })?;
+        // Local mode — env switch decides between Ollama and llama-swap.
+        if crate::ai::local_backend_is_llamacpp()
+            && let Some(arc) = self.llamacpp.as_ref()
+        {
             let mut c: LlamaCppClient = (**arc).clone();
             if let Some(m) = custom_model {
                 c.primary_model = m.to_string();
@@ -1525,41 +1515,26 @@ fn resolve_date_taken_for_context(
 
 /// Validate a stored→effective backend transition for a chat continuation.
 /// Continuation runs against a transcript that was generated with a specific
-/// backend; some transitions break the conversation shape:
+/// backend; the only blocked transition is `local → hybrid`, because the
+/// stored transcript has images embedded in the first user message and the
+/// hybrid path (OpenRouter chat with describe-then-inline) can't replay
+/// raw image bytes through OpenRouter consistently across providers.
+/// `hybrid → local` is allowed (the inlined description replays verbatim
+/// as text).
 ///
-/// - `local → hybrid` — the stored transcript has images embedded in the
-///   first user message; the openrouter chat client surfaces them through
-///   the wire, but vision-only models routed via the hybrid path may not
-///   accept that shape consistently across providers. Reject to keep the
-///   `regenerate-in-hybrid-mode` workflow as the supported answer.
-/// - `llamacpp → hybrid` — the stored transcript already has an inlined
-///   visual description produced by llama-swap's vision slot. Switching
-///   to hybrid mid-conversation would mix description sources across
-///   subsequent turns (any new image in the chat continuation would be
-///   described by ollama-vision while the original was described by
-///   llama-vision). Reject for consistency.
-///
-/// All other transitions are allowed. `local ↔ llamacpp` works because
-/// LlamaCppClient passes image content-parts through to the chat slot —
-/// the user is responsible for picking a vision-capable chat model in
-/// that case. `hybrid ↔ llamacpp` works because both transcripts are
-/// text-only (visual description inlined at bootstrap).
+/// Whether "local" routes through Ollama or llama-swap is decided at
+/// startup by `LLM_BACKEND`; both share the same transcript shape from
+/// the chat-replay perspective.
 fn validate_cross_replay(stored: &str, effective: &str) -> Result<()> {
-    if !matches!(effective, "local" | "hybrid" | "llamacpp") {
+    if !matches!(effective, "local" | "hybrid") {
         bail!(
-            "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
+            "unknown backend '{}'; expected 'local' or 'hybrid'",
             effective
         );
     }
     if stored == "local" && effective == "hybrid" {
         bail!(
-            "switching from local to hybrid mid-chat isn't supported yet; \
-             regenerate the insight in hybrid mode if you want OpenRouter chat"
-        );
-    }
-    if stored == "llamacpp" && effective == "hybrid" {
-        bail!(
-            "switching from llamacpp to hybrid mid-chat isn't supported yet; \
+            "switching from local to hybrid mid-chat isn't supported; \
              regenerate the insight in hybrid mode if you want OpenRouter chat"
         );
     }
@@ -1576,9 +1551,9 @@ fn resolve_bootstrap_backend(supplied: Option<&str>) -> Result<String> {
         .map(|s| s.trim().to_lowercase())
         .filter(|s| !s.is_empty())
         .unwrap_or_else(|| "local".to_string());
-    if !matches!(lower.as_str(), "local" | "hybrid" | "llamacpp") {
+    if !matches!(lower.as_str(), "local" | "hybrid") {
         bail!(
-            "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
+            "unknown backend '{}'; expected 'local' or 'hybrid'",
             lower
         );
     }
@@ -2184,10 +2159,6 @@ mod tests {
     fn bootstrap_backend_accepts_local_and_hybrid_case_insensitively() {
         assert_eq!(resolve_bootstrap_backend(Some("LOCAL")).unwrap(), "local");
         assert_eq!(resolve_bootstrap_backend(Some("Hybrid")).unwrap(), "hybrid");
-        assert_eq!(
-            resolve_bootstrap_backend(Some("Llamacpp")).unwrap(),
-            "llamacpp"
-        );
         assert_eq!(
             resolve_bootstrap_backend(Some("  local  ")).unwrap(),
             "local"
@@ -2196,10 +2167,13 @@ mod tests {
 
     #[test]
     fn bootstrap_backend_rejects_unknown_label() {
-        let err = resolve_bootstrap_backend(Some("openrouter")).unwrap_err();
-        let msg = format!("{}", err);
-        assert!(msg.contains("unknown backend"));
-        assert!(msg.contains("openrouter"));
+        // `llamacpp` is no longer a per-request backend value — it's chosen
+        // at deploy time via `LLM_BACKEND`.
+        for label in &["openrouter", "llamacpp", "ollama"] {
+            let err = resolve_bootstrap_backend(Some(label)).unwrap_err();
+            let msg = format!("{}", err);
+            assert!(msg.contains("unknown backend"), "label={}", label);
+        }
     }
 
     #[test]
@@ -2209,29 +2183,20 @@ mod tests {
     }
 
     #[test]
-    fn cross_replay_rejects_llamacpp_to_hybrid() {
-        let err = validate_cross_replay("llamacpp", "hybrid").unwrap_err();
-        assert!(format!("{}", err).contains("llamacpp to hybrid"));
-    }
-
-    #[test]
-    fn cross_replay_allows_local_llamacpp_and_hybrid_llamacpp_transitions() {
-        // Local ↔ llamacpp: user is responsible for picking a vision-capable
-        // chat slot when the transcript has images.
-        assert!(validate_cross_replay("local", "llamacpp").is_ok());
-        assert!(validate_cross_replay("llamacpp", "local").is_ok());
-        // Hybrid ↔ llamacpp: both transcripts are text-only.
-        assert!(validate_cross_replay("hybrid", "llamacpp").is_ok());
-        // Same-backend replays are always fine.
+    fn cross_replay_allows_supported_transitions() {
         assert!(validate_cross_replay("local", "local").is_ok());
         assert!(validate_cross_replay("hybrid", "hybrid").is_ok());
-        assert!(validate_cross_replay("llamacpp", "llamacpp").is_ok());
+        // Hybrid → local replays the inlined description as plain text.
+        assert!(validate_cross_replay("hybrid", "local").is_ok());
     }
 
     #[test]
     fn cross_replay_rejects_unknown_effective() {
-        let err = validate_cross_replay("local", "openrouter").unwrap_err();
-        assert!(format!("{}", err).contains("unknown backend"));
+        // Both "openrouter" and the former "llamacpp" value are unknown now.
+        for label in &["openrouter", "llamacpp"] {
+            let err = validate_cross_replay("local", label).unwrap_err();
+            assert!(format!("{}", err).contains("unknown backend"), "label={}", label);
+        }
     }
 
     #[test]
diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs
index 2a11e29..08fcfef 100644
--- a/src/ai/insight_generator.rs
+++ b/src/ai/insight_generator.rs
@@ -471,8 +471,11 @@ impl InsightGenerator {
         log::info!("RAG QUERY: {}", query);
         log::info!("========================================");
 
-        // Generate embedding for the query
-        let query_embedding = self.ollama.generate_embedding(&query).await?;
+        // Generate embedding for the query via the configured local backend
+        // (`LLM_BACKEND` switch). Must match the backend that populated the
+        // daily-summary embeddings or similarity search will be garbage.
+        let query_embedding =
+            crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &query).await?;
 
         // Search for similar daily summaries with time-based weighting
         // This prioritizes summaries temporally close to the query date
@@ -563,7 +566,7 @@ impl InsightGenerator {
         let calendar_cx = parent_cx.with_span(span);
 
         let query_embedding = if let Some(loc) = location {
-            match self.ollama.generate_embedding(loc).await {
+            match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), loc).await {
                 Ok(emb) => Some(emb),
                 Err(e) => {
                     log::warn!("Failed to generate embedding for location '{}': {}", loc, e);
@@ -734,16 +737,17 @@ impl InsightGenerator {
             )
         };
 
-        let query_embedding = match self.ollama.generate_embedding(&query_text).await {
-            Ok(emb) => emb,
-            Err(e) => {
-                log::warn!("Failed to generate search embedding: {}", e);
-                search_cx.span().set_status(Status::Error {
-                    description: e.to_string().into(),
-                });
-                return Ok(None);
-            }
-        };
+        let query_embedding =
+            match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &query_text).await {
+                Ok(emb) => emb,
+                Err(e) => {
+                    log::warn!("Failed to generate search embedding: {}", e);
+                    search_cx.span().set_status(Status::Error {
+                        description: e.to_string().into(),
+                    });
+                    return Ok(None);
+                }
+            };
 
         let searches = {
             let mut dao = self
@@ -2608,11 +2612,13 @@ Return ONLY the summary, nothing else."#,
         }
     }
 
-    /// Tool: store_entity — upsert an entity into the knowledge memory
+    /// Tool: store_entity — upsert an entity into the knowledge memory.
+    /// Embeddings go through the configured local backend (`LLM_BACKEND`),
+    /// independent of the per-request chat backend in the caller.
     async fn tool_store_entity(
         &self,
         args: &serde_json::Value,
-        ollama: &OllamaClient,
+        _ollama: &OllamaClient,
         cx: &opentelemetry::Context,
     ) -> String {
         use crate::database::models::InsertEntity;
@@ -2672,9 +2678,16 @@ Return ONLY the summary, nothing else."#,
                 .collect()
         };
 
-        // Generate embedding for name + description (best-effort)
+        // Generate embedding for name + description (best-effort) via the
+        // configured local backend.
         let embed_text = format!("{} {}", name, description);
-        let embedding: Option<Vec<u8>> = match ollama.generate_embedding(&embed_text).await {
+        let embedding: Option<Vec<u8>> = match crate::ai::embed_one(
+            &self.ollama,
+            self.llamacpp.as_deref(),
+            &embed_text,
+        )
+        .await
+        {
             Ok(vec) => {
                 let bytes: Vec<u8> = vec.iter().flat_map(|f| f.to_le_bytes()).collect();
                 Some(bytes)
@@ -3580,20 +3593,24 @@ Return ONLY the summary, nothing else."#,
             .map(|s| s.trim().to_lowercase())
             .filter(|s| !s.is_empty())
             .unwrap_or_else(|| "local".to_string());
-        if !matches!(backend_label.as_str(), "local" | "hybrid" | "llamacpp") {
+        if !matches!(backend_label.as_str(), "local" | "hybrid") {
             return Err(anyhow::anyhow!(
-                "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
+                "unknown backend '{}'; expected 'local' or 'hybrid'",
                 backend_label
             ));
         }
         span.set_attribute(KeyValue::new("backend", backend_label.clone()));
         let is_hybrid = backend_label == "hybrid";
-        let is_llamacpp = backend_label == "llamacpp";
-        // In hybrid + llamacpp modes the chat model never sees the image
-        // directly; we describe-then-inline locally before the agentic loop
-        // starts. Tracked as a single flag so vision/tool-gate logic doesn't
-        // have to branch twice.
-        let describes_then_inlines = is_hybrid || is_llamacpp;
+        // `LLM_BACKEND=llamacpp` swaps Ollama out for llama-swap as the
+        // "local" stack — chat + vision describe + embeddings all route
+        // through llama-swap. In hybrid mode this still applies to vision
+        // describe (chat continues to go to OpenRouter). The chat slot is
+        // text-only in either case, so we describe-then-inline.
+        let local_via_llamacpp =
+            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
+        // Describe-then-inline: hybrid (chat is OpenRouter, text-only) or
+        // any path where chat goes through llama-swap (chat slot is text-only).
+        let describes_then_inlines = is_hybrid || local_via_llamacpp;
 
         // 1b. Always build an Ollama client. In local mode it owns the chat
         //     loop; in hybrid/llamacpp mode it still handles tool-local calls
@@ -3688,13 +3705,14 @@ Return ONLY the summary, nothing else."#,
             None
         };
 
-        // 1d. In llamacpp mode, clone the configured LlamaCpp client and
-        //     apply per-request overrides. Same shape as the openrouter
-        //     branch above; describe_image will route through the vision
-        //     slot configured on the client.
-        let llamacpp_client: Option<LlamaCppClient> = if is_llamacpp {
+        // 1d. When `LLM_BACKEND=llamacpp` and we're in local mode (not
+        //     hybrid — hybrid keeps chat on OpenRouter), clone the llamacpp
+        //     client and apply per-request overrides. Same shape as the
+        //     openrouter branch above; describe_image will route through
+        //     the vision slot configured on the client.
+        let llamacpp_client: Option<LlamaCppClient> = if local_via_llamacpp && !is_hybrid {
             let arc = self.llamacpp.as_ref().ok_or_else(|| {
-                anyhow::anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
+                anyhow::anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured")
             })?;
             let mut c: LlamaCppClient = (**arc).clone();
             if let Some(ref m) = custom_model {
@@ -3917,38 +3935,19 @@ Return ONLY the summary, nothing else."#,
             None
         };
 
-        // describe-then-inline path. In hybrid mode the vision backend
-        // defaults to Ollama but can be flipped to llamacpp via
-        // `HYBRID_VISION_BACKEND=llamacpp` (so chat goes to OpenRouter while
-        // vision/audio routes through llama-swap). In llamacpp mode we always
-        // use the llamacpp client's configured vision slot.
+        // describe-then-inline path. Vision describe routes through whichever
+        // `LLM_BACKEND` is configured — llama-swap when `local_via_llamacpp`
+        // is set (even in hybrid mode, since chat is OpenRouter but vision
+        // stays on the local stack), otherwise Ollama.
         let inlined_visual_description: Option<String> = if describes_then_inlines {
             match image_base64.as_deref() {
                 Some(b64) => {
-                    let use_llamacpp_vision = if is_llamacpp {
-                        true
-                    } else {
-                        // is_hybrid branch — consult env switch
-                        matches!(
-                            std::env::var("HYBRID_VISION_BACKEND")
-                                .ok()
-                                .as_deref()
-                                .map(|s| s.trim().to_lowercase())
-                                .as_deref(),
-                            Some("llamacpp")
-                        )
-                    };
-
-                    let described = if use_llamacpp_vision {
-                        match self.llamacpp.as_ref() {
-                            Some(c) => c.describe_image(b64).await,
-                            None => {
-                                log::warn!(
-                                    "describe-then-inline: requested llamacpp vision but LLAMA_SWAP_URL is unset, falling back to Ollama"
-                                );
-                                self.ollama.describe_image(b64).await
-                            }
-                        }
+                    let described = if local_via_llamacpp {
+                        self.llamacpp
+                            .as_ref()
+                            .expect("local_via_llamacpp guarantees Some")
+                            .describe_image(b64)
+                            .await
                     } else {
                         self.ollama.describe_image(b64).await
                     };
@@ -4044,10 +4043,10 @@ Return ONLY the summary, nothing else."#,
         );
 
         // 10. Define tools. Gate flags computed from current data presence;
-        //     describe-then-inline modes (hybrid, llamacpp) omit describe_photo
-        //     since the chat model receives the visual description inline (so
-        //     we pass `false` for has_vision in those modes regardless of the
-        //     model's actual capability).
+        //     describe-then-inline modes (hybrid OR local_via_llamacpp) omit
+        //     describe_photo since the chat model receives the visual
+        //     description inline (so we pass `false` for has_vision in
+        //     those modes regardless of the model's actual capability).
         let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines);
         let tools = Self::build_tool_definitions(gate_opts);
 
diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs
index 100020c..d23fad5 100644
--- a/src/ai/llamacpp.rs
+++ b/src/ai/llamacpp.rs
@@ -11,10 +11,10 @@
 //   `model` field, which is how llama-swap selects which backend process to
 //   run.
 // - `/v1/models` returns only the configured slot ids — capabilities aren't
-//   reported by the API, so `vision_models` is a config-time allowlist (env
-//   `LLAMA_SWAP_VISION_MODELS`) used to set `has_vision` on responses.
-//   `has_tool_calling` is assumed true for every slot, since llama-swap entries
-//   default to launching llama-server with `--jinja`.
+//   reported by the API, so we infer `has_vision` from a single config field
+//   (`vision_model`, defaulting to `"vision"`) and assume `has_tool_calling`
+//   is true for every slot, since llama-swap entries default to launching
+//   llama-server with `--jinja`.
 //
 // First consumer lands alongside the three-way backend dispatch in
 // insight_generator / insight_chat.
@@ -50,16 +50,10 @@ pub struct LlamaCppClient {
     /// Embedding model slot id (e.g. `"embed"`). Used for
     /// `generate_embeddings`.
     pub embedding_model: String,
-    /// Vision model slot id (e.g. `"vision"`). Used for `describe_image` and
-    /// included in `vision_models` automatically so capability lookups for
-    /// the default vision slot report `has_vision = true` even when the env
-    /// allowlist is empty.
+    /// Vision model slot id (e.g. `"vision"`). Used for `describe_image`,
+    /// and the only slot that reports `has_vision = true` in capability
+    /// lookups (llama-swap's `/v1/models` doesn't surface modality).
     pub vision_model: String,
-    /// Operator-curated set of slot ids known to be multimodal. Drives the
-    /// `has_vision` field in `list_models` / `model_capabilities`, since
-    /// llama-swap's `/v1/models` doesn't report modality. Empty allowlist
-    /// still marks `vision_model` as vision-capable.
-    pub vision_models: Vec<String>,
     num_ctx: Option<i32>,
     temperature: Option<f32>,
     top_p: Option<f32>,
@@ -83,7 +77,6 @@ impl LlamaCppClient {
             primary_model: primary_model.unwrap_or_else(|| DEFAULT_PRIMARY_MODEL.to_string()),
             embedding_model: DEFAULT_EMBEDDING_MODEL.to_string(),
             vision_model: DEFAULT_VISION_MODEL.to_string(),
-            vision_models: Vec::new(),
             num_ctx: None,
             temperature: None,
             top_p: None,
@@ -100,10 +93,6 @@ impl LlamaCppClient {
         self.vision_model = model;
     }
 
-    pub fn set_vision_models(&mut self, models: Vec<String>) {
-        self.vision_models = models;
-    }
-
     pub fn set_num_ctx(&mut self, num_ctx: Option<i32>) {
         self.num_ctx = num_ctx;
     }
@@ -692,7 +681,7 @@ impl LlamaCppClient {
             .and_then(|v| v.as_str())
             .unwrap_or_default()
             .to_string();
-        let has_vision = name == self.vision_model || self.vision_models.iter().any(|v| v == &name);
+        let has_vision = name == self.vision_model;
         // Tool calling is the default for llama-swap entries we configure
         // (--jinja flag); no negative-list mechanism yet, so report true.
         ModelCapabilities {
@@ -954,25 +943,21 @@ mod tests {
     }
 
     #[test]
-    fn capability_inference_uses_vision_model_and_allowlist() {
+    fn capability_inference_marks_only_vision_slot() {
         let mut c = LlamaCppClient::new(None, Some("chat".into()));
         c.set_vision_model("vision".into());
-        c.set_vision_models(vec!["qwen-vl".into()]);
 
         let m_chat = json!({ "id": "chat" });
         let m_vision = json!({ "id": "vision" });
-        let m_qwen = json!({ "id": "qwen-vl" });
         let m_other = json!({ "id": "embed" });
 
         let chat = c.parse_model_capabilities(&m_chat);
         let vision = c.parse_model_capabilities(&m_vision);
-        let qwen = c.parse_model_capabilities(&m_qwen);
         let other = c.parse_model_capabilities(&m_other);
 
         assert!(!chat.has_vision);
         assert!(chat.has_tool_calling);
         assert!(vision.has_vision);
-        assert!(qwen.has_vision);
         assert!(!other.has_vision);
     }
 }
diff --git a/src/ai/mod.rs b/src/ai/mod.rs
index 204da04..8d634fd 100644
--- a/src/ai/mod.rs
+++ b/src/ai/mod.rs
@@ -21,14 +21,14 @@ pub use handlers::{
     chat_history_handler, chat_rewind_handler, chat_stream_handler, chat_turn_handler,
     delete_insight_handler, export_training_data_handler, generate_agentic_insight_handler,
     generate_insight_handler, get_all_insights_handler, get_available_models_handler,
-    get_insight_handler, get_llamacpp_models_handler, get_openrouter_models_handler,
-    rate_insight_handler,
+    get_insight_handler, get_openrouter_models_handler, rate_insight_handler,
 };
 pub use insight_generator::InsightGenerator;
 #[allow(unused_imports)]
 pub use llm_client::{
     ChatMessage, LlmClient, ModelCapabilities, Tool, ToolCall, ToolCallFunction, ToolFunction,
 };
+pub use llamacpp::LlamaCppClient;
 pub use ollama::{EMBEDDING_MODEL, OllamaClient};
 pub use sms_client::{SmsApiClient, SmsMessage};
 
@@ -40,3 +40,87 @@ pub use sms_client::{SmsApiClient, SmsMessage};
 pub fn user_display_name() -> String {
     std::env::var("USER_NAME").unwrap_or_else(|_| "Me".to_string())
 }
+
+/// One switch for the "local" LLM stack: when `LLM_BACKEND=llamacpp` is
+/// set, chat / vision describe / embeddings all route through llama-swap
+/// instead of Ollama. Any other value (including unset, the default) is
+/// Ollama. This is intentionally global — embeddings must be drawn from
+/// a single source or similarity search across the index breaks (mixed
+/// vector spaces, possibly mixed dims). The `backend=hybrid` per-request
+/// override remains orthogonal: it always sends chat to OpenRouter, and
+/// uses `LLM_BACKEND` for the describe-then-inline vision pass.
+pub fn local_backend_is_llamacpp() -> bool {
+    matches!(
+        std::env::var("LLM_BACKEND")
+            .ok()
+            .as_deref()
+            .map(|s| s.trim().to_lowercase())
+            .as_deref(),
+        Some("llamacpp")
+    )
+}
+
+/// Embed one string via the configured local backend. Routes through
+/// llama-swap when `LLM_BACKEND=llamacpp` (and a client is configured),
+/// else Ollama. Returns the single embedding vector. See
+/// [`local_backend_is_llamacpp`] for the rationale on consistency.
+pub async fn embed_one(
+    ollama: &OllamaClient,
+    llamacpp: Option<&LlamaCppClient>,
+    text: &str,
+) -> anyhow::Result<Vec<f32>> {
+    if local_backend_is_llamacpp() {
+        if let Some(lc) = llamacpp {
+            let mut vecs = <LlamaCppClient as LlmClient>::generate_embeddings(lc, &[text]).await?;
+            return vecs
+                .pop()
+                .ok_or_else(|| anyhow::anyhow!("llama-swap returned no embeddings"));
+        }
+        log::warn!(
+            "LLM_BACKEND=llamacpp but LlamaCppClient is unconfigured; falling back to Ollama embeddings"
+        );
+    }
+    ollama.generate_embedding(text).await
+}
+
+#[cfg(test)]
+mod env_dispatch_tests {
+    use super::*;
+
+    fn with_env<F: FnOnce()>(key: &str, val: Option<&str>, f: F) {
+        let prev = std::env::var(key).ok();
+        match val {
+            Some(v) => unsafe { std::env::set_var(key, v) },
+            None => unsafe { std::env::remove_var(key) },
+        }
+        f();
+        match prev {
+            Some(v) => unsafe { std::env::set_var(key, v) },
+            None => unsafe { std::env::remove_var(key) },
+        }
+    }
+
+    #[test]
+    fn llm_backend_defaults_to_ollama() {
+        with_env("LLM_BACKEND", None, || {
+            assert!(!local_backend_is_llamacpp());
+        });
+    }
+
+    #[test]
+    fn llm_backend_llamacpp_case_insensitive() {
+        with_env("LLM_BACKEND", Some("LlamaCpp"), || {
+            assert!(local_backend_is_llamacpp());
+        });
+        with_env("LLM_BACKEND", Some("  llamacpp "), || {
+            assert!(local_backend_is_llamacpp());
+        });
+    }
+
+    #[test]
+    fn llm_backend_unknown_value_is_ollama() {
+        with_env("LLM_BACKEND", Some("vllm"), || {
+            assert!(!local_backend_is_llamacpp());
+        });
+    }
+}
diff --git a/src/main.rs b/src/main.rs
index 3bd3656..63013ce 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -313,7 +313,6 @@ fn main() -> std::io::Result<()> {
                 .service(ai::get_all_insights_handler)
                 .service(ai::get_available_models_handler)
                 .service(ai::get_openrouter_models_handler)
-                .service(ai::get_llamacpp_models_handler)
                 .service(ai::chat_turn_handler)
                 .service(ai::chat_stream_handler)
                 .service(ai::chat_history_handler)
diff --git a/src/state.rs b/src/state.rs
index 96d1c22..c4f810a 100644
--- a/src/state.rs
+++ b/src/state.rs
@@ -358,10 +358,11 @@ fn parse_openrouter_allowed_models() -> Vec<String> {
 }
 
 /// Build a `LlamaCppClient` from environment variables. Returns `None` when
-/// `LLAMA_SWAP_URL` is unset (the llamacpp backend is then unavailable and
-/// requests for it return a clear error). The slot ids default to the
-/// names the bundled `llama-swap/config.yaml` uses — `chat` / `vision` /
-/// `embed` — so a minimal deploy only needs to set `LLAMA_SWAP_URL`.
+/// `LLAMA_SWAP_URL` is unset. The client is constructed unconditionally
+/// when the URL is set (so it's available even under `LLM_BACKEND=ollama`
+/// for ad-hoc tooling), but the agentic / chat paths only route through it
+/// when `LLM_BACKEND=llamacpp`. Slot ids default to the names the bundled
+/// `llama-swap/config.yaml` uses — `chat` / `vision` / `embed`.
 fn build_llamacpp_from_env() -> Option<Arc<LlamaCppClient>> {
     let base_url = env::var("LLAMA_SWAP_URL").ok()?;
     let primary_model = env::var("LLAMA_SWAP_PRIMARY_MODEL").ok();
@@ -372,12 +373,12 @@ fn build_llamacpp_from_env() -> Option<Arc<LlamaCppClient>> {
     if let Ok(model) = env::var("LLAMA_SWAP_VISION_MODEL") {
         client.set_vision_model(model);
     }
-    client.set_vision_models(parse_llamacpp_vision_models());
     Some(Arc::new(client))
 }
 
 /// Parse `LLAMA_SWAP_ALLOWED_MODELS` (comma-separated) into a vec. Used to
-/// drive `/insights/llamacpp/models`; empty when unset.
+/// populate the model picker when `LLM_BACKEND=llamacpp` — `/insights/models`
+/// surfaces these slots with capabilities. Empty when unset.
 fn parse_llamacpp_allowed_models() -> Vec<String> {
     env::var("LLAMA_SWAP_ALLOWED_MODELS")
         .unwrap_or_default()
@@ -387,20 +388,6 @@ fn parse_llamacpp_allowed_models() -> Vec<String> {
         .collect()
 }
 
-/// Parse `LLAMA_SWAP_VISION_MODELS` (comma-separated) — slot ids that report
-/// `has_vision = true` in capability lookups. The configured `vision_model`
-/// (default `vision`) is always considered vision-capable regardless of this
-/// list, so a deploy that only uses the default vision slot can leave it
-/// unset.
-fn parse_llamacpp_vision_models() -> Vec<String> {
-    env::var("LLAMA_SWAP_VISION_MODELS")
-        .unwrap_or_default()
-        .split(',')
-        .map(|s| s.trim().to_string())
-        .filter(|s| !s.is_empty())
-        .collect()
-}
-
 #[cfg(test)]
 impl AppState {
     /// Creates an AppState instance for testing with temporary directories
-- 
2.49.1


From 0631820fbf55dd27a522212b9656183614fc4a39 Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Sun, 24 May 2026 14:00:37 -0400
Subject: [PATCH 04/11] ai: send images directly to llamacpp chat models + add
 ResolvedBackend
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

llamacpp models now receive images via OpenAI content-parts instead of
the describe-then-inline strategy (hybrid mode unchanged). Fixes
assistant messages with tool_calls emitting content: null instead of ""
to satisfy strict Jinja template role-alternation checks. Adds debug
logging of message role sequences on llamacpp requests.

Introduces BackendKind enum, SamplingOverrides, and ResolvedBackend in
a new backend.rs module. InsightGenerator::resolve_backend centralises
client construction + vision capability detection — next step wires the
existing inline dispatch through it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/ai/backend.rs           | 118 ++++++++++++++++
 src/ai/handlers.rs          |   2 +-
 src/ai/insight_chat.rs      |  33 ++---
 src/ai/insight_generator.rs | 262 ++++++++++++++++++++++++++++++------
 src/ai/llamacpp.rs          |  49 +++++--
 src/ai/mod.rs               |   1 +
 6 files changed, 395 insertions(+), 70 deletions(-)
 create mode 100644 src/ai/backend.rs

diff --git a/src/ai/backend.rs b/src/ai/backend.rs
new file mode 100644
index 0000000..0fe7747
--- /dev/null
+++ b/src/ai/backend.rs
@@ -0,0 +1,118 @@
+use anyhow::{Result, anyhow};
+
+use crate::ai::llm_client::LlmClient;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum BackendKind {
+    Local,
+    Hybrid,
+}
+
+impl BackendKind {
+    pub fn parse(s: &str) -> Result<Self> {
+        match s.trim().to_lowercase().as_str() {
+            "local" | "" => Ok(Self::Local),
+            "hybrid" => Ok(Self::Hybrid),
+            other => Err(anyhow!("unknown backend '{}'; expected 'local' or 'hybrid'", other)),
+        }
+    }
+
+    pub fn as_str(&self) -> &'static str {
+        match self {
+            Self::Local => "local",
+            Self::Hybrid => "hybrid",
+        }
+    }
+}
+
+impl std::fmt::Display for BackendKind {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(self.as_str())
+    }
+}
+
+pub struct SamplingOverrides {
+    pub model: Option<String>,
+    pub num_ctx: Option<i32>,
+    pub temperature: Option<f32>,
+    pub top_p: Option<f32>,
+    pub top_k: Option<i32>,
+    pub min_p: Option<f32>,
+}
+
+impl SamplingOverrides {
+    pub fn has_sampling(&self) -> bool {
+        self.temperature.is_some()
+            || self.top_p.is_some()
+            || self.top_k.is_some()
+            || self.min_p.is_some()
+    }
+}
+
+pub struct ResolvedBackend {
+    chat: Box<dyn LlmClient>,
+    local: Box<dyn LlmClient>,
+    pub kind: BackendKind,
+    /// `true` when the chat model receives images directly (Ollama with
+    /// vision, or llamacpp). `false` for hybrid where we describe-then-inline.
+    pub images_inline: bool,
+}
+
+impl ResolvedBackend {
+    pub fn new(
+        chat: Box<dyn LlmClient>,
+        local: Box<dyn LlmClient>,
+        kind: BackendKind,
+        images_inline: bool,
+    ) -> Self {
+        Self { chat, local, kind, images_inline }
+    }
+
+    pub fn chat(&self) -> &dyn LlmClient {
+        self.chat.as_ref()
+    }
+
+    pub fn local(&self) -> &dyn LlmClient {
+        self.local.as_ref()
+    }
+
+    pub fn model(&self) -> &str {
+        self.chat.primary_model()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn parse_backend_kind() {
+        assert_eq!(BackendKind::parse("local").unwrap(), BackendKind::Local);
+        assert_eq!(BackendKind::parse("hybrid").unwrap(), BackendKind::Hybrid);
+        assert_eq!(BackendKind::parse("  Local ").unwrap(), BackendKind::Local);
+        assert_eq!(BackendKind::parse("HYBRID").unwrap(), BackendKind::Hybrid);
+        assert_eq!(BackendKind::parse("").unwrap(), BackendKind::Local);
+        assert!(BackendKind::parse("vllm").is_err());
+    }
+
+    #[test]
+    fn backend_kind_as_str_roundtrips() {
+        assert_eq!(BackendKind::parse(BackendKind::Local.as_str()).unwrap(), BackendKind::Local);
+        assert_eq!(BackendKind::parse(BackendKind::Hybrid.as_str()).unwrap(), BackendKind::Hybrid);
+    }
+
+    #[test]
+    fn sampling_overrides_has_sampling() {
+        let empty = SamplingOverrides {
+            model: None, num_ctx: None, temperature: None,
+            top_p: None, top_k: None, min_p: None,
+        };
+        assert!(!empty.has_sampling());
+
+        let with_temp = SamplingOverrides {
+            model: None, num_ctx: Some(4096), temperature: Some(0.7),
+            top_p: None, top_k: None, min_p: None,
+        };
+        assert!(with_temp.has_sampling());
+    }
+}
diff --git a/src/ai/handlers.rs b/src/ai/handlers.rs
index c86a5c8..a7a3720 100644
--- a/src/ai/handlers.rs
+++ b/src/ai/handlers.rs
@@ -495,7 +495,7 @@ pub async fn get_available_models_handler(
             .iter()
             .map(|name| ModelCapabilities {
                 name: name.clone(),
-                has_vision: name == &lc.vision_model,
+                has_vision: true,
                 has_tool_calling: true,
             })
             .collect();
diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs
index b549dab..54350df 100644
--- a/src/ai/insight_chat.rs
+++ b/src/ai/insight_chat.rs
@@ -311,7 +311,7 @@ impl InsightChatService {
         let is_hybrid = effective_backend == "hybrid";
         let local_via_llamacpp =
             crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
-        let describes_then_inlines = is_hybrid || local_via_llamacpp;
+        let describes_then_inlines = is_hybrid;
         span.set_attribute(KeyValue::new("backend", effective_backend.clone()));
 
         // 4. Build the chat backend client. Hybrid → OpenRouter; local with
@@ -408,12 +408,11 @@ impl InsightChatService {
         let model_used = chat_backend.primary_model().to_string();
         span.set_attribute(KeyValue::new("model", model_used.clone()));
 
-        // 5. Decide vision + tool set. In describe-then-inline modes
-        //    (hybrid, llamacpp) we always omit `describe_photo` (matches the
-        //    original generation flow). In local we trust the stored
-        //    history's first-user shape: if it carries `images`, the
-        //    original model was vision-capable, and we keep `describe_photo`
-        //    available.
+        // 5. Decide vision + tool set. In describe-then-inline mode
+        //    (hybrid only) we omit `describe_photo`. In local and llamacpp
+        //    we trust the stored history's first-user shape: if it carries
+        //    `images`, the original model was vision-capable, and we keep
+        //    `describe_photo` available.
         let local_first_user_has_image = messages
             .iter()
             .find(|m| m.role == "user")
@@ -821,9 +820,7 @@ impl InsightChatService {
             .unwrap_or_else(|| stored_backend.clone());
         validate_cross_replay(&stored_backend, &effective_backend)?;
         let is_hybrid = effective_backend == "hybrid";
-        let local_via_llamacpp =
-            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
-        let describes_then_inlines = is_hybrid || local_via_llamacpp;
+        let describes_then_inlines = is_hybrid;
 
         let max_iterations = req
             .max_iterations
@@ -842,10 +839,9 @@ impl InsightChatService {
         let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
         let model_used = chat_backend.primary_model().to_string();
 
-        // Tool set — local mode + first user turn carries an image →
-        // offer describe_photo. Describe-then-inline modes (hybrid OR
-        // local_via_llamacpp): visual description was inlined when the
-        // insight was bootstrapped, no describe tool needed.
+        // Tool set — local/llamacpp mode + first user turn carries an image →
+        // offer describe_photo. Describe-then-inline mode (hybrid only):
+        // visual description was inlined at bootstrap, no describe tool needed.
         let local_first_user_has_image = messages
             .iter()
             .find(|m| m.role == "user")
@@ -991,7 +987,7 @@ impl InsightChatService {
         let is_hybrid = effective_backend == "hybrid";
         let local_via_llamacpp =
             crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
-        let describes_then_inlines = is_hybrid || local_via_llamacpp;
+        let describes_then_inlines = is_hybrid;
 
         let max_iterations = req
             .max_iterations
@@ -1023,10 +1019,9 @@ impl InsightChatService {
                 _ => None,
             });
 
-        // Describe-then-inline (hybrid OR local_via_llamacpp): pre-describe
-        // the image so a text-only chat model gets the visual description
-        // inline. Vision source follows `LLM_BACKEND`: llama-swap when
-        // `local_via_llamacpp`, else Ollama.
+        // Describe-then-inline (hybrid only): pre-describe the image so a
+        // text-only chat model gets the visual description inline. llamacpp
+        // sends images directly to the chat model.
         let visual_block = if describes_then_inlines {
             match image_base64.as_deref() {
                 Some(b64) => {
diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs
index 08fcfef..075001a 100644
--- a/src/ai/insight_generator.rs
+++ b/src/ai/insight_generator.rs
@@ -10,6 +10,7 @@ use std::io::Cursor;
 use std::sync::{Arc, Mutex};
 
 use crate::ai::apollo_client::{ApolloClient, ApolloPlace};
+use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides};
 use crate::ai::llm_client::LlmClient;
 use crate::ai::ollama::{ChatMessage, OllamaClient, Tool};
 use crate::ai::llamacpp::LlamaCppClient;
@@ -1781,14 +1782,18 @@ Return ONLY the summary, nothing else."#,
         );
 
         let started = std::time::Instant::now();
-        let response = ollama
-            .generate_no_think(
-                &prompt,
-                Some(
-                    "You are a terse relevance ranker. You output only numbers separated by commas.",
-                ),
-            )
-            .await?;
+        let system = Some(
+            "You are a terse relevance ranker. You output only numbers separated by commas.",
+        );
+        let response = if crate::ai::local_backend_is_llamacpp() {
+            if let Some(ref lc) = self.llamacpp {
+                lc.generate(&prompt, system, None).await?
+            } else {
+                ollama.generate_no_think(&prompt, system).await?
+            }
+        } else {
+            ollama.generate_no_think(&prompt, system).await?
+        };
         log::info!(
             "rerank: finished in {} ms (prompt={} chars)",
             started.elapsed().as_millis(),
@@ -2360,7 +2365,8 @@ Return ONLY the summary, nothing else."#,
         out
     }
 
-    /// Tool: describe_photo — generate a visual description of the photo
+    /// Tool: describe_photo — generate a visual description of the photo.
+    /// Routes through llama-swap when `LLM_BACKEND=llamacpp`, Ollama otherwise.
     async fn tool_describe_photo(
         &self,
         ollama: &OllamaClient,
@@ -2369,10 +2375,21 @@ Return ONLY the summary, nothing else."#,
         log::info!("tool_describe_photo: generating visual description");
 
         match image_base64 {
-            Some(img) => match ollama.generate_photo_description(img).await {
-                Ok(desc) => desc,
-                Err(e) => format!("Error describing photo: {}", e),
-            },
+            Some(img) => {
+                let result = if crate::ai::local_backend_is_llamacpp() {
+                    if let Some(ref lc) = self.llamacpp {
+                        lc.describe_image(img).await
+                    } else {
+                        ollama.generate_photo_description(img).await
+                    }
+                } else {
+                    ollama.generate_photo_description(img).await
+                };
+                match result {
+                    Ok(desc) => desc,
+                    Err(e) => format!("Error describing photo: {}", e),
+                }
+            }
             None => "No image available for description.".to_string(),
         }
     }
@@ -3560,6 +3577,177 @@ Return ONLY the summary, nothing else."#,
         out
     }
 
+    /// Consolidate client construction for the agentic insight loop.
+    ///
+    /// Returns a [`ResolvedBackend`] containing the **chat** client (the model
+    /// that drives the agent loop), the **local** client (always the configured
+    /// local backend — Ollama or llama-swap — for utility calls like
+    /// describe_image, rerank, embeddings), the backend kind, and whether the
+    /// chat model receives images inline.
+    pub async fn resolve_backend(
+        &self,
+        kind: BackendKind,
+        overrides: &SamplingOverrides,
+    ) -> Result<ResolvedBackend> {
+        let local_via_llamacpp =
+            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
+        let is_hybrid = kind == BackendKind::Hybrid;
+
+        // ── chat client ────────────────────────────────────────────────
+        let chat: Box<dyn LlmClient> = if is_hybrid {
+            // Hybrid: chat through OpenRouter.
+            let arc = self.openrouter.as_ref().ok_or_else(|| {
+                anyhow::anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
+            })?;
+            let mut c: OpenRouterClient = (**arc).clone();
+            if let Some(ref m) = overrides.model {
+                c.primary_model = m.clone();
+            }
+            if overrides.has_sampling() {
+                c.set_sampling_params(
+                    overrides.temperature,
+                    overrides.top_p,
+                    overrides.top_k,
+                    overrides.min_p,
+                );
+            }
+            if let Some(ctx) = overrides.num_ctx {
+                c.set_num_ctx(Some(ctx));
+            }
+            Box::new(c)
+        } else if local_via_llamacpp {
+            // Local via llama-swap.
+            let arc = self.llamacpp.as_ref().ok_or_else(|| {
+                anyhow::anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured")
+            })?;
+            let mut c: LlamaCppClient = (**arc).clone();
+            if let Some(ref m) = overrides.model {
+                c.primary_model = m.clone();
+            }
+            if overrides.has_sampling() {
+                c.set_sampling_params(
+                    overrides.temperature,
+                    overrides.top_p,
+                    overrides.top_k,
+                    overrides.min_p,
+                );
+            }
+            if let Some(ctx) = overrides.num_ctx {
+                c.set_num_ctx(Some(ctx));
+            }
+            Box::new(c)
+        } else {
+            // Pure Ollama local.
+            let mut ollama_client = if let Some(ref model) = overrides.model {
+                OllamaClient::new(
+                    self.ollama.primary_url.clone(),
+                    self.ollama.fallback_url.clone(),
+                    model.clone(),
+                    Some(model.clone()),
+                )
+            } else {
+                self.ollama.clone()
+            };
+            if overrides.has_sampling() {
+                ollama_client.set_sampling_params(
+                    overrides.temperature,
+                    overrides.top_p,
+                    overrides.top_k,
+                    overrides.min_p,
+                );
+            }
+            if let Some(ctx) = overrides.num_ctx {
+                ollama_client.set_num_ctx(Some(ctx));
+            }
+            Box::new(ollama_client)
+        };
+
+        // ── local client (utility calls: rerank, describe_image, etc.) ─
+        let local: Box<dyn LlmClient> = if local_via_llamacpp {
+            Box::new(self.llamacpp.as_ref().unwrap().as_ref().clone())
+        } else {
+            Box::new(self.ollama.clone())
+        };
+
+        // ── images_inline ──────────────────────────────────────────────
+        let images_inline = if is_hybrid {
+            // Hybrid: chat model never sees images — describe-then-inject.
+            false
+        } else if local_via_llamacpp {
+            // llama-swap models receive images directly via OpenAI content
+            // parts. Capability probing isn't available (no `/api/show`),
+            // so assume vision support; a misconfigured model surfaces as
+            // a chat-call error.
+            true
+        } else {
+            // Pure Ollama: probe model capabilities.
+            let ollama_for_caps = if let Some(ref model) = overrides.model {
+                // Verify custom model is available on at least one server.
+                let available_on_primary =
+                    OllamaClient::is_model_available(&self.ollama.primary_url, model)
+                        .await
+                        .unwrap_or(false);
+
+                let available_on_fallback =
+                    if let Some(ref fallback_url) = self.ollama.fallback_url {
+                        OllamaClient::is_model_available(fallback_url, model)
+                            .await
+                            .unwrap_or(false)
+                    } else {
+                        false
+                    };
+
+                if !available_on_primary && !available_on_fallback {
+                    anyhow::bail!(
+                        "model not available: '{}' not found on any configured server",
+                        model
+                    );
+                }
+                model.as_str()
+            } else {
+                self.ollama.primary_model.as_str()
+            };
+
+            let capabilities = match OllamaClient::check_model_capabilities(
+                &self.ollama.primary_url,
+                ollama_for_caps,
+            )
+            .await
+            {
+                Ok(caps) => caps,
+                Err(_) => {
+                    let fallback_url =
+                        self.ollama.fallback_url.as_deref().ok_or_else(|| {
+                            anyhow::anyhow!(
+                                "Failed to check model capabilities for '{}': model not found on primary server and no fallback configured",
+                                ollama_for_caps
+                            )
+                        })?;
+                    OllamaClient::check_model_capabilities(fallback_url, ollama_for_caps)
+                        .await
+                        .map_err(|e| {
+                            anyhow::anyhow!(
+                                "Failed to check model capabilities for '{}': {}",
+                                ollama_for_caps,
+                                e
+                            )
+                        })?
+                }
+            };
+
+            if !capabilities.has_tool_calling {
+                anyhow::bail!(
+                    "tool calling not supported by model '{}'",
+                    ollama_for_caps
+                );
+            }
+
+            capabilities.has_vision
+        };
+
+        Ok(ResolvedBackend::new(chat, local, kind, images_inline))
+    }
+
     pub async fn generate_agentic_insight_for_photo(
         &self,
         file_path: &str,
@@ -3602,26 +3790,22 @@ Return ONLY the summary, nothing else."#,
         span.set_attribute(KeyValue::new("backend", backend_label.clone()));
         let is_hybrid = backend_label == "hybrid";
         // `LLM_BACKEND=llamacpp` swaps Ollama out for llama-swap as the
-        // "local" stack — chat + vision describe + embeddings all route
-        // through llama-swap. In hybrid mode this still applies to vision
-        // describe (chat continues to go to OpenRouter). The chat slot is
-        // text-only in either case, so we describe-then-inline.
+        // "local" stack — chat + embeddings route through llama-swap.
+        // llamacpp models receive images directly (vision-capable); only
+        // hybrid mode (OpenRouter chat) uses describe-then-inline.
         let local_via_llamacpp =
             crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
-        // Describe-then-inline: hybrid (chat is OpenRouter, text-only) or
-        // any path where chat goes through llama-swap (chat slot is text-only).
-        let describes_then_inlines = is_hybrid || local_via_llamacpp;
+        let describes_then_inlines = is_hybrid;
+        let ollama_is_chat = !is_hybrid && !local_via_llamacpp;
 
         // 1b. Always build an Ollama client. In local mode it owns the chat
         //     loop; in hybrid/llamacpp mode it still handles tool-local calls
         //     (e.g. future embedding-backed tools). The chat backend is
         //     selected separately below.
-        //     Sampling overrides only apply in local mode — in
-        //     hybrid/llamacpp the user's params belong to the alternate chat
-        //     client.
-        let apply_sampling_to_ollama = !describes_then_inlines;
+        //     Sampling overrides only apply when Ollama is the chat backend.
+        let apply_sampling_to_ollama = ollama_is_chat;
         let mut ollama_client = if let Some(ref model) = custom_model
-            && !describes_then_inlines
+            && ollama_is_chat
         {
             log::info!("Using custom model for agentic: {}", model);
             span.set_attribute(KeyValue::new("custom_model", model.clone()));
@@ -3632,7 +3816,7 @@ Return ONLY the summary, nothing else."#,
                 Some(model.clone()),
             )
         } else {
-            if !describes_then_inlines {
+            if ollama_is_chat {
                 span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone()));
             }
             self.ollama.clone()
@@ -3752,10 +3936,13 @@ Return ONLY the summary, nothing else."#,
         //      (OPENROUTER_ALLOWED_MODELS) — no live precheck. A bad model id
         //      surfaces as a chat-call error on the next step.
         let has_vision = if describes_then_inlines {
-            // In hybrid + llamacpp modes the chat model never sees images
-            // directly — we describe-then-inject, so `has_vision` drives only
-            // whether we bother loading the image to describe it, which we
-            // always do.
+            // Hybrid: chat model never sees images — describe-then-inject.
+            true
+        } else if local_via_llamacpp {
+            // llama-swap models receive images directly via OpenAI content
+            // parts. Capability probing isn't available (no `/api/show`),
+            // so assume vision support; a misconfigured model surfaces as
+            // a chat-call error.
             true
         } else {
             if let Some(ref model_name) = custom_model {
@@ -3935,10 +4122,9 @@ Return ONLY the summary, nothing else."#,
             None
         };
 
-        // describe-then-inline path. Vision describe routes through whichever
-        // `LLM_BACKEND` is configured — llama-swap when `local_via_llamacpp`
-        // is set (even in hybrid mode, since chat is OpenRouter but vision
-        // stays on the local stack), otherwise Ollama.
+        // describe-then-inline path (hybrid only). Vision describe routes
+        // through whichever local backend is configured — llama-swap when
+        // `local_via_llamacpp`, otherwise Ollama.
         let inlined_visual_description: Option<String> = if describes_then_inlines {
             match image_base64.as_deref() {
                 Some(b64) => {
@@ -4043,10 +4229,10 @@ Return ONLY the summary, nothing else."#,
         );
 
         // 10. Define tools. Gate flags computed from current data presence;
-        //     describe-then-inline modes (hybrid OR local_via_llamacpp) omit
-        //     describe_photo since the chat model receives the visual
-        //     description inline (so we pass `false` for has_vision in
-        //     those modes regardless of the model's actual capability).
+        //     hybrid mode omits describe_photo since the chat model receives
+        //     the visual description inline (so we pass `false` for
+        //     has_vision in that mode regardless of the model's actual
+        //     capability).
         let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines);
         let tools = Self::build_tool_definitions(gate_opts);
 
diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs
index d23fad5..2a03aed 100644
--- a/src/ai/llamacpp.rs
+++ b/src/ai/llamacpp.rs
@@ -50,9 +50,9 @@ pub struct LlamaCppClient {
     /// Embedding model slot id (e.g. `"embed"`). Used for
     /// `generate_embeddings`.
     pub embedding_model: String,
-    /// Vision model slot id (e.g. `"vision"`). Used for `describe_image`,
-    /// and the only slot that reports `has_vision = true` in capability
-    /// lookups (llama-swap's `/v1/models` doesn't surface modality).
+    /// Vision model slot id. Used for `describe_image` routing. Defaults
+    /// to `primary_model` so describe_image works out of the box; override
+    /// via `LLAMA_SWAP_VISION_MODEL` for a dedicated vision slot.
     pub vision_model: String,
     num_ctx: Option<i32>,
     temperature: Option<f32>,
@@ -67,6 +67,7 @@ impl LlamaCppClient {
             .ok()
             .and_then(|v| v.parse::<u64>().ok())
             .unwrap_or(DEFAULT_REQUEST_TIMEOUT_SECS);
+        let pm = primary_model.unwrap_or_else(|| DEFAULT_PRIMARY_MODEL.to_string());
         Self {
             client: Client::builder()
                 .connect_timeout(Duration::from_secs(10))
@@ -74,9 +75,9 @@ impl LlamaCppClient {
                 .build()
                 .unwrap_or_else(|_| Client::new()),
             base_url: base_url.unwrap_or_else(|| DEFAULT_BASE_URL.to_string()),
-            primary_model: primary_model.unwrap_or_else(|| DEFAULT_PRIMARY_MODEL.to_string()),
+            primary_model: pm.clone(),
             embedding_model: DEFAULT_EMBEDDING_MODEL.to_string(),
-            vision_model: DEFAULT_VISION_MODEL.to_string(),
+            vision_model: pm,
             num_ctx: None,
             temperature: None,
             top_p: None,
@@ -124,6 +125,13 @@ impl LlamaCppClient {
             let mut obj = serde_json::Map::new();
             obj.insert("role".into(), Value::String(msg.role.clone()));
 
+            // Assistant messages with tool_calls must emit `content: null`
+            // (not `""`) — some Jinja templates (Mistral-family) treat
+            // empty-string content as a regular message rather than a
+            // tool-calling turn, breaking role-alternation validation.
+            let has_tool_calls = msg.role == "assistant"
+                && msg.tool_calls.as_ref().is_some_and(|tcs| !tcs.is_empty());
+
             match &msg.images {
                 Some(images) if !images.is_empty() => {
                     let mut parts: Vec<Value> = Vec::new();
@@ -139,6 +147,9 @@ impl LlamaCppClient {
                     }
                     obj.insert("content".into(), Value::Array(parts));
                 }
+                _ if has_tool_calls && msg.content.is_empty() => {
+                    obj.insert("content".into(), Value::Null);
+                }
                 _ => {
                     obj.insert("content".into(), Value::String(msg.content.clone()));
                 }
@@ -276,6 +287,13 @@ impl LlamaCppClient {
         tools: Vec<Tool>,
     ) -> Result<(ChatMessage, Option<i32>, Option<i32>)> {
         let url = format!("{}/chat/completions", self.base_url);
+        let roles: Vec<&str> = messages.iter().map(|m| m.role.as_str()).collect();
+        log::debug!(
+            "llama-swap chat_completion: model={} roles={:?} tools={}",
+            model,
+            roles,
+            tools.len()
+        );
         let mut body = serde_json::Map::new();
         body.insert("model".into(), Value::String(model.to_string()));
         body.insert(
@@ -381,6 +399,13 @@ impl LlmClient for LlamaCppClient {
         tools: Vec<Tool>,
     ) -> Result<BoxStream<'static, Result<LlmStreamEvent>>> {
         let url = format!("{}/chat/completions", self.base_url);
+        let roles: Vec<&str> = messages.iter().map(|m| m.role.as_str()).collect();
+        log::debug!(
+            "llama-swap stream: model={} roles={:?} tools={}",
+            self.primary_model,
+            roles,
+            tools.len()
+        );
         let mut body = serde_json::Map::new();
         body.insert(
             "model".into(),
@@ -681,12 +706,12 @@ impl LlamaCppClient {
             .and_then(|v| v.as_str())
             .unwrap_or_default()
             .to_string();
-        let has_vision = name == self.vision_model;
-        // Tool calling is the default for llama-swap entries we configure
-        // (--jinja flag); no negative-list mechanism yet, so report true.
+        // llama-swap doesn't expose per-model modality flags. Assume all
+        // configured models support vision and tool calling; a model
+        // without multimodal support surfaces as a chat-call error.
         ModelCapabilities {
             name,
-            has_vision,
+            has_vision: true,
             has_tool_calling: true,
         }
     }
@@ -943,7 +968,7 @@ mod tests {
     }
 
     #[test]
-    fn capability_inference_marks_only_vision_slot() {
+    fn all_models_report_vision_capable() {
         let mut c = LlamaCppClient::new(None, Some("chat".into()));
         c.set_vision_model("vision".into());
 
@@ -955,9 +980,9 @@ mod tests {
         let vision = c.parse_model_capabilities(&m_vision);
         let other = c.parse_model_capabilities(&m_other);
 
-        assert!(!chat.has_vision);
+        assert!(chat.has_vision);
         assert!(chat.has_tool_calling);
         assert!(vision.has_vision);
-        assert!(!other.has_vision);
+        assert!(other.has_vision);
     }
 }
diff --git a/src/ai/mod.rs b/src/ai/mod.rs
index 8d634fd..09cbfda 100644
--- a/src/ai/mod.rs
+++ b/src/ai/mod.rs
@@ -1,4 +1,5 @@
 pub mod apollo_client;
+pub mod backend;
 pub mod clip_client;
 pub mod daily_summary_job;
 pub mod face_client;
-- 
2.49.1


From a8a661f70a8022f304851985575bc58193b9354d Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Sun, 24 May 2026 15:00:50 -0400
Subject: [PATCH 05/11] ai: extract ResolvedBackend, remove ~480 lines of
 duplicated dispatch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace 5 copies of the ~80-line backend resolution pattern with a
single InsightGenerator::resolve_backend() builder that returns a
ResolvedBackend (chat + local clients, BackendKind enum, images_inline
flag). Tool dispatch now takes &ResolvedBackend instead of
&OllamaClient + model + backend strings.

Remove duplicated ollama/openrouter/llamacpp fields from
InsightChatService — InsightGenerator owns them and resolve_backend
uses them. Delete build_chat_clients (replaced by resolve_backend).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/ai/insight_chat.rs      | 362 +++++++-----------------------
 src/ai/insight_generator.rs | 430 +++++++-----------------------------
 src/state.rs                |   6 -
 3 files changed, 158 insertions(+), 640 deletions(-)

diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs
index 54350df..6d52f8b 100644
--- a/src/ai/insight_chat.rs
+++ b/src/ai/insight_chat.rs
@@ -6,11 +6,9 @@ use std::collections::HashMap;
 use std::sync::{Arc, Mutex};
 use tokio::sync::Mutex as TokioMutex;
 
+use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides};
 use crate::ai::insight_generator::InsightGenerator;
-use crate::ai::llm_client::{ChatMessage, LlmClient, LlmStreamEvent, Tool};
-use crate::ai::ollama::OllamaClient;
-use crate::ai::llamacpp::LlamaCppClient;
-use crate::ai::openrouter::OpenRouterClient;
+use crate::ai::llm_client::{ChatMessage, LlmStreamEvent, Tool};
 use crate::database::InsightDao;
 use crate::database::models::InsertPhotoInsight;
 use crate::otel::global_tracer;
@@ -92,9 +90,6 @@ pub struct ChatTurnResult {
 #[derive(Clone)]
 pub struct InsightChatService {
     generator: Arc<InsightGenerator>,
-    ollama: OllamaClient,
-    openrouter: Option<Arc<OpenRouterClient>>,
-    llamacpp: Option<Arc<LlamaCppClient>>,
     insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
     chat_locks: ChatLockMap,
 }
@@ -102,17 +97,11 @@ pub struct InsightChatService {
 impl InsightChatService {
     pub fn new(
         generator: Arc<InsightGenerator>,
-        ollama: OllamaClient,
-        openrouter: Option<Arc<OpenRouterClient>>,
-        llamacpp: Option<Arc<LlamaCppClient>>,
         insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
         chat_locks: ChatLockMap,
     ) -> Self {
         Self {
             generator,
-            ollama,
-            openrouter,
-            llamacpp,
             insight_dao,
             chat_locks,
         }
@@ -308,16 +297,9 @@ impl InsightChatService {
             .filter(|s| !s.is_empty())
             .unwrap_or_else(|| stored_backend.clone());
         validate_cross_replay(&stored_backend, &effective_backend)?;
-        let is_hybrid = effective_backend == "hybrid";
-        let local_via_llamacpp =
-            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
-        let describes_then_inlines = is_hybrid;
-        span.set_attribute(KeyValue::new("backend", effective_backend.clone()));
+        let kind = BackendKind::parse(&effective_backend)?;
+        span.set_attribute(KeyValue::new("backend", kind.as_str()));
 
-        // 4. Build the chat backend client. Hybrid → OpenRouter; local with
-        //    `LLM_BACKEND=llamacpp` → llama-swap; otherwise Ollama. Clones
-        //    so per-request sampling/model overrides don't leak into shared
-        //    state.
         let max_iterations = req
             .max_iterations
             .unwrap_or(DEFAULT_MAX_ITERATIONS)
@@ -325,113 +307,36 @@ impl InsightChatService {
         span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64));
 
         let stored_model = insight.model_version.clone();
-        let custom_model = req
-            .model
-            .clone()
-            .or_else(|| Some(stored_model.clone()))
-            .filter(|m| !m.is_empty());
-
-        let mut ollama_client = self.ollama.clone();
-        let mut openrouter_client: Option<OpenRouterClient> = None;
-        let mut llamacpp_client: Option<LlamaCppClient> = None;
-
-        if is_hybrid {
-            let arc = self.openrouter.as_ref().ok_or_else(|| {
-                anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
-            })?;
-            let mut c: OpenRouterClient = (**arc).clone();
-            if let Some(ref m) = custom_model {
-                c.primary_model = m.clone();
-            }
-            if req.temperature.is_some()
-                || req.top_p.is_some()
-                || req.top_k.is_some()
-                || req.min_p.is_some()
-            {
-                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
-            }
-            if let Some(ctx) = req.num_ctx {
-                c.set_num_ctx(Some(ctx));
-            }
-            openrouter_client = Some(c);
-        } else if local_via_llamacpp {
-            let arc = self.llamacpp.as_ref().ok_or_else(|| {
-                anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured")
-            })?;
-            let mut c: LlamaCppClient = (**arc).clone();
-            if let Some(ref m) = custom_model {
-                c.primary_model = m.clone();
-            }
-            if req.temperature.is_some()
-                || req.top_p.is_some()
-                || req.top_k.is_some()
-                || req.min_p.is_some()
-            {
-                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
-            }
-            if let Some(ctx) = req.num_ctx {
-                c.set_num_ctx(Some(ctx));
-            }
-            llamacpp_client = Some(c);
-        } else {
-            // Pure local (Ollama): model swap. Build a new client when the
-            // chat model differs from the configured one.
-            if let Some(ref m) = custom_model
-                && m != &self.ollama.primary_model
-            {
-                ollama_client = OllamaClient::new(
-                    self.ollama.primary_url.clone(),
-                    self.ollama.fallback_url.clone(),
-                    m.clone(),
-                    Some(m.clone()),
-                );
-            }
-            if req.temperature.is_some()
-                || req.top_p.is_some()
-                || req.top_k.is_some()
-                || req.min_p.is_some()
-            {
-                ollama_client.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
-            }
-            if let Some(ctx) = req.num_ctx {
-                ollama_client.set_num_ctx(Some(ctx));
-            }
-        }
-
-        let chat_backend: &dyn LlmClient = if let Some(ref c) = llamacpp_client {
-            c
-        } else if let Some(ref c) = openrouter_client {
-            c
-        } else {
-            &ollama_client
+        let overrides = SamplingOverrides {
+            model: req.model.clone()
+                .or_else(|| Some(stored_model.clone()))
+                .filter(|m| !m.is_empty()),
+            num_ctx: req.num_ctx,
+            temperature: req.temperature,
+            top_p: req.top_p,
+            top_k: req.top_k,
+            min_p: req.min_p,
         };
-        let model_used = chat_backend.primary_model().to_string();
+        let backend = self.generator.resolve_backend(kind, &overrides).await?;
+        let model_used = backend.model().to_string();
         span.set_attribute(KeyValue::new("model", model_used.clone()));
 
-        // 5. Decide vision + tool set. In describe-then-inline mode
-        //    (hybrid only) we omit `describe_photo`. In local and llamacpp
-        //    we trust the stored history's first-user shape: if it carries
-        //    `images`, the original model was vision-capable, and we keep
-        //    `describe_photo` available.
+        // 5. Decide vision + tool set. In hybrid (describe-then-inline) mode
+        //    we omit `describe_photo`. Otherwise trust the stored history:
+        //    if the first user message carries images, describe_photo stays.
         let local_first_user_has_image = messages
             .iter()
             .find(|m| m.role == "user")
             .and_then(|m| m.images.as_ref())
             .map(|imgs| !imgs.is_empty())
             .unwrap_or(false);
-        let offer_describe_tool = !describes_then_inlines && local_first_user_has_image;
-        // current_gate_opts(has_vision) sets gate_opts.has_vision = has_vision
-        // and probes the per-table presence flags. Pass `offer_describe_tool`
-        // directly — the `!is_hybrid && local_first_user_has_image` decision
-        // is the chat-path's vision predicate.
+        let offer_describe_tool = backend.images_inline && local_first_user_has_image;
         let gate_opts = self.generator.current_gate_opts_for_persona(
             offer_describe_tool,
             Some((req.user_id, &active_persona)),
         );
         let tools = InsightGenerator::build_tool_definitions(gate_opts);
 
-        // Image base64 only needed when describe_photo is on the menu. Load
-        // lazily to avoid disk IO when the loop never invokes it.
         let image_base64: Option<String> = if offer_describe_tool {
             self.generator.load_image_as_base64(&normalized).ok()
         } else {
@@ -480,13 +385,13 @@ impl InsightChatService {
             iterations_used = iteration + 1;
             log::info!("Chat iteration {}/{}", iterations_used, max_iterations);
 
-            let (response, prompt_tokens, eval_tokens) = chat_backend
+            let (response, prompt_tokens, eval_tokens) = backend
+                .chat()
                 .chat_with_tools(messages.clone(), tools.clone())
                 .await?;
             last_prompt_eval_count = prompt_tokens;
             last_eval_count = eval_tokens;
 
-            // Ollama rejects non-object tool-call arguments on replay.
             let mut response = response;
             if let Some(ref mut tcs) = response.tool_calls {
                 for tc in tcs.iter_mut() {
@@ -514,13 +419,11 @@ impl InsightChatService {
                         .execute_tool(
                             &tool_call.function.name,
                             &tool_call.function.arguments,
-                            &ollama_client,
+                            &backend,
                             &image_base64,
                             &normalized,
                             req.user_id,
                             &active_persona,
-                            &model_used,
-                            &effective_backend,
                             &loop_cx,
                         )
                         .await;
@@ -534,8 +437,6 @@ impl InsightChatService {
         }
 
         if final_content.is_empty() {
-            // The model never produced a final answer; ask once more without
-            // tools to force a textual reply.
             log::info!(
                 "Chat loop exhausted after {} iterations, requesting final answer",
                 iterations_used
@@ -543,7 +444,8 @@ impl InsightChatService {
             messages.push(ChatMessage::user(
                 "Please write your final answer now without calling any more tools.",
             ));
-            let (final_response, prompt_tokens, eval_tokens) = chat_backend
+            let (final_response, prompt_tokens, eval_tokens) = backend
+                .chat()
                 .chat_with_tools(messages.clone(), vec![])
                 .await?;
             last_prompt_eval_count = prompt_tokens;
@@ -579,7 +481,8 @@ impl InsightChatService {
                  Capture the key moment or theme. Return ONLY the title, nothing else.",
                 final_content
             );
-            let title_raw = chat_backend
+            let title_raw = backend
+                .chat()
                 .generate(
                     &title_prompt,
                     Some(
@@ -604,7 +507,7 @@ impl InsightChatService {
                 model_version: model_used.clone(),
                 is_current: true,
                 training_messages: Some(json),
-                backend: effective_backend.clone(),
+                backend: kind.as_str().to_string(),
                 fewshot_source_ids: None,
                 content_hash: None,
             };
@@ -629,7 +532,7 @@ impl InsightChatService {
             prompt_eval_count: last_prompt_eval_count,
             eval_count: last_eval_count,
             amended_insight_id,
-            backend_used: effective_backend,
+            backend_used: kind.as_str().to_string(),
             model_used,
         })
     }
@@ -818,9 +721,8 @@ impl InsightChatService {
             .map(|s| s.trim().to_lowercase())
             .filter(|s| !s.is_empty())
             .unwrap_or_else(|| stored_backend.clone());
-        validate_cross_replay(&stored_backend, &effective_backend)?;
-        let is_hybrid = effective_backend == "hybrid";
-        let describes_then_inlines = is_hybrid;
+        let kind = BackendKind::parse(&effective_backend)?;
+        validate_cross_replay(&stored_backend, kind.as_str())?;
 
         let max_iterations = req
             .max_iterations
@@ -828,18 +730,20 @@ impl InsightChatService {
             .clamp(1, env_max_iterations());
 
         let stored_model = insight.model_version.clone();
-        let custom_model = req
-            .model
-            .clone()
-            .or_else(|| Some(stored_model.clone()))
-            .filter(|m| !m.is_empty());
+        let overrides = SamplingOverrides {
+            model: req.model.clone()
+                .or_else(|| Some(stored_model.clone()))
+                .filter(|m| !m.is_empty()),
+            num_ctx: req.num_ctx,
+            temperature: req.temperature,
+            top_p: req.top_p,
+            top_k: req.top_k,
+            min_p: req.min_p,
+        };
+        let backend = self.generator.resolve_backend(kind, &overrides).await?;
+        let model_used = backend.model().to_string();
 
-        let (chat_backend_holder, ollama_client) =
-            self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?;
-        let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
-        let model_used = chat_backend.primary_model().to_string();
-
-        // Tool set — local/llamacpp mode + first user turn carries an image →
+        // Tool set — images_inline mode + first user turn carries an image →
         // offer describe_photo. Describe-then-inline mode (hybrid only):
         // visual description was inlined at bootstrap, no describe tool needed.
         let local_first_user_has_image = messages
@@ -848,7 +752,7 @@ impl InsightChatService {
             .and_then(|m| m.images.as_ref())
             .map(|imgs| !imgs.is_empty())
             .unwrap_or(false);
-        let offer_describe_tool = !describes_then_inlines && local_first_user_has_image;
+        let offer_describe_tool = backend.images_inline && local_first_user_has_image;
         let gate_opts = self.generator.current_gate_opts_for_persona(
             offer_describe_tool,
             Some((req.user_id, &active_persona)),
@@ -879,16 +783,13 @@ impl InsightChatService {
 
         let outcome = self
             .run_streaming_agentic_loop(
-                chat_backend,
-                &ollama_client,
+                &backend,
                 &mut messages,
                 tools,
                 &image_base64,
                 &normalized,
                 req.user_id,
                 &active_persona,
-                &model_used,
-                &effective_backend,
                 max_iterations,
                 &tx,
             )
@@ -916,7 +817,7 @@ impl InsightChatService {
 
         let mut amended_insight_id: Option<i32> = None;
         if req.amend {
-            let title = self.generate_title(chat_backend, &final_content).await?;
+            let title = self.generate_title(&backend, &final_content).await?;
 
             // Amended rows intentionally do not inherit the parent's
             // `fewshot_source_ids`. The parent's few-shot influence is still
@@ -932,7 +833,7 @@ impl InsightChatService {
                 model_version: model_used.clone(),
                 is_current: true,
                 training_messages: Some(json),
-                backend: effective_backend.clone(),
+                backend: kind.as_str().to_string(),
                 fewshot_source_ids: None,
                 content_hash: None,
             };
@@ -958,7 +859,7 @@ impl InsightChatService {
                 eval_tokens: last_eval_count,
                 num_ctx: req.num_ctx,
                 amended_insight_id,
-                backend_used: effective_backend,
+                backend_used: kind.as_str().to_string(),
                 model_used,
             })
             .await;
@@ -984,21 +885,23 @@ impl InsightChatService {
             .filter(|s| !s.trim().is_empty())
             .unwrap_or_else(|| "default".to_string());
         let effective_backend = resolve_bootstrap_backend(req.backend.as_deref())?;
-        let is_hybrid = effective_backend == "hybrid";
-        let local_via_llamacpp =
-            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
-        let describes_then_inlines = is_hybrid;
+        let kind = BackendKind::parse(&effective_backend)?;
 
         let max_iterations = req
             .max_iterations
             .unwrap_or(DEFAULT_MAX_ITERATIONS)
             .clamp(1, env_max_iterations());
 
-        let custom_model = req.model.clone().filter(|m| !m.is_empty());
-        let (chat_backend_holder, ollama_client) =
-            self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?;
-        let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
-        let model_used = chat_backend.primary_model().to_string();
+        let overrides = SamplingOverrides {
+            model: req.model.clone().filter(|m| !m.is_empty()),
+            num_ctx: req.num_ctx,
+            temperature: req.temperature,
+            top_p: req.top_p,
+            top_k: req.top_k,
+            min_p: req.min_p,
+        };
+        let backend = self.generator.resolve_backend(kind, &overrides).await?;
+        let model_used = backend.model().to_string();
 
         // Load image bytes once. RAW preview fallback is handled inside
         // load_image_as_base64. Errors degrade silently — a chat that
@@ -1020,26 +923,17 @@ impl InsightChatService {
             });
 
         // Describe-then-inline (hybrid only): pre-describe the image so a
-        // text-only chat model gets the visual description inline. llamacpp
-        // sends images directly to the chat model.
-        let visual_block = if describes_then_inlines {
+        // text-only chat model gets the visual description inline.
+        // images_inline backends send images directly to the chat model.
+        let visual_block = if !backend.images_inline {
             match image_base64.as_deref() {
                 Some(b64) => {
-                    let described = if local_via_llamacpp {
-                        self.llamacpp
-                            .as_ref()
-                            .expect("local_via_llamacpp guarantees Some")
-                            .describe_image(b64)
-                            .await
-                    } else {
-                        self.ollama.describe_image(b64).await
-                    };
-                    match described {
+                    match backend.local().describe_image(b64).await {
                         Ok(desc) => {
                             format!("Visual description (from local vision model):\n{}\n", desc)
                         }
                         Err(e) => {
-                            log::warn!("{} bootstrap: describe_image failed: {}", effective_backend, e);
+                            log::warn!("{} bootstrap: describe_image failed: {}", kind.as_str(), e);
                             String::new()
                         }
                     }
@@ -1050,10 +944,10 @@ impl InsightChatService {
             String::new()
         };
 
-        // Tool gates. Local + image present → expose describe_photo so
-        // the chat model can re-look at the photo on demand. Hybrid:
+        // Tool gates. images_inline + image present → expose describe_photo so
+        // the chat model can re-look at the photo on demand. Non-inline:
         // already inlined, no tool needed.
-        let offer_describe_tool = !describes_then_inlines && image_base64.is_some();
+        let offer_describe_tool = backend.images_inline && image_base64.is_some();
         let gate_opts = self.generator.current_gate_opts_for_persona(
             offer_describe_tool,
             Some((req.user_id, &active_persona)),
@@ -1079,23 +973,22 @@ impl InsightChatService {
         );
         let system_msg = ChatMessage::system(system_content);
         let mut user_msg = ChatMessage::user(req.user_message.clone());
-        if !describes_then_inlines && let Some(ref img) = image_base64 {
-            user_msg.images = Some(vec![img.clone()]);
+        if backend.images_inline {
+            if let Some(ref img) = image_base64 {
+                user_msg.images = Some(vec![img.clone()]);
+            }
         }
         let mut messages = vec![system_msg, user_msg];
 
         let outcome = self
             .run_streaming_agentic_loop(
-                chat_backend,
-                &ollama_client,
+                &backend,
                 &mut messages,
                 tools,
                 &image_base64,
                 &normalized,
                 req.user_id,
                 &active_persona,
-                &model_used,
-                &effective_backend,
                 max_iterations,
                 &tx,
             )
@@ -1108,7 +1001,7 @@ impl InsightChatService {
             final_content,
         } = outcome;
 
-        let title = self.generate_title(chat_backend, &final_content).await?;
+        let title = self.generate_title(&backend, &final_content).await?;
 
         let json = serde_json::to_string(&messages)
             .map_err(|e| anyhow!("failed to serialize chat history: {}", e))?;
@@ -1121,7 +1014,7 @@ impl InsightChatService {
             model_version: model_used.clone(),
             is_current: true,
             training_messages: Some(json),
-            backend: effective_backend.clone(),
+            backend: kind.as_str().to_string(),
             fewshot_source_ids: None,
             content_hash: None,
         };
@@ -1144,7 +1037,7 @@ impl InsightChatService {
                 eval_tokens: last_eval_count,
                 num_ctx: req.num_ctx,
                 amended_insight_id: Some(stored.id),
-                backend_used: effective_backend,
+                backend_used: kind.as_str().to_string(),
                 model_used,
             })
             .await;
@@ -1152,95 +1045,12 @@ impl InsightChatService {
         Ok(())
     }
 
-    /// Set up chat clients (Ollama + optional OpenRouter / LlamaCpp) shared
-    /// by bootstrap and continuation. Returns the chat-side backend client
-    /// (boxed because each backend has a different concrete type) and the
-    /// Ollama client used for describe-image / local tool calls.
-    ///
-    /// `effective_backend` must be one of `"local"` or `"hybrid"` (validated
-    /// upstream). Hybrid → OpenRouter; local with `LLM_BACKEND=llamacpp` →
-    /// llama-swap; pure local → Ollama. Returns the dispatched chat client
-    /// plus the (possibly per-request) Ollama client that the caller uses
-    /// for non-chat helpers (image describe in non-llamacpp mode, tool ops).
-    fn build_chat_clients(
-        &self,
-        effective_backend: &str,
-        custom_model: Option<&str>,
-        req: &ChatTurnRequest,
-    ) -> Result<(Box<dyn LlmClient>, OllamaClient)> {
-        let mut ollama_client = self.ollama.clone();
-
-        if effective_backend == "hybrid" {
-            let arc = self.openrouter.as_ref().ok_or_else(|| {
-                anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
-            })?;
-            let mut c: OpenRouterClient = (**arc).clone();
-            if let Some(m) = custom_model {
-                c.primary_model = m.to_string();
-            }
-            if req.temperature.is_some()
-                || req.top_p.is_some()
-                || req.top_k.is_some()
-                || req.min_p.is_some()
-            {
-                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
-            }
-            if let Some(ctx) = req.num_ctx {
-                c.set_num_ctx(Some(ctx));
-            }
-            return Ok((Box::new(c), ollama_client));
-        }
-
-        // Local mode — env switch decides between Ollama and llama-swap.
-        if crate::ai::local_backend_is_llamacpp()
-            && let Some(arc) = self.llamacpp.as_ref()
-        {
-            let mut c: LlamaCppClient = (**arc).clone();
-            if let Some(m) = custom_model {
-                c.primary_model = m.to_string();
-            }
-            if req.temperature.is_some()
-                || req.top_p.is_some()
-                || req.top_k.is_some()
-                || req.min_p.is_some()
-            {
-                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
-            }
-            if let Some(ctx) = req.num_ctx {
-                c.set_num_ctx(Some(ctx));
-            }
-            return Ok((Box::new(c), ollama_client));
-        }
-
-        if let Some(m) = custom_model
-            && m != self.ollama.primary_model
-        {
-            ollama_client = OllamaClient::new(
-                self.ollama.primary_url.clone(),
-                self.ollama.fallback_url.clone(),
-                m.to_string(),
-                Some(m.to_string()),
-            );
-        }
-        if req.temperature.is_some()
-            || req.top_p.is_some()
-            || req.top_k.is_some()
-            || req.min_p.is_some()
-        {
-            ollama_client.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
-        }
-        if let Some(ctx) = req.num_ctx {
-            ollama_client.set_num_ctx(Some(ctx));
-        }
-        Ok((Box::new(ollama_client.clone()), ollama_client))
-    }
-
     /// Generate a short title via the same chat backend so voice stays
     /// consistent with the body. Mirrors generate_agentic_insight_for_photo's
     /// titling pass.
     async fn generate_title(
         &self,
-        chat_backend: &dyn LlmClient,
+        backend: &ResolvedBackend,
         final_content: &str,
     ) -> Result<String> {
         let title_prompt = format!(
@@ -1248,7 +1058,8 @@ impl InsightChatService {
              Capture the key moment or theme. Return ONLY the title, nothing else.",
             final_content
         );
-        let title_raw = chat_backend
+        let title_raw = backend
+            .chat()
             .generate(
                 &title_prompt,
                 Some(
@@ -1266,18 +1077,13 @@ impl InsightChatService {
     /// final assistant content.
     async fn run_streaming_agentic_loop(
         &self,
-        chat_backend: &dyn LlmClient,
-        ollama_client: &OllamaClient,
+        backend: &ResolvedBackend,
         messages: &mut Vec<ChatMessage>,
         tools: Vec<Tool>,
         image_base64: &Option<String>,
         normalized: &str,
         user_id: i32,
         active_persona: &str,
-        // Provenance — stamped onto any store_fact tool call made
-        // during this loop. Mirrors the non-streaming chat path.
-        model_used: &str,
-        effective_backend: &str,
         max_iterations: usize,
         tx: &tokio::sync::mpsc::Sender<ChatStreamEvent>,
     ) -> Result<AgenticLoopOutcome> {
@@ -1296,7 +1102,8 @@ impl InsightChatService {
                 })
                 .await;
 
-            let mut stream = chat_backend
+            let mut stream = backend
+                .chat()
                 .chat_with_tools_stream(messages.clone(), tools.clone())
                 .await?;
 
@@ -1353,13 +1160,11 @@ impl InsightChatService {
                         .execute_tool(
                             &tool_call.function.name,
                             &tool_call.function.arguments,
-                            ollama_client,
+                            backend,
                             image_base64,
                             normalized,
                             user_id,
                             active_persona,
-                            model_used,
-                            effective_backend,
                             &cx,
                         )
                         .await;
@@ -1394,7 +1199,8 @@ impl InsightChatService {
             messages.push(ChatMessage::user(
                 "Please write your final answer now without calling any more tools.",
             ));
-            let mut stream = chat_backend
+            let mut stream = backend
+                .chat()
                 .chat_with_tools_stream(messages.clone(), vec![])
                 .await?;
             let mut final_message: Option<ChatMessage> = None;
diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs
index 075001a..f95c6dc 100644
--- a/src/ai/insight_generator.rs
+++ b/src/ai/insight_generator.rs
@@ -1594,29 +1594,24 @@ Return ONLY the summary, nothing else."#,
         &self,
         tool_name: &str,
         arguments: &serde_json::Value,
-        ollama: &OllamaClient,
+        backend: &ResolvedBackend,
         image_base64: &Option<String>,
         file_path: &str,
         user_id: i32,
         persona_id: &str,
-        // Provenance — written into entity_facts.created_by_* when
-        // the loop calls store_fact. The caller knows the actual
-        // chat-runtime model and backend (which may differ from
-        // ollama.primary_model in hybrid mode where chat lives on
-        // OpenRouter while Ollama still handles vision).
-        model: &str,
-        backend: &str,
         cx: &opentelemetry::Context,
     ) -> String {
+        let model = backend.model();
+        let backend_label = backend.kind.as_str();
         let result = match tool_name {
-            "search_rag" => self.tool_search_rag(arguments, ollama, cx).await,
+            "search_rag" => self.tool_search_rag(arguments, backend.local(), cx).await,
             "search_messages" => self.tool_search_messages(arguments, cx).await,
             "get_sms_messages" => self.tool_get_sms_messages(arguments, cx).await,
             "get_calendar_events" => self.tool_get_calendar_events(arguments, cx).await,
             "get_location_history" => self.tool_get_location_history(arguments, cx).await,
             "get_file_tags" => self.tool_get_file_tags(arguments, cx).await,
             "get_faces_in_photo" => self.tool_get_faces_in_photo(arguments, cx).await,
-            "describe_photo" => self.tool_describe_photo(ollama, image_base64).await,
+            "describe_photo" => self.tool_describe_photo(backend.local(), image_base64).await,
             "reverse_geocode" => self.tool_reverse_geocode(arguments).await,
             "get_personal_place_at" => self.tool_get_personal_place_at(arguments).await,
             "recall_entities" => self.tool_recall_entities(arguments, cx).await,
@@ -1624,19 +1619,19 @@ Return ONLY the summary, nothing else."#,
                 self.tool_recall_facts_for_photo(arguments, user_id, persona_id, cx)
                     .await
             }
-            "store_entity" => self.tool_store_entity(arguments, ollama, cx).await,
+            "store_entity" => self.tool_store_entity(arguments, cx).await,
             "store_fact" => {
                 self.tool_store_fact(
-                    arguments, file_path, user_id, persona_id, model, backend, cx,
+                    arguments, file_path, user_id, persona_id, model, backend_label, cx,
                 )
                 .await
             }
             "update_fact" => {
-                self.tool_update_fact(arguments, user_id, persona_id, model, backend, cx)
+                self.tool_update_fact(arguments, user_id, persona_id, model, backend_label, cx)
                     .await
             }
             "supersede_fact" => {
-                self.tool_supersede_fact(arguments, user_id, persona_id, model, backend, cx)
+                self.tool_supersede_fact(arguments, user_id, persona_id, model, backend_label, cx)
                     .await
             }
             "get_current_datetime" => Self::tool_get_current_datetime(),
@@ -1654,7 +1649,7 @@ Return ONLY the summary, nothing else."#,
     async fn tool_search_rag(
         &self,
         args: &serde_json::Value,
-        ollama: &OllamaClient,
+        local: &dyn LlmClient,
         _cx: &opentelemetry::Context,
     ) -> String {
         let query = match args.get("query").and_then(|v| v.as_str()) {
@@ -1718,7 +1713,7 @@ Return ONLY the summary, nothing else."#,
         };
 
         let final_results = if rerank_enabled && results.len() > limit {
-            match self.rerank_with_llm(&query, &results, limit, ollama).await {
+            match self.rerank_with_llm(&query, &results, limit, local).await {
                 Ok(reordered) => reordered,
                 Err(e) => {
                     log::warn!("rerank failed, using vector order: {}", e);
@@ -1744,7 +1739,7 @@ Return ONLY the summary, nothing else."#,
         query: &str,
         candidates: &[String],
         limit: usize,
-        ollama: &OllamaClient,
+        local: &dyn LlmClient,
     ) -> Result<Vec<String>> {
         let query_preview: String = query.chars().take(60).collect();
         log::info!(
@@ -1785,15 +1780,7 @@ Return ONLY the summary, nothing else."#,
         let system = Some(
             "You are a terse relevance ranker. You output only numbers separated by commas.",
         );
-        let response = if crate::ai::local_backend_is_llamacpp() {
-            if let Some(ref lc) = self.llamacpp {
-                lc.generate(&prompt, system, None).await?
-            } else {
-                ollama.generate_no_think(&prompt, system).await?
-            }
-        } else {
-            ollama.generate_no_think(&prompt, system).await?
-        };
+        let response = local.generate(&prompt, system, None).await?;
         log::info!(
             "rerank: finished in {} ms (prompt={} chars)",
             started.elapsed().as_millis(),
@@ -2365,31 +2352,17 @@ Return ONLY the summary, nothing else."#,
         out
     }
 
-    /// Tool: describe_photo — generate a visual description of the photo.
-    /// Routes through llama-swap when `LLM_BACKEND=llamacpp`, Ollama otherwise.
     async fn tool_describe_photo(
         &self,
-        ollama: &OllamaClient,
+        local: &dyn LlmClient,
         image_base64: &Option<String>,
     ) -> String {
         log::info!("tool_describe_photo: generating visual description");
-
         match image_base64 {
-            Some(img) => {
-                let result = if crate::ai::local_backend_is_llamacpp() {
-                    if let Some(ref lc) = self.llamacpp {
-                        lc.describe_image(img).await
-                    } else {
-                        ollama.generate_photo_description(img).await
-                    }
-                } else {
-                    ollama.generate_photo_description(img).await
-                };
-                match result {
-                    Ok(desc) => desc,
-                    Err(e) => format!("Error describing photo: {}", e),
-                }
-            }
+            Some(img) => match local.describe_image(img).await {
+                Ok(desc) => desc,
+                Err(e) => format!("Error describing photo: {}", e),
+            },
             None => "No image available for description.".to_string(),
         }
     }
@@ -2635,7 +2608,6 @@ Return ONLY the summary, nothing else."#,
     async fn tool_store_entity(
         &self,
         args: &serde_json::Value,
-        _ollama: &OllamaClient,
         cx: &opentelemetry::Context,
     ) -> String {
         use crate::database::models::InsertEntity;
@@ -3775,243 +3747,25 @@ Return ONLY the summary, nothing else."#,
         span.set_attribute(KeyValue::new("file_path", file_path.clone()));
         span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64));
 
-        // 1a. Resolve backend label (defaults to "local").
-        let backend_label = backend
-            .as_deref()
-            .map(|s| s.trim().to_lowercase())
-            .filter(|s| !s.is_empty())
-            .unwrap_or_else(|| "local".to_string());
-        if !matches!(backend_label.as_str(), "local" | "hybrid") {
-            return Err(anyhow::anyhow!(
-                "unknown backend '{}'; expected 'local' or 'hybrid'",
-                backend_label
-            ));
-        }
-        span.set_attribute(KeyValue::new("backend", backend_label.clone()));
-        let is_hybrid = backend_label == "hybrid";
-        // `LLM_BACKEND=llamacpp` swaps Ollama out for llama-swap as the
-        // "local" stack — chat + embeddings route through llama-swap.
-        // llamacpp models receive images directly (vision-capable); only
-        // hybrid mode (OpenRouter chat) uses describe-then-inline.
-        let local_via_llamacpp =
-            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
-        let describes_then_inlines = is_hybrid;
-        let ollama_is_chat = !is_hybrid && !local_via_llamacpp;
-
-        // 1b. Always build an Ollama client. In local mode it owns the chat
-        //     loop; in hybrid/llamacpp mode it still handles tool-local calls
-        //     (e.g. future embedding-backed tools). The chat backend is
-        //     selected separately below.
-        //     Sampling overrides only apply when Ollama is the chat backend.
-        let apply_sampling_to_ollama = ollama_is_chat;
-        let mut ollama_client = if let Some(ref model) = custom_model
-            && ollama_is_chat
-        {
-            log::info!("Using custom model for agentic: {}", model);
-            span.set_attribute(KeyValue::new("custom_model", model.clone()));
-            OllamaClient::new(
-                self.ollama.primary_url.clone(),
-                self.ollama.fallback_url.clone(),
-                model.clone(),
-                Some(model.clone()),
-            )
-        } else {
-            if ollama_is_chat {
-                span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone()));
-            }
-            self.ollama.clone()
-        };
-
-        if apply_sampling_to_ollama {
-            if let Some(ctx) = num_ctx {
-                log::info!("Using custom context size: {}", ctx);
-                span.set_attribute(KeyValue::new("num_ctx", ctx as i64));
-                ollama_client.set_num_ctx(Some(ctx));
-            }
-
-            if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() {
-                log::info!(
-                    "Using sampling params — temperature: {:?}, top_p: {:?}, top_k: {:?}, min_p: {:?}",
-                    temperature,
-                    top_p,
-                    top_k,
-                    min_p
-                );
-                if let Some(t) = temperature {
-                    span.set_attribute(KeyValue::new("temperature", t as f64));
-                }
-                if let Some(p) = top_p {
-                    span.set_attribute(KeyValue::new("top_p", p as f64));
-                }
-                if let Some(k) = top_k {
-                    span.set_attribute(KeyValue::new("top_k", k as i64));
-                }
-                if let Some(m) = min_p {
-                    span.set_attribute(KeyValue::new("min_p", m as f64));
-                }
-                ollama_client.set_sampling_params(temperature, top_p, top_k, min_p);
-            }
-        }
-
-        // 1c. In hybrid mode, clone the configured OpenRouter client and
-        //     apply per-request overrides.
-        let openrouter_client: Option<OpenRouterClient> = if is_hybrid {
-            let arc = self.openrouter.as_ref().ok_or_else(|| {
-                anyhow::anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
-            })?;
-            let mut c: OpenRouterClient = (**arc).clone();
-            if let Some(ref m) = custom_model {
-                c.primary_model = m.clone();
-                span.set_attribute(KeyValue::new("custom_model", m.clone()));
-            }
-            span.set_attribute(KeyValue::new("openrouter_model", c.primary_model.clone()));
-            if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() {
-                if let Some(t) = temperature {
-                    span.set_attribute(KeyValue::new("temperature", t as f64));
-                }
-                if let Some(p) = top_p {
-                    span.set_attribute(KeyValue::new("top_p", p as f64));
-                }
-                if let Some(k) = top_k {
-                    span.set_attribute(KeyValue::new("top_k", k as i64));
-                }
-                if let Some(m) = min_p {
-                    span.set_attribute(KeyValue::new("min_p", m as f64));
-                }
-                c.set_sampling_params(temperature, top_p, top_k, min_p);
-            }
-            if let Some(ctx) = num_ctx {
-                span.set_attribute(KeyValue::new("num_ctx", ctx as i64));
-                c.set_num_ctx(Some(ctx));
-            }
-            Some(c)
-        } else {
-            None
-        };
-
-        // 1d. When `LLM_BACKEND=llamacpp` and we're in local mode (not
-        //     hybrid — hybrid keeps chat on OpenRouter), clone the llamacpp
-        //     client and apply per-request overrides. Same shape as the
-        //     openrouter branch above; describe_image will route through
-        //     the vision slot configured on the client.
-        let llamacpp_client: Option<LlamaCppClient> = if local_via_llamacpp && !is_hybrid {
-            let arc = self.llamacpp.as_ref().ok_or_else(|| {
-                anyhow::anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured")
-            })?;
-            let mut c: LlamaCppClient = (**arc).clone();
-            if let Some(ref m) = custom_model {
-                c.primary_model = m.clone();
-                span.set_attribute(KeyValue::new("custom_model", m.clone()));
-            }
-            span.set_attribute(KeyValue::new("llamacpp_model", c.primary_model.clone()));
-            if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() {
-                if let Some(t) = temperature {
-                    span.set_attribute(KeyValue::new("temperature", t as f64));
-                }
-                if let Some(p) = top_p {
-                    span.set_attribute(KeyValue::new("top_p", p as f64));
-                }
-                if let Some(k) = top_k {
-                    span.set_attribute(KeyValue::new("top_k", k as i64));
-                }
-                if let Some(m) = min_p {
-                    span.set_attribute(KeyValue::new("min_p", m as f64));
-                }
-                c.set_sampling_params(temperature, top_p, top_k, min_p);
-            }
-            if let Some(ctx) = num_ctx {
-                span.set_attribute(KeyValue::new("num_ctx", ctx as i64));
-                c.set_num_ctx(Some(ctx));
-            }
-            Some(c)
-        } else {
-            None
+        // 1. Resolve backend + build clients.
+        let kind = BackendKind::parse(
+            backend.as_deref().unwrap_or("local"),
+        )?;
+        span.set_attribute(KeyValue::new("backend", kind.as_str()));
+        let overrides = SamplingOverrides {
+            model: custom_model,
+            num_ctx,
+            temperature,
+            top_p,
+            top_k,
+            min_p,
         };
+        let backend = self.resolve_backend(kind, &overrides).await?;
+        span.set_attribute(KeyValue::new("model", backend.model().to_string()));
+        span.set_attribute(KeyValue::new("images_inline", backend.images_inline));
 
         let insight_cx = current_cx.with_span(span);
 
-        // 2. Verify chat model supports tool calling.
-        //    - local: existing Ollama model availability + capability check.
-        //    - hybrid: trust the operator's curated allowlist
-        //      (OPENROUTER_ALLOWED_MODELS) — no live precheck. A bad model id
-        //      surfaces as a chat-call error on the next step.
-        let has_vision = if describes_then_inlines {
-            // Hybrid: chat model never sees images — describe-then-inject.
-            true
-        } else if local_via_llamacpp {
-            // llama-swap models receive images directly via OpenAI content
-            // parts. Capability probing isn't available (no `/api/show`),
-            // so assume vision support; a misconfigured model surfaces as
-            // a chat-call error.
-            true
-        } else {
-            if let Some(ref model_name) = custom_model {
-                let available_on_primary =
-                    OllamaClient::is_model_available(&ollama_client.primary_url, model_name)
-                        .await
-                        .unwrap_or(false);
-
-                let available_on_fallback =
-                    if let Some(ref fallback_url) = ollama_client.fallback_url {
-                        OllamaClient::is_model_available(fallback_url, model_name)
-                            .await
-                            .unwrap_or(false)
-                    } else {
-                        false
-                    };
-
-                if !available_on_primary && !available_on_fallback {
-                    anyhow::bail!(
-                        "model not available: '{}' not found on any configured server",
-                        model_name
-                    );
-                }
-            }
-
-            let model_name_for_caps = &ollama_client.primary_model;
-            let capabilities = match OllamaClient::check_model_capabilities(
-                &ollama_client.primary_url,
-                model_name_for_caps,
-            )
-            .await
-            {
-                Ok(caps) => caps,
-                Err(_) => {
-                    let fallback_url = ollama_client.fallback_url.as_deref().ok_or_else(|| {
-                        anyhow::anyhow!(
-                            "Failed to check model capabilities for '{}': model not found on primary server and no fallback configured",
-                            model_name_for_caps
-                        )
-                    })?;
-                    OllamaClient::check_model_capabilities(fallback_url, model_name_for_caps)
-                        .await
-                        .map_err(|e| {
-                            anyhow::anyhow!(
-                                "Failed to check model capabilities for '{}': {}",
-                                model_name_for_caps,
-                                e
-                            )
-                        })?
-                }
-            };
-
-            if !capabilities.has_tool_calling {
-                return Err(anyhow::anyhow!(
-                    "tool calling not supported by model '{}'",
-                    ollama_client.primary_model
-                ));
-            }
-
-            insight_cx
-                .span()
-                .set_attribute(KeyValue::new("model_has_vision", capabilities.has_vision));
-            insight_cx
-                .span()
-                .set_attribute(KeyValue::new("model_has_tool_calling", true));
-
-            capabilities.has_vision
-        };
-
         // 3. Fetch EXIF
         let exif = {
             let mut exif_dao = self.exif_dao.lock().expect("Unable to lock ExifDao");
@@ -4103,60 +3857,33 @@ Return ONLY the summary, nothing else."#,
             }
         };
 
-        // 7. Load image if vision capable.
-        //    In hybrid mode we ALSO describe it locally now so the
-        //    description can be inlined as text — the OpenRouter chat model
-        //    never receives the base64 image directly.
-        let image_base64 = if has_vision {
-            match self.load_image_as_base64(&file_path) {
-                Ok(b64) => {
-                    log::info!("Loaded image for vision-capable agentic model");
-                    Some(b64)
-                }
-                Err(e) => {
-                    log::warn!("Failed to load image for agentic vision: {}", e);
-                    None
-                }
+        // 7. Load image. Always attempted — vision-capable models get the
+        //    base64 inline; hybrid mode describes it locally and injects text.
+        let image_base64 = match self.load_image_as_base64(&file_path) {
+            Ok(b64) => {
+                log::info!("Loaded image for agentic model");
+                Some(b64)
+            }
+            Err(e) => {
+                log::warn!("Failed to load image for agentic: {}", e);
+                None
             }
-        } else {
-            None
         };
 
-        // describe-then-inline path (hybrid only). Vision describe routes
-        // through whichever local backend is configured — llama-swap when
-        // `local_via_llamacpp`, otherwise Ollama.
-        let inlined_visual_description: Option<String> = if describes_then_inlines {
+        // Describe-then-inline (hybrid only). Vision describe routes through
+        // the local backend so non-text work stays off OpenRouter.
+        let inlined_visual_description: Option<String> = if !backend.images_inline {
             match image_base64.as_deref() {
-                Some(b64) => {
-                    let described = if local_via_llamacpp {
-                        self.llamacpp
-                            .as_ref()
-                            .expect("local_via_llamacpp guarantees Some")
-                            .describe_image(b64)
-                            .await
-                    } else {
-                        self.ollama.describe_image(b64).await
-                    };
-
-                    match described {
-                        Ok(desc) => {
-                            log::info!(
-                                "{}: vision describe succeeded ({} chars)",
-                                backend_label,
-                                desc.len()
-                            );
-                            Some(desc)
-                        }
-                        Err(e) => {
-                            log::warn!(
-                                "{}: vision describe failed, continuing without: {}",
-                                backend_label,
-                                e
-                            );
-                            None
-                        }
+                Some(b64) => match backend.local().describe_image(b64).await {
+                    Ok(desc) => {
+                        log::info!("{}: vision describe succeeded ({} chars)", kind, desc.len());
+                        Some(desc)
                     }
-                }
+                    Err(e) => {
+                        log::warn!("{}: vision describe failed, continuing without: {}", kind, e);
+                        None
+                    }
+                },
                 None => None,
             }
         } else {
@@ -4228,34 +3955,24 @@ Return ONLY the summary, nothing else."#,
             date = date_taken.format("%B %d, %Y"),
         );
 
-        // 10. Define tools. Gate flags computed from current data presence;
-        //     hybrid mode omits describe_photo since the chat model receives
-        //     the visual description inline (so we pass `false` for
-        //     has_vision in that mode regardless of the model's actual
-        //     capability).
-        let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines);
+        // 10. Define tools. describe_photo offered only when the chat model
+        //     sees images directly (images_inline); in hybrid mode the visual
+        //     description is already inlined as text.
+        let gate_opts = self.current_gate_opts(backend.images_inline);
         let tools = Self::build_tool_definitions(gate_opts);
 
-        // 11. Build initial messages. In describe-then-inline modes images
-        //     are never attached to the wire message — the description is part
-        //     of `user_content`.
+        // 11. Build initial messages. images_inline → attach base64 to the
+        //     user message; describe-then-inline → text was already injected.
         let system_msg = ChatMessage::system(system_content);
         let mut user_msg = ChatMessage::user(user_content);
-        if !describes_then_inlines && let Some(ref img) = image_base64 {
-            user_msg.images = Some(vec![img.clone()]);
+        if backend.images_inline {
+            if let Some(ref img) = image_base64 {
+                user_msg.images = Some(vec![img.clone()]);
+            }
         }
 
         let mut messages = vec![system_msg, user_msg];
 
-        // 12. Agentic loop — dispatch through the selected backend.
-        let chat_backend: &dyn LlmClient = if let Some(ref lc_c) = llamacpp_client {
-            lc_c
-        } else if let Some(ref or_c) = openrouter_client {
-            or_c
-        } else {
-            &ollama_client
-        };
-
         let loop_span = tracer.start_with_context("ai.agentic.loop", &insight_cx);
         let loop_cx = insight_cx.with_span(loop_span);
 
@@ -4268,7 +3985,8 @@ Return ONLY the summary, nothing else."#,
             iterations_used = iteration + 1;
             log::info!("Agentic iteration {}/{}", iteration + 1, max_iterations);
 
-            let (response, prompt_tokens, eval_tokens) = chat_backend
+            let (response, prompt_tokens, eval_tokens) = backend
+                .chat()
                 .chat_with_tools(messages.clone(), tools.clone())
                 .await?;
 
@@ -4308,13 +4026,11 @@ Return ONLY the summary, nothing else."#,
                         .execute_tool(
                             &tool_call.function.name,
                             &tool_call.function.arguments,
-                            &ollama_client,
+                            &backend,
                             &image_base64,
                             &file_path,
                             user_id,
                             &persona_id,
-                            chat_backend.primary_model(),
-                            &backend_label,
                             &loop_cx,
                         )
                         .await;
@@ -4338,7 +4054,8 @@ Return ONLY the summary, nothing else."#,
                 "Based on the context gathered, please write the final photo insight: a title and a detailed personal summary. Write in first person as {}.",
                 user_display_name()
             )));
-            let (final_response, prompt_tokens, eval_tokens) = chat_backend
+            let (final_response, prompt_tokens, eval_tokens) = backend
+                .chat()
                 .chat_with_tools(messages.clone(), vec![])
                 .await?;
             last_prompt_eval_count = prompt_tokens;
@@ -4360,7 +4077,8 @@ Return ONLY the summary, nothing else."#,
         let title_system = custom_system_prompt.as_deref().unwrap_or(
             "You are my long term memory assistant. Use only the information provided. Do not invent details.",
         );
-        let title_raw = chat_backend
+        let title_raw = backend
+            .chat()
             .generate(&title_prompt, Some(title_system), None)
             .await?;
         let title = title_raw.trim().trim_matches('"').to_string();
@@ -4383,7 +4101,7 @@ Return ONLY the summary, nothing else."#,
         };
 
         // 15. Store insight (returns the persisted row including its new id)
-        let model_version = chat_backend.primary_model().to_string();
+        let model_version = backend.model().to_string();
         let fewshot_source_ids_json = if fewshot_source_ids.is_empty() {
             None
         } else {
@@ -4398,7 +4116,7 @@ Return ONLY the summary, nothing else."#,
             model_version,
             is_current: true,
             training_messages,
-            backend: backend_label.clone(),
+            backend: kind.as_str().to_string(),
             fewshot_source_ids: fewshot_source_ids_json,
             content_hash: None,
         };
diff --git a/src/state.rs b/src/state.rs
index c4f810a..8cfccbb 100644
--- a/src/state.rs
+++ b/src/state.rs
@@ -290,9 +290,6 @@ impl Default for AppState {
             Arc::new(tokio::sync::Mutex::new(std::collections::HashMap::new()));
         let insight_chat = Arc::new(InsightChatService::new(
             Arc::new(insight_generator.clone()),
-            ollama.clone(),
-            openrouter.clone(),
-            llamacpp.clone(),
             insight_dao.clone(),
             chat_locks,
         ));
@@ -470,9 +467,6 @@ impl AppState {
             Arc::new(tokio::sync::Mutex::new(std::collections::HashMap::new()));
         let insight_chat = Arc::new(InsightChatService::new(
             Arc::new(insight_generator.clone()),
-            ollama.clone(),
-            None,
-            None,
             insight_dao.clone(),
             chat_locks,
         ));
-- 
2.49.1


From fb388c29d78152b53b42cd24f29855aacd0cd109 Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Sun, 24 May 2026 15:03:12 -0400
Subject: [PATCH 06/11] docs: update env + CLAUDE.md for direct-vision llamacpp
 + ResolvedBackend

llamacpp models now receive images directly instead of
describe-then-inline. LLAMA_SWAP_VISION_MODEL defaults to the
primary model. Document the ResolvedBackend dispatch pattern.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .env.example | 10 ++++++----
 CLAUDE.md    | 45 +++++++++++++++++++++++++++++----------------
 2 files changed, 35 insertions(+), 20 deletions(-)

diff --git a/.env.example b/.env.example
index 81fab4e..f7a1004 100644
--- a/.env.example
+++ b/.env.example
@@ -66,15 +66,17 @@ AGENTIC_CHAT_MAX_ITERATIONS=6
 # ── AI Insights — llama.cpp / llama-swap (optional) ─────────────────────
 # Set LLAMA_SWAP_URL plus LLM_BACKEND=llamacpp to swap the local stack
 # off Ollama. Talks OpenAI-compatible /v1 to a llama-swap proxy fronting
-# per-slot llama-server instances (chat / vision / embed). The chat slot
-# is treated as text-only — images are pre-described via the vision slot
-# and inlined into the prompt.
+# per-slot llama-server instances. Chat models receive images directly
+# via content-parts (vision-capable models assumed); a separate vision
+# slot is used only by the describe_photo tool and describe-image utility.
 # LLAMA_SWAP_URL=http://localhost:9292/v1
 # LLAMA_SWAP_PRIMARY_MODEL=chat
+# Optional dedicated vision slot for describe_image. Defaults to
+# PRIMARY_MODEL so describe_photo works without extra config.
 # LLAMA_SWAP_VISION_MODEL=vision
 # LLAMA_SWAP_EMBEDDING_MODEL=embed
 # Comma-separated allowlist surfaced by /insights/models when
-# LLM_BACKEND=llamacpp.
+# LLM_BACKEND=llamacpp. All report has_vision=true.
 # LLAMA_SWAP_ALLOWED_MODELS=chat,vision,embed
 # LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180
 
diff --git a/CLAUDE.md b/CLAUDE.md
index d06b29f..7f1da76 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -640,17 +640,16 @@ OPENROUTER_APP_TITLE=ImageApi                  # Optional attribution header
 LLM_BACKEND=ollama
 
 # llama.cpp / llama-swap (used when LLM_BACKEND=llamacpp). OpenAI-compatible
-# proxy hosting one or more llama-server processes (chat / vision / embed slots).
+# proxy hosting one or more llama-server processes. Chat models receive
+# images directly via content-parts (all models assumed vision-capable).
 LLAMA_SWAP_URL=http://localhost:9292/v1         # Required when LLM_BACKEND=llamacpp
 LLAMA_SWAP_PRIMARY_MODEL=chat                   # Chat slot id (matches config.yaml)
-LLAMA_SWAP_VISION_MODEL=vision                  # Vision slot id; describe_image routes here.
-                                                # The only slot reported as has_vision=true in
-                                                # /insights/models — chat slots are treated as
-                                                # text-only (images pre-described and inlined).
+LLAMA_SWAP_VISION_MODEL=                        # Dedicated vision slot for describe_image / describe_photo
+                                                # tool. Defaults to PRIMARY_MODEL when unset.
 LLAMA_SWAP_EMBEDDING_MODEL=embed                # Embedding slot id
 LLAMA_SWAP_ALLOWED_MODELS=chat,coder            # Curated allowlist surfaced by GET /insights/models
-                                                # when LLM_BACKEND=llamacpp. Empty = picker shows
-                                                # only the configured primary model.
+                                                # when LLM_BACKEND=llamacpp. All report has_vision=true.
+                                                # Empty = picker shows only the configured primary model.
 LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180          # Per-request timeout; bump for slow CPU offload
 
 # Insight Chat Continuation
@@ -675,21 +674,35 @@ This allows runtime verification of model availability before generating insight
 **Local backend switch (`LLM_BACKEND`):**
 
 One env var decides which "local" stack the server runs against — `ollama`
-(default) or `llamacpp`. It's global on purpose: chat, vision describe, and
+(default) or `llamacpp`. It's global on purpose: chat, vision, and
 embeddings all route through the same backend, so the embedding-vector
 column in SQLite stays in one vector space. Don't flip mid-deploy without
 re-embedding the affected rows — similarity search will collapse.
 
-- `LLM_BACKEND=ollama`: chat and embeddings use Ollama; vision describe
-  uses Ollama's multimodal model.
-- `LLM_BACKEND=llamacpp`: chat hits llama-swap's `chat` slot (which is
-  treated as text-only — images are pre-described via the `vision` slot
-  and inlined), embeddings hit the `embed` slot, vision describe hits the
-  `vision` slot. Requires `LLAMA_SWAP_URL`.
+- `LLM_BACKEND=ollama`: chat, vision, and embeddings use Ollama. Vision
+  capability is probed per-model via `/api/show`.
+- `LLM_BACKEND=llamacpp`: chat models receive images directly via OpenAI
+  content-parts (all models assumed vision-capable). Embeddings hit the
+  `embed` slot. A dedicated `LLAMA_SWAP_VISION_MODEL` slot (defaults to
+  the chat model) handles `describe_image` for the `describe_photo` tool.
+  Requires `LLAMA_SWAP_URL`.
 
 The per-request `backend=hybrid` override is orthogonal: it always sends
-chat to OpenRouter, but the describe pass still routes through whichever
-`LLM_BACKEND` is configured.
+chat to OpenRouter (text-only, images are pre-described and inlined), but
+the describe + embed passes still route through whichever `LLM_BACKEND`
+is configured.
+
+**Backend dispatch (`ResolvedBackend`):**
+
+`InsightGenerator::resolve_backend(kind, overrides)` is the single entry
+point that builds clients for a request. Returns a `ResolvedBackend` with
+two roles: `.chat()` (the agentic/chat client) and `.local()` (local-only
+utility calls: rerank, describe_image, embeddings). `BackendKind` is an
+enum (`Local` | `Hybrid`) replacing the stringly-typed `"local"` /
+`"hybrid"` labels. `SamplingOverrides` groups model/ctx/temp/top_p/top_k/
+min_p per-request overrides. All downstream code (`execute_tool`,
+`run_streaming_agentic_loop`, etc.) takes `&ResolvedBackend` rather than
+individual client references.
 
 `GET /insights/models` returns the local-backend models with capabilities
 in the same envelope shape regardless of `LLM_BACKEND`: Ollama servers
-- 
2.49.1


From 208344ad98c4c3c232a1ff79df15a51ee8f6255e Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Sun, 24 May 2026 19:27:29 -0400
Subject: [PATCH 07/11] ai: mirror chat model on local client to prevent
 mid-turn model swap

When the user selects a model from the picker, the local client's
primary_model and vision_model now match the chat model. Prevents
llama-swap exclusive mode from swapping models when describe_photo
or rerank fires during an agentic turn.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/ai/insight_generator.rs | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs
index f95c6dc..80a7627 100644
--- a/src/ai/insight_generator.rs
+++ b/src/ai/insight_generator.rs
@@ -3635,8 +3635,16 @@ Return ONLY the summary, nothing else."#,
         };
 
         // ── local client (utility calls: rerank, describe_image, etc.) ─
+        // For llamacpp: mirror the chat model selection so rerank /
+        // describe_image hit the same model that's already loaded —
+        // avoids a mid-turn model swap in llama-swap exclusive mode.
         let local: Box<dyn LlmClient> = if local_via_llamacpp {
-            Box::new(self.llamacpp.as_ref().unwrap().as_ref().clone())
+            let mut lc = self.llamacpp.as_ref().unwrap().as_ref().clone();
+            if let Some(ref m) = overrides.model {
+                lc.primary_model = m.clone();
+                lc.set_vision_model(m.clone());
+            }
+            Box::new(lc)
         } else {
             Box::new(self.ollama.clone())
         };
-- 
2.49.1


From 9dba659d1e35b52f7a5e9f3ee4d09f65dcafcc39 Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Sun, 24 May 2026 19:29:51 -0400
Subject: [PATCH 08/11] test: add llamacpp model-slot consistency and
 content-null tests

Cover the properties that prevent mid-turn model swaps in llama-swap
exclusive mode: vision_model defaults to primary, cloned local client
mirrors the user-selected model, embeddings stay on their own slot.
Also test the content:null serialization for tool-calling messages.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/ai/llamacpp.rs | 87 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 87 insertions(+)

diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs
index 2a03aed..77cd05f 100644
--- a/src/ai/llamacpp.rs
+++ b/src/ai/llamacpp.rs
@@ -985,4 +985,91 @@ mod tests {
         assert!(vision.has_vision);
         assert!(other.has_vision);
     }
+
+    #[test]
+    fn vision_model_defaults_to_primary() {
+        let c = LlamaCppClient::new(None, Some("qwen3:32b".into()));
+        assert_eq!(c.primary_model, "qwen3:32b");
+        assert_eq!(c.vision_model, "qwen3:32b");
+    }
+
+    #[test]
+    fn vision_model_explicit_override_diverges_from_primary() {
+        let mut c = LlamaCppClient::new(None, Some("qwen3:32b".into()));
+        c.set_vision_model("minicpm-v".into());
+        assert_eq!(c.primary_model, "qwen3:32b");
+        assert_eq!(c.vision_model, "minicpm-v");
+    }
+
+    #[test]
+    fn cloned_local_with_model_override_keeps_all_slots_consistent() {
+        // Simulates what resolve_backend does for the `local` client:
+        // clone the configured client, then override primary + vision
+        // to match the user-selected chat model. This prevents mid-turn
+        // model swaps in llama-swap exclusive mode.
+        let mut base = LlamaCppClient::new(None, Some("chat".into()));
+        base.set_vision_model("vision".into());
+        base.set_embedding_model("embed".into());
+
+        let mut local = base.clone();
+        let user_selected = "qwen3:32b";
+        local.primary_model = user_selected.to_string();
+        local.set_vision_model(user_selected.to_string());
+
+        // Chat generation (rerank) routes through primary_model.
+        assert_eq!(local.primary_model, user_selected);
+        // describe_image routes through vision_model.
+        assert_eq!(local.vision_model, user_selected);
+        // Embeddings stay on the dedicated slot — separate endpoint,
+        // no model swap conflict.
+        assert_eq!(local.embedding_model, "embed");
+    }
+
+    #[test]
+    fn assistant_tool_calls_emit_null_content() {
+        let msg = ChatMessage {
+            role: "assistant".into(),
+            content: String::new(),
+            tool_calls: Some(vec![ToolCall {
+                id: Some("call_1".into()),
+                function: ToolCallFunction {
+                    name: "search".into(),
+                    arguments: json!({}),
+                },
+            }]),
+            images: None,
+        };
+        let wire = LlamaCppClient::messages_to_openai(&[msg]);
+        assert!(wire[0]["content"].is_null(), "empty content + tool_calls should emit null");
+    }
+
+    #[test]
+    fn assistant_with_content_and_tool_calls_preserves_content() {
+        let msg = ChatMessage {
+            role: "assistant".into(),
+            content: "Let me search for that.".into(),
+            tool_calls: Some(vec![ToolCall {
+                id: Some("call_1".into()),
+                function: ToolCallFunction {
+                    name: "search".into(),
+                    arguments: json!({}),
+                },
+            }]),
+            images: None,
+        };
+        let wire = LlamaCppClient::messages_to_openai(&[msg]);
+        assert_eq!(wire[0]["content"], "Let me search for that.");
+    }
+
+    #[test]
+    fn assistant_without_tool_calls_keeps_empty_string_content() {
+        let msg = ChatMessage {
+            role: "assistant".into(),
+            content: String::new(),
+            tool_calls: None,
+            images: None,
+        };
+        let wire = LlamaCppClient::messages_to_openai(&[msg]);
+        assert_eq!(wire[0]["content"], "");
+    }
 }
-- 
2.49.1


From b9175e27189f63de4c19f2341db388c7d8b4493d Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameron.cordes@pm.me>
Date: Mon, 25 May 2026 15:33:03 -0400
Subject: [PATCH 09/11] image: add xlarge (4096px) on-demand preview tier
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New `PhotoSize::XLarge` variant sits between `Large` (2048px) and
`Full` (original). On-demand generated and disk-cached at
`_xlarge/<hash>.jpg`, same waterfall as `Large` (embedded RAW preview
→ ffmpeg → image crate). Sources below 4096px serve at native size.

Reduces decoded bitmap memory from ~192MB (48MP full) to ~64MB for
the mobile viewer's zoom tier.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/content_hash.rs   |   9 ++++
 src/data/mod.rs       |   1 +
 src/handlers/image.rs | 101 +++++++++++++++++++++++++++++++++++++++---
 src/thumbnails.rs     |  95 +++++++++++++++++++++++++++++++++++++--
 4 files changed, 196 insertions(+), 10 deletions(-)

diff --git a/src/content_hash.rs b/src/content_hash.rs
index d68dbef..a2a4632 100644
--- a/src/content_hash.rs
+++ b/src/content_hash.rs
@@ -62,6 +62,15 @@ pub fn large_preview_path(thumbs_dir: &Path, hash: &str) -> PathBuf {
         .join(format!("{}.jpg", hash))
 }
 
+/// Hash-keyed xlarge-preview path: `<thumbs_dir>/_xlarge/<hash[..2]>/<hash>.jpg`.
+pub fn xlarge_preview_path(thumbs_dir: &Path, hash: &str) -> PathBuf {
+    let shard = shard_prefix(hash);
+    thumbs_dir
+        .join("_xlarge")
+        .join(shard)
+        .join(format!("{}.jpg", hash))
+}
+
 /// Hash-keyed HLS output directory: `<video_dir>/<hash[..2]>/<hash>/`.
 /// The playlist lives at `playlist.m3u8` inside this directory and its
 /// segments are co-located so HLS relative references Just Work. See
diff --git a/src/data/mod.rs b/src/data/mod.rs
index 4780d6c..931b3c6 100644
--- a/src/data/mod.rs
+++ b/src/data/mod.rs
@@ -194,6 +194,7 @@ pub enum MediaType {
 #[serde(rename_all = "lowercase")]
 pub enum PhotoSize {
     Full,
+    XLarge,
     Large,
     Thumb,
 }
diff --git a/src/handlers/image.rs b/src/handlers/image.rs
index ca0f598..c1dbffb 100644
--- a/src/handlers/image.rs
+++ b/src/handlers/image.rs
@@ -83,12 +83,14 @@ pub async fn get_image(
     if let Some((library, path)) = resolved {
         let image_size = req.size.unwrap_or(PhotoSize::Full);
 
-        // `size=large` is only meaningful for stills — there's no useful
-        // "2048px video preview" tier. Videos fall back to the existing
-        // thumb pipeline (which already handles gif/static selection).
-        // `mut` so the Large branch can downgrade itself to `Full` after a
-        // generation failure (RAW-preview branch below keys off `Full`).
-        let mut image_size = if image_size == PhotoSize::Large && file_types::is_video_file(&path) {
+        // `size=large|xlarge` is only meaningful for stills — there's no
+        // useful "resized video preview" tier. Videos fall back to the
+        // existing thumb pipeline (which already handles gif/static
+        // selection). `mut` so preview branches can downgrade to `Full`
+        // after a generation failure.
+        let mut image_size = if (image_size == PhotoSize::Large || image_size == PhotoSize::XLarge)
+            && file_types::is_video_file(&path)
+        {
             PhotoSize::Thumb
         } else {
             image_size
@@ -196,6 +198,93 @@ pub async fn get_image(
             image_size = PhotoSize::Full;
         }
 
+        if image_size == PhotoSize::XLarge {
+            let relative_path = path
+                .strip_prefix(&library.root_path)
+                .expect("Error stripping library root prefix from xlarge preview");
+            let relative_path_str = relative_path.to_string_lossy().replace('\\', "/");
+            let thumbs = Path::new(&app_state.thumbnail_path);
+            let xlarge_dir = thumbs.join("_xlarge");
+
+            let hash_xlarge_path: Option<PathBuf> = {
+                let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
+                match dao.get_exif(&context, &relative_path_str) {
+                    Ok(Some(row)) => row
+                        .content_hash
+                        .as_deref()
+                        .map(|h| content_hash::xlarge_preview_path(thumbs, h)),
+                    _ => None,
+                }
+            };
+            let scoped_legacy_xlarge_path =
+                content_hash::library_scoped_legacy_path(&xlarge_dir, library.id, relative_path);
+
+            let existing = hash_xlarge_path
+                .as_ref()
+                .filter(|p| p.exists())
+                .cloned()
+                .or_else(|| {
+                    if scoped_legacy_xlarge_path.exists() {
+                        Some(scoped_legacy_xlarge_path.clone())
+                    } else {
+                        None
+                    }
+                });
+
+            if let Some(found) = existing {
+                if let Ok(file) = NamedFile::open(&found) {
+                    span.set_status(Status::Ok);
+                    return file
+                        .use_etag(true)
+                        .use_last_modified(true)
+                        .prefer_utf8(true)
+                        .into_response(&request);
+                }
+            }
+
+            let dest = hash_xlarge_path
+                .clone()
+                .unwrap_or_else(|| scoped_legacy_xlarge_path.clone());
+            let src = path.clone();
+            let dest_for_block = dest.clone();
+            let generated = web::block(move || {
+                if let Some(parent) = dest_for_block.parent() {
+                    std::fs::create_dir_all(parent)?;
+                }
+                let tmp = dest_for_block.with_extension("jpg.tmp");
+                crate::thumbnails::generate_xlarge_preview(&src, &tmp)?;
+                std::fs::rename(&tmp, &dest_for_block)?;
+                Ok::<(), std::io::Error>(())
+            })
+            .await;
+
+            match generated {
+                Ok(Ok(())) => {
+                    if let Ok(file) = NamedFile::open(&dest) {
+                        span.set_status(Status::Ok);
+                        return file
+                            .use_etag(true)
+                            .use_last_modified(true)
+                            .prefer_utf8(true)
+                            .into_response(&request);
+                    }
+                }
+                Ok(Err(e)) => {
+                    warn!(
+                        "XLarge preview generation failed for {:?}: {} — falling back to original",
+                        path, e
+                    );
+                }
+                Err(e) => {
+                    warn!(
+                        "XLarge preview blocking-pool error for {:?}: {} — falling back to original",
+                        path, e
+                    );
+                }
+            }
+            image_size = PhotoSize::Full;
+        }
+
         if image_size == PhotoSize::Thumb {
             let relative_path = path
                 .strip_prefix(&library.root_path)
diff --git a/src/thumbnails.rs b/src/thumbnails.rs
index 51b6200..a7334b0 100644
--- a/src/thumbnails.rs
+++ b/src/thumbnails.rs
@@ -36,12 +36,19 @@ use crate::video::actors::{generate_image_thumbnail_ffmpeg, generate_video_thumb
 /// `size=full` and the handler streams the original bytes.
 pub const LARGE_PREVIEW_MAX_DIM: u32 = 2048;
 
-/// JPEG quality for the large preview tier. 85 is the conventional
-/// "indistinguishable from source at viewing size" point — well above the
-/// `image` crate's default ~75, but well below quality-90+ territory where
-/// file size doubles for no perceptible win.
+/// JPEG quality for the large and xlarge preview tiers. 85 is the
+/// conventional "indistinguishable from source at viewing size" point —
+/// well above the `image` crate's default ~75, but well below quality-90+
+/// territory where file size doubles for no perceptible win.
 const LARGE_PREVIEW_JPEG_QUALITY: u8 = 85;
 
+/// Maximum long-edge size (px) for the xlarge preview tier. Bridges the
+/// gap between `large` (2048px, ~16MB decoded) and the original bytes
+/// (potentially 48+ MP / ~192MB decoded). At 4096px the decoded bitmap is
+/// ~64MB — enough for 2-3× pinch-zoom on any phone before the viewer
+/// needs to stream the true original.
+pub const XLARGE_PREVIEW_MAX_DIM: u32 = 4096;
+
 lazy_static! {
     pub static ref IMAGE_GAUGE: IntGauge = IntGauge::new(
         "imageserver_image_total",
@@ -205,6 +212,86 @@ fn generate_large_preview_ffmpeg(src: &Path, dest: &Path) -> std::io::Result<()>
     Ok(())
 }
 
+/// Generate the on-demand xlarge-preview tier (≈4096 long edge JPEG).
+///
+/// Same waterfall as [`generate_large_preview`] but targeting
+/// [`XLARGE_PREVIEW_MAX_DIM`]. Sources whose long edge is already below
+/// the cap are encoded at native size (no upscale).
+pub fn generate_xlarge_preview(src: &Path, dest: &Path) -> std::io::Result<()> {
+    let orientation = exif::read_orientation(src).unwrap_or(1);
+
+    if let Some(preview) = exif::extract_embedded_jpeg_preview(src) {
+        let img = image::load_from_memory(&preview).map_err(|e| {
+            std::io::Error::new(
+                std::io::ErrorKind::InvalidData,
+                format!("decode embedded preview {:?}: {}", src, e),
+            )
+        })?;
+        let img = exif::apply_orientation(img, orientation);
+        return encode_xlarge_jpeg(img, dest);
+    }
+
+    if file_types::needs_ffmpeg_thumbnail(src) {
+        return generate_xlarge_preview_ffmpeg(src, dest);
+    }
+
+    let img = image::open(src).map_err(|e| {
+        std::io::Error::new(std::io::ErrorKind::InvalidData, format!("{:?}: {}", src, e))
+    })?;
+    let img = exif::apply_orientation(img, orientation);
+    encode_xlarge_jpeg(img, dest)
+}
+
+fn encode_xlarge_jpeg(img: image::DynamicImage, dest: &Path) -> std::io::Result<()> {
+    let (w, h) = img.dimensions();
+    let max_dim = w.max(h);
+    let scaled = if max_dim > XLARGE_PREVIEW_MAX_DIM {
+        img.thumbnail(XLARGE_PREVIEW_MAX_DIM, XLARGE_PREVIEW_MAX_DIM)
+    } else {
+        img
+    };
+    let file = std::fs::File::create(dest)
+        .map_err(|e| std::io::Error::other(format!("create {:?}: {}", dest, e)))?;
+    let mut writer = std::io::BufWriter::new(file);
+    let mut encoder = JpegEncoder::new_with_quality(&mut writer, LARGE_PREVIEW_JPEG_QUALITY);
+    encoder
+        .encode_image(&scaled)
+        .map_err(|e| std::io::Error::other(format!("encode {:?}: {}", dest, e)))?;
+    Ok(())
+}
+
+fn generate_xlarge_preview_ffmpeg(src: &Path, dest: &Path) -> std::io::Result<()> {
+    let vf = format!(
+        "scale='if(gt(iw,ih),min(iw,{cap}),-1)':'if(gt(iw,ih),-1,min(ih,{cap}))'",
+        cap = XLARGE_PREVIEW_MAX_DIM
+    );
+    let output = Command::new("ffmpeg")
+        .arg("-y")
+        .arg("-i")
+        .arg(src)
+        .arg("-vframes")
+        .arg("1")
+        .arg("-vf")
+        .arg(&vf)
+        .arg("-q:v")
+        .arg("5")
+        .arg("-f")
+        .arg("image2")
+        .arg("-c:v")
+        .arg("mjpeg")
+        .arg(dest)
+        .output()?;
+
+    if !output.status.success() {
+        return Err(std::io::Error::other(format!(
+            "ffmpeg failed ({}): {}",
+            output.status,
+            String::from_utf8_lossy(&output.stderr).trim()
+        )));
+    }
+    Ok(())
+}
+
 pub fn create_thumbnails(libs: &[libraries::Library], excluded_dirs: &[String]) {
     let tracer = global_tracer();
     let span = tracer.start("creating thumbnails");
-- 
2.49.1


From 0a627f48806732629e6cf4535f4dd904cd755eac Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Mon, 25 May 2026 21:46:18 -0400
Subject: [PATCH 10/11] Add contact name filter to SMS search tool + misc
 improvements

- sms search tool: accept contact name, trim/validate, skip when
  contact_id is set, pass to API client
- sms_client: new contact field in SmsSearchParams, URL-encode on wire
- Tool description clarifies contact_id takes precedence when both given
- Add parse_title_body helper for LLM response parsing
- llamacpp backend improvements
---
 src/ai/insight_chat.rs      |  33 +-------
 src/ai/insight_generator.rs | 147 ++++++++++++++++++++++++++----------
 src/ai/llamacpp.rs          |  46 +++++++++++
 src/ai/sms_client.rs        |   6 ++
 4 files changed, 165 insertions(+), 67 deletions(-)

diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs
index 6d52f8b..86671af 100644
--- a/src/ai/insight_chat.rs
+++ b/src/ai/insight_chat.rs
@@ -817,7 +817,8 @@ impl InsightChatService {
 
         let mut amended_insight_id: Option<i32> = None;
         if req.amend {
-            let title = self.generate_title(&backend, &final_content).await?;
+            let (title, body) = crate::ai::insight_generator::parse_title_body(&final_content);
+            let final_content = body;
 
             // Amended rows intentionally do not inherit the parent's
             // `fewshot_source_ids`. The parent's few-shot influence is still
@@ -1001,7 +1002,7 @@ impl InsightChatService {
             final_content,
         } = outcome;
 
-        let title = self.generate_title(&backend, &final_content).await?;
+        let (title, body) = crate::ai::insight_generator::parse_title_body(&final_content);
 
         let json = serde_json::to_string(&messages)
             .map_err(|e| anyhow!("failed to serialize chat history: {}", e))?;
@@ -1009,7 +1010,7 @@ impl InsightChatService {
             library_id: req.library_id,
             file_path: normalized.clone(),
             title,
-            summary: final_content,
+            summary: body,
             generated_at: Utc::now().timestamp(),
             model_version: model_used.clone(),
             is_current: true,
@@ -1045,32 +1046,6 @@ impl InsightChatService {
         Ok(())
     }
 
-    /// Generate a short title via the same chat backend so voice stays
-    /// consistent with the body. Mirrors generate_agentic_insight_for_photo's
-    /// titling pass.
-    async fn generate_title(
-        &self,
-        backend: &ResolvedBackend,
-        final_content: &str,
-    ) -> Result<String> {
-        let title_prompt = format!(
-            "Create a short title (maximum 8 words) for the following journal entry:\n\n{}\n\n\
-             Capture the key moment or theme. Return ONLY the title, nothing else.",
-            final_content
-        );
-        let title_raw = backend
-            .chat()
-            .generate(
-                &title_prompt,
-                Some(
-                    "You are my long term memory assistant. Use only the information provided. Do not invent details.",
-                ),
-                None,
-            )
-            .await?;
-        Ok(title_raw.trim().trim_matches('"').to_string())
-    }
-
     /// Drive the agentic loop with streaming SSE events. Shared between
     /// bootstrap and continuation. Mutates `messages` in place (response
     /// turns + tool results are appended) and returns counters + the
diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs
index 80a7627..b0cd54c 100644
--- a/src/ai/insight_generator.rs
+++ b/src/ai/insight_generator.rs
@@ -28,6 +28,39 @@ use crate::otel::global_tracer;
 use crate::tags::TagDao;
 use crate::utils::{earliest_fs_time, normalize_path};
 
+/// Parse a "Title: ...\n\n<body>" response into (title, body).
+/// Falls back to the first sentence as the title if the model didn't
+/// follow the format.
+pub(crate) fn parse_title_body(raw: &str) -> (String, String) {
+    let trimmed = raw.trim();
+
+    // Try "Title: <title>\n\n<body>" or "Title: <title>\n<body>"
+    if let Some(rest) = trimmed.strip_prefix("Title:").or_else(|| trimmed.strip_prefix("title:")) {
+        let rest = rest.trim_start();
+        if let Some(split_pos) = rest.find("\n\n").or_else(|| rest.find('\n')) {
+            let title = rest[..split_pos].trim();
+            let body = rest[split_pos..].trim();
+            if !title.is_empty() && !body.is_empty() {
+                return (title.to_string(), body.to_string());
+            }
+        }
+    }
+
+    // Fallback: first sentence (up to first `. ` or `.\n`) becomes the title
+    if let Some(pos) = trimmed.find(". ").or_else(|| trimmed.find(".\n")) {
+        let title = &trimmed[..pos];
+        let body = trimmed[pos + 1..].trim();
+        if title.len() <= 100 && !body.is_empty() {
+            return (title.to_string(), body.to_string());
+        }
+    }
+
+    // Last resort: truncate to 60 chars for title, full text as body
+    let title: String = trimmed.chars().take(60).collect();
+    let title = title.trim_end().to_string();
+    (title, trimmed.to_string())
+}
+
 /// Combine an optional personal Apollo Place with an optional Nominatim
 /// reverse-geocoded city, falling back to bare coordinates when neither
 /// resolves. Free function so we can test it cheaply without spinning up
@@ -1927,6 +1960,11 @@ Return ONLY the summary, nothing else."#,
             .unwrap_or(20)
             .clamp(1, 50) as usize;
         let contact_id = args.get("contact_id").and_then(|v| v.as_i64());
+        let contact = args.get("contact")
+            .and_then(|v| v.as_str())
+            .map(|s| s.trim().to_string())
+            .filter(|s| !s.is_empty())
+            .filter(|_| contact_id.is_none());
         let start_ts = args.get("start_ts").and_then(|v| v.as_i64());
         let end_ts = args.get("end_ts").and_then(|v| v.as_i64());
         let is_mms = args.get("is_mms").and_then(|v| v.as_bool());
@@ -1934,10 +1972,11 @@ Return ONLY the summary, nothing else."#,
         let has_date_filter = start_ts.is_some() || end_ts.is_some();
 
         log::info!(
-            "tool_search_messages: query='{}', mode={}, contact_id={:?}, range=[{:?}, {:?}], is_mms={:?}, has_media={:?}, limit={}",
+            "tool_search_messages: query='{}', mode={}, contact_id={:?}, contact={:?}, range=[{:?}, {:?}], is_mms={:?}, has_media={:?}, limit={}",
             query,
             mode,
             contact_id,
+            contact,
             start_ts,
             end_ts,
             is_mms,
@@ -1952,6 +1991,7 @@ Return ONLY the summary, nothing else."#,
             mode: mode.as_str(),
             limit: user_limit,
             contact_id,
+            contact,
             date_from: start_ts,
             date_to: end_ts,
             is_mms,
@@ -3033,28 +3073,31 @@ Return ONLY the summary, nothing else."#,
         }
 
         tools.push(Tool::function(
-            "search_messages",
+"search_messages",
             "Search SMS/MMS messages — bodies and (for MMS) attachment text + filenames. \
-             Modes: `fts5` (keyword + phrase + prefix + AND/OR/NOT + NEAR proximity), \
-             `semantic` (embedding similarity, requires generated embeddings), `hybrid` (RRF merge, recommended; \
-             degrades to fts5 when embeddings absent). Optional filters: `start_ts` / `end_ts` (real-UTC unix \
-             seconds), `contact_id`, `is_mms` (true = MMS only, false = SMS only), `has_media` (true = messages \
-             with image/video/audio attachments only). For pure date / contact browsing without keywords, prefer \
-             `get_sms_messages`. \
-             \n\nFTS5 query syntax (works in fts5 + hybrid modes):\n\
-             - Phrase:   `\"trader joe's\"` — exact word sequence (use double quotes).\n\
-             - Prefix:   `restaur*` — matches restaurant, restaurants, restauranteur, ….\n\
-             - Boolean:  `dinner AND tahoe`, `wedding OR reception OR ceremony`, `vacation NOT work` (operators must be UPPERCASE).\n\
-             - Proximity: `NEAR(meeting work, 5)` — both terms within 5 tokens of each other.\n\
-             - Combine:  `(reception OR ceremony) AND tahoe*` — group with parens.\n\
-             Unquoted multi-word queries are treated as implicit AND. Apostrophes / hyphens / colons are safe — they no longer downgrade to a slow LIKE scan. Use `mode: \"fts5\"` when you want the operators above to be authoritative; `hybrid` still respects them but may surface semantically-similar non-keyword hits alongside.\n\n\
-             Examples:\n\
-             - `{query: \"trader joe's\"}` — phrase across all time.\n\
-             - `{query: \"dinner\", contact_id: 42, start_ts: 1700000000, end_ts: 1700604800}` — keyword within a contact and a week.\n\
-             - `{query: \"vacation\", has_media: true}` — only matches that include photos / videos.\n\
-             - `{query: \"wedding OR reception OR ceremony\", mode: \"fts5\"}` — any of several synonyms.\n\
-             - `{query: \"restaur*\", mode: \"fts5\"}` — prefix expansion for varying word forms.\n\
-             - `{query: \"NEAR(birthday cake, 5)\", mode: \"fts5\"}` — terms close together but in any order.",
+              Modes: `fts5` (keyword + phrase + prefix + AND/OR/NOT + NEAR proximity), \
+              `semantic` (embedding similarity, requires generated embeddings), `hybrid` (RRF merge, recommended; \
+              degrades to fts5 when embeddings absent). Optional filters: `start_ts` / `end_ts` (real-UTC unix \
+              seconds), `contact` (contact name, case-insensitive), `contact_id` (numeric), `is_mms` \
+              (true = MMS only, false = SMS only), `has_media` (true = messages with image/video/audio \
+              attachments only). Prefer `contact` over `contact_id` — the name is resolved server-side. \
+              If both are provided, `contact_id` takes precedence. \
+              For pure date / contact browsing without keywords, prefer `get_sms_messages`. \
+              \n\nFTS5 query syntax (works in fts5 + hybrid modes):\n\
+              - Phrase:   `\"trader joe's\"` — exact word sequence (use double quotes).\n\
+              - Prefix:   `restaur*` — matches restaurant, restaurants, restauranteur, ….\n\
+              - Boolean:  `dinner AND tahoe`, `wedding OR reception OR ceremony`, `vacation NOT work` (operators must be UPPERCASE).\n\
+              - Proximity: `NEAR(meeting work, 5)` — both terms within 5 tokens of each other.\n\
+              - Combine:  `(reception OR ceremony) AND tahoe*` — group with parens.\n\
+              Unquoted multi-word queries are treated as implicit AND. Apostrophes / hyphens / colons are safe — they no longer downgrade to a slow LIKE scan. Use `mode: \"fts5\"` when you want the operators above to be authoritative; `hybrid` still respects them but may surface semantically-similar non-keyword hits alongside.\n\n\
+              Examples:\n\
+              - `{query: \"trader joe's\"}` — phrase across all time.\n\
+              - `{query: \"dinner\", contact: \"Mom\"}` — keyword scoped to Mom's messages.\n\
+              - `{query: \"dinner\", contact_id: 42, start_ts: 1700000000, end_ts: 1700604800}` — keyword within a contact and a week.\n\
+              - `{query: \"vacation\", has_media: true}` — only matches that include photos / videos.\n\
+              - `{query: \"wedding OR reception OR ceremony\", mode: \"fts5\"}` — any of several synonyms.\n\
+              - `{query: \"restaur*\", mode: \"fts5\"}` — prefix expansion for varying word forms.\n\
+              - `{query: \"NEAR(birthday cake, 5)\", mode: \"fts5\"}` — terms close together but in any order.",
             serde_json::json!({
                 "type": "object",
                 "required": ["query"],
@@ -3063,6 +3106,7 @@ Return ONLY the summary, nothing else."#,
                     "mode":  { "type": "string", "enum": ["fts5", "semantic", "hybrid"], "description": "Search strategy. Default: hybrid." },
                     "limit": { "type": "integer", "description": "Max results (default 20, max 50)." },
                     "contact_id": { "type": "integer", "description": "Optional numeric contact id to scope the search." },
+                    "contact":    { "type": "string", "description": "Optional contact name (case-insensitive). Resolved to contact_id server-side. Use this when you know the name but not the ID." },
                     "start_ts": { "type": "integer", "description": "Optional inclusive lower bound, real-UTC unix seconds." },
                     "end_ts":   { "type": "integer", "description": "Optional inclusive upper bound, real-UTC unix seconds." },
                     "is_mms":   { "type": "boolean", "description": "Optional: true to restrict to MMS, false to restrict to SMS." },
@@ -3534,7 +3578,8 @@ Return ONLY the summary, nothing else."#,
              - When you identify people / places / events / things, use store_entity + store_fact to grow the persistent memory.\n\
              - Before store_entity, call recall_entities to check whether a similar name already exists; reuse the existing entity_id rather than creating a near-duplicate (e.g. \"Sara\" vs \"Sarah J.\"). The DAO will collapse obvious cosine matches, but choosing the existing id keeps facts and photo links consolidated.\n\
              - Predicates should be relationship-shaped verbs that encode a queryable claim — `lives_in`, `works_at`, `attended`, `is_friend_of`, `is_parent_of`, `interested_in`, `married_to`, `owns`. DO NOT use vague speech-act predicates like `expressed`, `said`, `mentioned`, `stated`, `quoted`, `noted`, `discussed`, `thought`, `wondered`. DO NOT store quotations or sentence fragments as `object_value` — paraphrase into a structured claim. Bad: `(Cameron, expressed, \"I'm tempted to get a part-time job there\")`. Good: `(Cameron, considered_employment_at, <Place>)` or `(Cameron, interested_in, \"part-time work\")`.\n\
-             - A tool returning no results is informative; continue with the others.",
+             - A tool returning no results is informative; continue with the others.\n\
+             - When writing your final answer, start with \"Title: <short title>\" (max 8 words) on the first line, then a blank line, then the body.",
         );
 
         let mut out = identity;
@@ -4059,7 +4104,7 @@ Return ONLY the summary, nothing else."#,
                 iterations_used
             );
             messages.push(ChatMessage::user(format!(
-                "Based on the context gathered, please write the final photo insight: a title and a detailed personal summary. Write in first person as {}.",
+                "Based on the context gathered, please write the final photo insight. Start with \"Title: <short title>\" on the first line (max 8 words), then a blank line, then the detailed personal summary. Write in first person as {}.",
                 user_display_name()
             )));
             let (final_response, prompt_tokens, eval_tokens) = backend
@@ -4077,21 +4122,11 @@ Return ONLY the summary, nothing else."#,
             .set_attribute(KeyValue::new("iterations_used", iterations_used as i64));
         loop_cx.span().set_status(Status::Ok);
 
-        // 13. Generate title via the same backend so voice stays consistent.
-        let title_prompt = format!(
-            "Create a short title (maximum 8 words) for the following journal entry:\n\n{}\n\nCapture the key moment or theme. Return ONLY the title, nothing else.",
-            final_content
-        );
-        let title_system = custom_system_prompt.as_deref().unwrap_or(
-            "You are my long term memory assistant. Use only the information provided. Do not invent details.",
-        );
-        let title_raw = backend
-            .chat()
-            .generate(&title_prompt, Some(title_system), None)
-            .await?;
-        let title = title_raw.trim().trim_matches('"').to_string();
+        // 13. Parse title from the model's inline response.
+        let (title, body) = parse_title_body(&final_content);
+        final_content = body;
 
-        log::info!("Agentic generated title: {}", title);
+        log::info!("Agentic parsed title: {}", title);
         let summary_preview: String = final_content.chars().take(200).collect();
         log::info!(
             "Agentic generated summary ({} chars): {}",
@@ -4742,4 +4777,40 @@ mod tests {
         assert!(out.contains("-> empty (pivoted)"));
         assert!(out.contains("Final insight: Final title"));
     }
+
+    #[test]
+    fn parse_title_body_standard_format() {
+        let (t, b) = parse_title_body("Title: Summer at the Lake\n\nWe spent the afternoon...");
+        assert_eq!(t, "Summer at the Lake");
+        assert_eq!(b, "We spent the afternoon...");
+    }
+
+    #[test]
+    fn parse_title_body_single_newline() {
+        let (t, b) = parse_title_body("Title: Morning Walk\nThe sun was rising...");
+        assert_eq!(t, "Morning Walk");
+        assert_eq!(b, "The sun was rising...");
+    }
+
+    #[test]
+    fn parse_title_body_lowercase_prefix() {
+        let (t, b) = parse_title_body("title: Garden Party\n\nEveryone gathered...");
+        assert_eq!(t, "Garden Party");
+        assert_eq!(b, "Everyone gathered...");
+    }
+
+    #[test]
+    fn parse_title_body_fallback_first_sentence() {
+        let (t, b) = parse_title_body("A warm summer day. We gathered at the park for a picnic.");
+        assert_eq!(t, "A warm summer day");
+        assert_eq!(b, "We gathered at the park for a picnic.");
+    }
+
+    #[test]
+    fn parse_title_body_fallback_truncate() {
+        let input = "A single long paragraph with no periods or title prefix that just keeps going on and on";
+        let (t, b) = parse_title_body(input);
+        assert!(t.len() <= 60);
+        assert_eq!(b, input);
+    }
 }
diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs
index 77cd05f..8ea1063 100644
--- a/src/ai/llamacpp.rs
+++ b/src/ai/llamacpp.rs
@@ -354,6 +354,8 @@ impl LlamaCppClient {
             .and_then(|v| v.as_i64())
             .map(|n| n as i32);
 
+        log_timings(&parsed, prompt_tokens, completion_tokens);
+
         Ok((chat_msg, prompt_tokens, completion_tokens))
     }
 }
@@ -456,6 +458,7 @@ impl LlmClient for LlamaCppClient {
             let mut role = "assistant".to_string();
             let mut prompt_tokens: Option<i32> = None;
             let mut completion_tokens: Option<i32> = None;
+            let mut last_frame: Option<Value> = None;
             let mut done_seen = false;
 
             while let Some(chunk) = byte_stream.next().await {
@@ -505,6 +508,7 @@ impl LlmClient for LlamaCppClient {
                                 .get("completion_tokens")
                                 .and_then(|n| n.as_i64())
                                 .map(|n| n as i32);
+                            last_frame = Some(v.clone());
                         }
 
                         let Some(choices) = v.get("choices").and_then(|c| c.as_array())
@@ -587,6 +591,10 @@ impl LlmClient for LlamaCppClient {
                 Some(v)
             };
 
+            if let Some(ref frame) = last_frame {
+                log_timings(frame, prompt_tokens, completion_tokens);
+            }
+
             let message = ChatMessage {
                 role,
                 content: accumulated_content,
@@ -720,6 +728,44 @@ impl LlamaCppClient {
 /// Extract a diagnostic fragment from a llama-swap / llama-server response
 /// that doesn't match the expected `{choices: [...]}` shape. llama-server
 /// returns errors as `{"error": {"message": "...", "code": N, "type": "..."}}`;
+fn log_timings(parsed: &Value, prompt_tokens: Option<i32>, completion_tokens: Option<i32>) {
+    let timings = match parsed.get("timings") {
+        Some(t) => t,
+        None => return,
+    };
+    let prompt_tps = timings.get("prompt_per_second").and_then(|v| v.as_f64());
+    let gen_tps = timings.get("predicted_per_second").and_then(|v| v.as_f64());
+    let prompt_ms = timings.get("prompt_ms").and_then(|v| v.as_f64());
+    let gen_ms = timings.get("predicted_ms").and_then(|v| v.as_f64());
+
+    let mut parts: Vec<String> = Vec::new();
+    if let Some(c) = prompt_tokens {
+        let mut s = format!("prompt={} tok", c);
+        if let Some(ms) = prompt_ms {
+            s.push_str(&format!(" ({:.0} ms", ms));
+            if let Some(tps) = prompt_tps {
+                s.push_str(&format!(", {:.1} tok/s", tps));
+            }
+            s.push(')');
+        }
+        parts.push(s);
+    }
+    if let Some(c) = completion_tokens {
+        let mut s = format!("gen={} tok", c);
+        if let Some(ms) = gen_ms {
+            s.push_str(&format!(" ({:.0} ms", ms));
+            if let Some(tps) = gen_tps {
+                s.push_str(&format!(", {:.1} tok/s", tps));
+            }
+            s.push(')');
+        }
+        parts.push(s);
+    }
+    if !parts.is_empty() {
+        log::info!("llama-swap chat metrics — {}", parts.join(", "));
+    }
+}
+
 /// llama-swap itself sometimes wraps subprocess failures with its own
 /// `{"error": "..."}` flat shape. Surface either when present, otherwise fall
 /// back to a truncated raw-JSON view.
diff --git a/src/ai/sms_client.rs b/src/ai/sms_client.rs
index 6661bac..d5e175f 100644
--- a/src/ai/sms_client.rs
+++ b/src/ai/sms_client.rs
@@ -281,6 +281,9 @@ impl SmsApiClient {
         if let Some(cid) = params.contact_id {
             url.push_str(&format!("&contact_id={}", cid));
         }
+        if let Some(ref c) = params.contact {
+            url.push_str(&format!("&contact={}", urlencoding::encode(c)));
+        }
         if let Some(off) = params.offset {
             url.push_str(&format!("&offset={}", off));
         }
@@ -413,6 +416,9 @@ pub struct SmsSearchParams<'a> {
     pub mode: &'a str,
     pub limit: usize,
     pub contact_id: Option<i64>,
+    /// Contact name (case-insensitive). Resolved to a numeric ID by the
+    /// SMS-API server when `contact_id` is not set.
+    pub contact: Option<String>,
     /// Unix-seconds inclusive lower bound on `date`.
     pub date_from: Option<i64>,
     /// Unix-seconds inclusive upper bound on `date`.
-- 
2.49.1


From b03ee6034237e99248c8f4f4d7f5cb91b972ef2e Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Tue, 26 May 2026 09:55:16 -0400
Subject: [PATCH 11/11] fix: prevent hybrid mode from leaking OpenRouter model
 to local llamacpp client

When backend=hybrid with LLM_BACKEND=llamacpp, the user-selected model
(an OpenRouter id like "google/gemini-3-flash-preview") was being applied
to the local LlamaCppClient's primary_model and vision_model. This caused
describe_image to send the OpenRouter model name to llama-swap, which
returned 400 because it has no such slot.

Guard the local-client model override with !is_hybrid so it only applies
in local-only mode (where the user is selecting a different local model).
Bump to v1.2.0.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 Cargo.lock                  |   2 +-
 Cargo.toml                  |   2 +-
 src/ai/backend.rs           |  38 +++++++---
 src/ai/insight_chat.rs      |  37 +++++-----
 src/ai/insight_generator.rs | 138 ++++++++++++++++++++++++------------
 src/ai/llamacpp.rs          |  32 +++++++--
 src/ai/mod.rs               |   2 +-
 7 files changed, 172 insertions(+), 79 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 3f6d0c2..5d3e4ce 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2051,7 +2051,7 @@ dependencies = [
 
 [[package]]
 name = "image-api"
-version = "1.1.0"
+version = "1.2.0"
 dependencies = [
  "actix",
  "actix-cors",
diff --git a/Cargo.toml b/Cargo.toml
index 7713970..7324001 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "image-api"
-version = "1.1.0"
+version = "1.2.0"
 authors = ["Cameron Cordes <cameronc.dev@gmail.com>"]
 edition = "2024"
 
diff --git a/src/ai/backend.rs b/src/ai/backend.rs
index 0fe7747..0515f1c 100644
--- a/src/ai/backend.rs
+++ b/src/ai/backend.rs
@@ -13,7 +13,10 @@ impl BackendKind {
         match s.trim().to_lowercase().as_str() {
             "local" | "" => Ok(Self::Local),
             "hybrid" => Ok(Self::Hybrid),
-            other => Err(anyhow!("unknown backend '{}'; expected 'local' or 'hybrid'", other)),
+            other => Err(anyhow!(
+                "unknown backend '{}'; expected 'local' or 'hybrid'",
+                other
+            )),
         }
     }
 
@@ -65,7 +68,12 @@ impl ResolvedBackend {
         kind: BackendKind,
         images_inline: bool,
     ) -> Self {
-        Self { chat, local, kind, images_inline }
+        Self {
+            chat,
+            local,
+            kind,
+            images_inline,
+        }
     }
 
     pub fn chat(&self) -> &dyn LlmClient {
@@ -97,21 +105,35 @@ mod tests {
 
     #[test]
     fn backend_kind_as_str_roundtrips() {
-        assert_eq!(BackendKind::parse(BackendKind::Local.as_str()).unwrap(), BackendKind::Local);
-        assert_eq!(BackendKind::parse(BackendKind::Hybrid.as_str()).unwrap(), BackendKind::Hybrid);
+        assert_eq!(
+            BackendKind::parse(BackendKind::Local.as_str()).unwrap(),
+            BackendKind::Local
+        );
+        assert_eq!(
+            BackendKind::parse(BackendKind::Hybrid.as_str()).unwrap(),
+            BackendKind::Hybrid
+        );
     }
 
     #[test]
     fn sampling_overrides_has_sampling() {
         let empty = SamplingOverrides {
-            model: None, num_ctx: None, temperature: None,
-            top_p: None, top_k: None, min_p: None,
+            model: None,
+            num_ctx: None,
+            temperature: None,
+            top_p: None,
+            top_k: None,
+            min_p: None,
         };
         assert!(!empty.has_sampling());
 
         let with_temp = SamplingOverrides {
-            model: None, num_ctx: Some(4096), temperature: Some(0.7),
-            top_p: None, top_k: None, min_p: None,
+            model: None,
+            num_ctx: Some(4096),
+            temperature: Some(0.7),
+            top_p: None,
+            top_k: None,
+            min_p: None,
         };
         assert!(with_temp.has_sampling());
     }
diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs
index 86671af..9837733 100644
--- a/src/ai/insight_chat.rs
+++ b/src/ai/insight_chat.rs
@@ -308,7 +308,9 @@ impl InsightChatService {
 
         let stored_model = insight.model_version.clone();
         let overrides = SamplingOverrides {
-            model: req.model.clone()
+            model: req
+                .model
+                .clone()
                 .or_else(|| Some(stored_model.clone()))
                 .filter(|m| !m.is_empty()),
             num_ctx: req.num_ctx,
@@ -731,7 +733,9 @@ impl InsightChatService {
 
         let stored_model = insight.model_version.clone();
         let overrides = SamplingOverrides {
-            model: req.model.clone()
+            model: req
+                .model
+                .clone()
                 .or_else(|| Some(stored_model.clone()))
                 .filter(|m| !m.is_empty()),
             num_ctx: req.num_ctx,
@@ -928,17 +932,15 @@ impl InsightChatService {
         // images_inline backends send images directly to the chat model.
         let visual_block = if !backend.images_inline {
             match image_base64.as_deref() {
-                Some(b64) => {
-                    match backend.local().describe_image(b64).await {
-                        Ok(desc) => {
-                            format!("Visual description (from local vision model):\n{}\n", desc)
-                        }
-                        Err(e) => {
-                            log::warn!("{} bootstrap: describe_image failed: {}", kind.as_str(), e);
-                            String::new()
-                        }
+                Some(b64) => match backend.local().describe_image(b64).await {
+                    Ok(desc) => {
+                        format!("Visual description (from local vision model):\n{}\n", desc)
                     }
-                }
+                    Err(e) => {
+                        log::warn!("{} bootstrap: describe_image failed: {}", kind.as_str(), e);
+                        String::new()
+                    }
+                },
                 None => String::new(),
             }
         } else {
@@ -1328,10 +1330,7 @@ fn resolve_bootstrap_backend(supplied: Option<&str>) -> Result<String> {
         .filter(|s| !s.is_empty())
         .unwrap_or_else(|| "local".to_string());
     if !matches!(lower.as_str(), "local" | "hybrid") {
-        bail!(
-            "unknown backend '{}'; expected 'local' or 'hybrid'",
-            lower
-        );
+        bail!("unknown backend '{}'; expected 'local' or 'hybrid'", lower);
     }
     Ok(lower)
 }
@@ -1971,7 +1970,11 @@ mod tests {
         // Both "openrouter" and the former "llamacpp" value are unknown now.
         for label in &["openrouter", "llamacpp"] {
             let err = validate_cross_replay("local", label).unwrap_err();
-            assert!(format!("{}", err).contains("unknown backend"), "label={}", label);
+            assert!(
+                format!("{}", err).contains("unknown backend"),
+                "label={}",
+                label
+            );
         }
     }
 
diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs
index b0cd54c..8e39c59 100644
--- a/src/ai/insight_generator.rs
+++ b/src/ai/insight_generator.rs
@@ -11,9 +11,9 @@ use std::sync::{Arc, Mutex};
 
 use crate::ai::apollo_client::{ApolloClient, ApolloPlace};
 use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides};
+use crate::ai::llamacpp::LlamaCppClient;
 use crate::ai::llm_client::LlmClient;
 use crate::ai::ollama::{ChatMessage, OllamaClient, Tool};
-use crate::ai::llamacpp::LlamaCppClient;
 use crate::ai::openrouter::OpenRouterClient;
 use crate::ai::sms_client::{SmsApiClient, SmsSearchHit, SmsSearchParams};
 use crate::ai::user_display_name;
@@ -35,7 +35,10 @@ pub(crate) fn parse_title_body(raw: &str) -> (String, String) {
     let trimmed = raw.trim();
 
     // Try "Title: <title>\n\n<body>" or "Title: <title>\n<body>"
-    if let Some(rest) = trimmed.strip_prefix("Title:").or_else(|| trimmed.strip_prefix("title:")) {
+    if let Some(rest) = trimmed
+        .strip_prefix("Title:")
+        .or_else(|| trimmed.strip_prefix("title:"))
+    {
         let rest = rest.trim_start();
         if let Some(split_pos) = rest.find("\n\n").or_else(|| rest.find('\n')) {
             let title = rest[..split_pos].trim();
@@ -1644,7 +1647,10 @@ Return ONLY the summary, nothing else."#,
             "get_location_history" => self.tool_get_location_history(arguments, cx).await,
             "get_file_tags" => self.tool_get_file_tags(arguments, cx).await,
             "get_faces_in_photo" => self.tool_get_faces_in_photo(arguments, cx).await,
-            "describe_photo" => self.tool_describe_photo(backend.local(), image_base64).await,
+            "describe_photo" => {
+                self.tool_describe_photo(backend.local(), image_base64)
+                    .await
+            }
             "reverse_geocode" => self.tool_reverse_geocode(arguments).await,
             "get_personal_place_at" => self.tool_get_personal_place_at(arguments).await,
             "recall_entities" => self.tool_recall_entities(arguments, cx).await,
@@ -1655,7 +1661,13 @@ Return ONLY the summary, nothing else."#,
             "store_entity" => self.tool_store_entity(arguments, cx).await,
             "store_fact" => {
                 self.tool_store_fact(
-                    arguments, file_path, user_id, persona_id, model, backend_label, cx,
+                    arguments,
+                    file_path,
+                    user_id,
+                    persona_id,
+                    model,
+                    backend_label,
+                    cx,
                 )
                 .await
             }
@@ -1810,9 +1822,8 @@ Return ONLY the summary, nothing else."#,
         );
 
         let started = std::time::Instant::now();
-        let system = Some(
-            "You are a terse relevance ranker. You output only numbers separated by commas.",
-        );
+        let system =
+            Some("You are a terse relevance ranker. You output only numbers separated by commas.");
         let response = local.generate(&prompt, system, None).await?;
         log::info!(
             "rerank: finished in {} ms (prompt={} chars)",
@@ -1960,7 +1971,8 @@ Return ONLY the summary, nothing else."#,
             .unwrap_or(20)
             .clamp(1, 50) as usize;
         let contact_id = args.get("contact_id").and_then(|v| v.as_i64());
-        let contact = args.get("contact")
+        let contact = args
+            .get("contact")
             .and_then(|v| v.as_str())
             .map(|s| s.trim().to_string())
             .filter(|s| !s.is_empty())
@@ -2710,22 +2722,17 @@ Return ONLY the summary, nothing else."#,
         // Generate embedding for name + description (best-effort) via the
         // configured local backend.
         let embed_text = format!("{} {}", name, description);
-        let embedding: Option<Vec<u8>> = match crate::ai::embed_one(
-            &self.ollama,
-            self.llamacpp.as_deref(),
-            &embed_text,
-        )
-        .await
-        {
-            Ok(vec) => {
-                let bytes: Vec<u8> = vec.iter().flat_map(|f| f.to_le_bytes()).collect();
-                Some(bytes)
-            }
-            Err(e) => {
-                log::warn!("Embedding generation failed for entity '{}': {}", name, e);
-                None
-            }
-        };
+        let embedding: Option<Vec<u8>> =
+            match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &embed_text).await {
+                Ok(vec) => {
+                    let bytes: Vec<u8> = vec.iter().flat_map(|f| f.to_le_bytes()).collect();
+                    Some(bytes)
+                }
+                Err(e) => {
+                    log::warn!("Embedding generation failed for entity '{}': {}", name, e);
+                    None
+                }
+            };
 
         let now = chrono::Utc::now().timestamp();
         let insert = InsertEntity {
@@ -3606,8 +3613,7 @@ Return ONLY the summary, nothing else."#,
         kind: BackendKind,
         overrides: &SamplingOverrides,
     ) -> Result<ResolvedBackend> {
-        let local_via_llamacpp =
-            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
+        let local_via_llamacpp = crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
         let is_hybrid = kind == BackendKind::Hybrid;
 
         // ── chat client ────────────────────────────────────────────────
@@ -3680,12 +3686,15 @@ Return ONLY the summary, nothing else."#,
         };
 
         // ── local client (utility calls: rerank, describe_image, etc.) ─
-        // For llamacpp: mirror the chat model selection so rerank /
-        // describe_image hit the same model that's already loaded —
-        // avoids a mid-turn model swap in llama-swap exclusive mode.
+        // For llamacpp in local mode: mirror the chat model selection so
+        // rerank / describe_image hit the same model that's already
+        // loaded — avoids a mid-turn model swap in llama-swap exclusive
+        // mode. In hybrid mode the override is an OpenRouter model id
+        // (e.g. "google/gemini-3-flash-preview") which llama-swap can't
+        // serve — keep the configured local slots.
         let local: Box<dyn LlmClient> = if local_via_llamacpp {
             let mut lc = self.llamacpp.as_ref().unwrap().as_ref().clone();
-            if let Some(ref m) = overrides.model {
+            if !is_hybrid && let Some(ref m) = overrides.model {
                 lc.primary_model = m.clone();
                 lc.set_vision_model(m.clone());
             }
@@ -3713,14 +3722,14 @@ Return ONLY the summary, nothing else."#,
                         .await
                         .unwrap_or(false);
 
-                let available_on_fallback =
-                    if let Some(ref fallback_url) = self.ollama.fallback_url {
-                        OllamaClient::is_model_available(fallback_url, model)
-                            .await
-                            .unwrap_or(false)
-                    } else {
-                        false
-                    };
+                let available_on_fallback = if let Some(ref fallback_url) = self.ollama.fallback_url
+                {
+                    OllamaClient::is_model_available(fallback_url, model)
+                        .await
+                        .unwrap_or(false)
+                } else {
+                    false
+                };
 
                 if !available_on_primary && !available_on_fallback {
                     anyhow::bail!(
@@ -3761,10 +3770,7 @@ Return ONLY the summary, nothing else."#,
             };
 
             if !capabilities.has_tool_calling {
-                anyhow::bail!(
-                    "tool calling not supported by model '{}'",
-                    ollama_for_caps
-                );
+                anyhow::bail!("tool calling not supported by model '{}'", ollama_for_caps);
             }
 
             capabilities.has_vision
@@ -3801,9 +3807,7 @@ Return ONLY the summary, nothing else."#,
         span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64));
 
         // 1. Resolve backend + build clients.
-        let kind = BackendKind::parse(
-            backend.as_deref().unwrap_or("local"),
-        )?;
+        let kind = BackendKind::parse(backend.as_deref().unwrap_or("local"))?;
         span.set_attribute(KeyValue::new("backend", kind.as_str()));
         let overrides = SamplingOverrides {
             model: custom_model,
@@ -3933,7 +3937,11 @@ Return ONLY the summary, nothing else."#,
                         Some(desc)
                     }
                     Err(e) => {
-                        log::warn!("{}: vision describe failed, continuing without: {}", kind, e);
+                        log::warn!(
+                            "{}: vision describe failed, continuing without: {}",
+                            kind,
+                            e
+                        );
                         None
                     }
                 },
@@ -4813,4 +4821,42 @@ mod tests {
         assert!(t.len() <= 60);
         assert_eq!(b, input);
     }
+
+    /// Regression: hybrid mode was leaking the OpenRouter model override
+    /// into the local llamacpp client, causing describe_image to send
+    /// e.g. "google/gemini-3-flash-preview" to llama-swap (which 400s).
+    #[test]
+    fn resolve_backend_hybrid_does_not_leak_model_to_local_llamacpp() {
+        use crate::ai::llamacpp::LlamaCppClient;
+
+        let mut base =
+            LlamaCppClient::new(Some("http://localhost:9292/v1".into()), Some("chat".into()));
+        base.set_vision_model("vision".into());
+        base.set_embedding_model("embed".into());
+
+        let openrouter_model = "google/gemini-3-flash-preview";
+        let overrides_model: Option<String> = Some(openrouter_model.into());
+        let is_hybrid = true;
+
+        // Replicate the resolve_backend local-client construction
+        // (lines ~3686-3695 of this file).
+        let mut lc = base.clone();
+        if let Some(ref m) = overrides_model {
+            if !is_hybrid {
+                lc.primary_model = m.clone();
+                lc.set_vision_model(m.clone());
+            }
+        }
+
+        // In hybrid mode the local client must keep its configured slots.
+        assert_eq!(
+            lc.vision_model, "vision",
+            "hybrid mode must not override vision_model with OpenRouter model"
+        );
+        assert_eq!(
+            lc.primary_model, "chat",
+            "hybrid mode must not override primary_model with OpenRouter model"
+        );
+        assert_eq!(lc.embedding_model, "embed");
+    }
 }
diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs
index 8ea1063..e2ba00d 100644
--- a/src/ai/llamacpp.rs
+++ b/src/ai/llamacpp.rs
@@ -409,10 +409,7 @@ impl LlmClient for LlamaCppClient {
             tools.len()
         );
         let mut body = serde_json::Map::new();
-        body.insert(
-            "model".into(),
-            Value::String(self.primary_model.clone()),
-        );
+        body.insert("model".into(), Value::String(self.primary_model.clone()));
         body.insert(
             "messages".into(),
             Value::Array(Self::messages_to_openai(&messages)),
@@ -1071,6 +1068,28 @@ mod tests {
         assert_eq!(local.embedding_model, "embed");
     }
 
+    #[test]
+    fn hybrid_mode_local_client_preserves_vision_model() {
+        // In hybrid mode, overrides.model is an OpenRouter model id
+        // (e.g. "google/gemini-3-flash-preview"). The local llamacpp
+        // client must NOT adopt that as its vision_model — it should
+        // keep the configured LLAMA_SWAP_VISION_MODEL so describe_image
+        // hits the correct local slot instead of sending an unknown
+        // model name to llama-swap.
+        let mut base = LlamaCppClient::new(None, Some("chat".into()));
+        base.set_vision_model("vision".into());
+        base.set_embedding_model("embed".into());
+
+        // Simulate what resolve_backend SHOULD do for hybrid mode:
+        // clone but do NOT override primary_model / vision_model.
+        let local = base.clone();
+
+        // The local client keeps its configured slots.
+        assert_eq!(local.primary_model, "chat");
+        assert_eq!(local.vision_model, "vision");
+        assert_eq!(local.embedding_model, "embed");
+    }
+
     #[test]
     fn assistant_tool_calls_emit_null_content() {
         let msg = ChatMessage {
@@ -1086,7 +1105,10 @@ mod tests {
             images: None,
         };
         let wire = LlamaCppClient::messages_to_openai(&[msg]);
-        assert!(wire[0]["content"].is_null(), "empty content + tool_calls should emit null");
+        assert!(
+            wire[0]["content"].is_null(),
+            "empty content + tool_calls should emit null"
+        );
     }
 
     #[test]
diff --git a/src/ai/mod.rs b/src/ai/mod.rs
index 09cbfda..c991c71 100644
--- a/src/ai/mod.rs
+++ b/src/ai/mod.rs
@@ -25,11 +25,11 @@ pub use handlers::{
     get_insight_handler, get_openrouter_models_handler, rate_insight_handler,
 };
 pub use insight_generator::InsightGenerator;
+pub use llamacpp::LlamaCppClient;
 #[allow(unused_imports)]
 pub use llm_client::{
     ChatMessage, LlmClient, ModelCapabilities, Tool, ToolCall, ToolCallFunction, ToolFunction,
 };
-pub use llamacpp::LlamaCppClient;
 pub use ollama::{EMBEDDING_MODEL, OllamaClient};
 pub use sms_client::{SmsApiClient, SmsMessage};
 
-- 
2.49.1