From f0927f535510ccfecf91c1896041b93c2c156603 Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Wed, 20 May 2026 17:52:33 -0400
Subject: [PATCH] ai: add llamacpp backend (llama-swap) as third LLM client

Wires a new LlamaCppClient (OpenAI-compatible /v1 wire format) alongside
OllamaClient and OpenRouterClient. Per-slot routing for chat/vision/embed
via env (LLAMA_SWAP_URL + *_MODEL vars); capability inference uses an
env allowlist since /v1/models doesn't report modality.

InsightGenerator + InsightChatService gain three-way dispatch on
chat_backend = "local" | "hybrid" | "llamacpp". Hybrid and llamacpp
share the describe-then-inline path (text-only chat after a separate
vision describe). HYBRID_VISION_BACKEND=llamacpp lets hybrid route its
describe pass through llama-swap's vision slot while chat still goes
to OpenRouter.

Cross-replay matrix added (validate_cross_replay): local<->llamacpp
and hybrid->llamacpp allowed; local->hybrid and llamacpp->hybrid
rejected. New /insights/llamacpp/models handler mirrors the OpenRouter
shape.
---
 CLAUDE.md                     |  49 +-
 src/ai/handlers.rs            |  30 ++
 src/ai/insight_chat.rs        | 266 ++++++---
 src/ai/insight_generator.rs   | 171 ++++--
 src/ai/llamacpp.rs            | 978 ++++++++++++++++++++++++++++++++++
 src/ai/mod.rs                 |   4 +-
 src/bin/populate_knowledge.rs |   1 +
 src/main.rs                   |   1 +
 src/state.rs                  |  70 +++
 9 files changed, 1468 insertions(+), 102 deletions(-)
 create mode 100644 src/ai/llamacpp.rs

diff --git a/CLAUDE.md b/CLAUDE.md
index 7e605cc..d3419e6 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -475,6 +475,7 @@ POST /insights/generate/agentic      (tool-calling loop; body: { file_path, back
 GET  /insights?path=...&library=...
 GET  /insights/models                (local Ollama models + capabilities)
 GET  /insights/openrouter/models     (curated OpenRouter allowlist)
+GET  /insights/llamacpp/models       (curated llama-swap slot allowlist)
 POST /insights/rate                  (thumbs up/down for training data)
 
 // Insight Chat Continuation
@@ -631,6 +632,23 @@ OPENROUTER_EMBEDDING_MODEL=openai/text-embedding-3-small  # Optional, embeddings
 OPENROUTER_HTTP_REFERER=https://your-site.example    # Optional attribution header
 OPENROUTER_APP_TITLE=ImageApi                  # Optional attribution header
 
+# llama.cpp / llama-swap (Llamacpp Backend) - sibling to Ollama; OpenAI-compatible
+# proxy hosting one or more llama-server processes (chat / vision / embed slots).
+LLAMA_SWAP_URL=http://localhost:9292/v1         # Required to enable llamacpp backend
+LLAMA_SWAP_PRIMARY_MODEL=chat                   # Chat slot id (matches config.yaml)
+LLAMA_SWAP_VISION_MODEL=vision                  # Vision slot id; describe_image routes here
+LLAMA_SWAP_EMBEDDING_MODEL=embed                # Embedding slot id (when local embeddings via llamacpp)
+LLAMA_SWAP_VISION_MODELS=qwen-vl,llava          # Comma-separated slot ids known to have vision.
+                                                # Drives `has_vision` in /insights/llamacpp/models.
+                                                # `LLAMA_SWAP_VISION_MODEL` is auto-included.
+LLAMA_SWAP_ALLOWED_MODELS=chat,coder            # Curated allowlist exposed to clients via
+                                                # GET /insights/llamacpp/models. Empty = no picker.
+LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180          # Per-request timeout; bump for slow CPU offload
+HYBRID_VISION_BACKEND=llamacpp                  # Optional override for hybrid mode's describe_image:
+                                                # `ollama` (default) or `llamacpp`. When `llamacpp`,
+                                                # hybrid still routes chat to OpenRouter but uses
+                                                # llama-swap's vision slot to describe images.
+
 # Insight Chat Continuation
 AGENTIC_CHAT_MAX_ITERATIONS=6                  # Cap on tool-calling iterations per chat turn (default 6)
 ```
@@ -652,8 +670,11 @@ This allows runtime verification of model availability before generating insight
 
 **Hybrid Backend (OpenRouter):**
 - Per-request opt-in via `backend=hybrid` on `POST /insights/generate/agentic`.
-- Local Ollama still describes the image (vision); the description is inlined
-  into the chat prompt and the agentic loop runs on OpenRouter.
+- Vision describe happens before the agentic loop; the description is inlined
+  into the chat prompt and the agentic loop runs on OpenRouter. By default
+  vision uses local Ollama, but `HYBRID_VISION_BACKEND=llamacpp` flips it to
+  llama-swap's vision slot (useful when you want chat on a frontier model and
+  vision on a local-but-not-Ollama path).
 - `request.model` (if provided) overrides `OPENROUTER_DEFAULT_MODEL` for that
   call. The mobile picker reads from `OPENROUTER_ALLOWED_MODELS`.
 - No live capability precheck — the operator-curated allowlist is trusted.
@@ -661,6 +682,30 @@ This allows runtime verification of model availability before generating insight
 - `GET /insights/openrouter/models` returns `{ models, default_model, configured }`
   for client picker UIs.
 
+**Llamacpp Backend (llama-swap):**
+- Per-request opt-in via `backend=llamacpp` on `POST /insights/generate/agentic`.
+- Sibling to Ollama: a local OpenAI-compatible proxy (mostlygeek/llama-swap)
+  fronting one or more `llama-server` processes. The chat slot is text-only
+  by default; vision and embeddings have their own slots (`LLAMA_SWAP_VISION_MODEL`,
+  `LLAMA_SWAP_EMBEDDING_MODEL`) that llama-swap routes to by model id. The
+  bundled `docker-compose.yml` + `llama-swap/config.yaml` in the opencode root
+  is the reference deploy.
+- Operates in the same describe-then-inline shape as hybrid: on generation
+  and bootstrap the chat model never sees raw images. Vision describe routes
+  through llama-swap's vision slot (`describe_image` on `LlamaCppClient`).
+- `request.model` (if provided) overrides `LLAMA_SWAP_PRIMARY_MODEL` for that
+  call (must match a slot id in llama-swap's `config.yaml`). The mobile picker
+  reads from `LLAMA_SWAP_ALLOWED_MODELS`.
+- No live capability precheck — slot ids are trusted. Tool calling is assumed
+  for every slot (llama-swap entries typically launch with `--jinja`).
+- `GET /insights/llamacpp/models` returns `{ models, default_model, configured }`.
+- Cross-replay matrix (chat continuation): `local ↔ llamacpp` allowed (the
+  LlamaCppClient passes images through to the chat slot — you're responsible
+  for a vision-capable slot if the stored transcript carries images);
+  `hybrid → llamacpp` allowed (both transcripts are text-only);
+  `local → hybrid` and `llamacpp → hybrid` rejected (mid-conversation
+  description source change isn't supported).
+
 **Insight Chat Continuation:**
 
 After an agentic insight is generated, the full `Vec<ChatMessage>` transcript is
diff --git a/src/ai/handlers.rs b/src/ai/handlers.rs
index 0e46057..4809b25 100644
--- a/src/ai/handlers.rs
+++ b/src/ai/handlers.rs
@@ -549,6 +549,36 @@ pub async fn get_openrouter_models_handler(
     HttpResponse::Ok().json(response)
 }
 
+#[derive(serde::Serialize)]
+pub struct LlamaCppModelsResponse {
+    pub models: Vec<String>, // curated model ids offered to client pickers
+    pub default_model: Option<String>, // client's primary model; `None` when no client exists
+    pub configured: bool, // true when app state holds a llamacpp client
+}
+
+/// GET /insights/llamacpp/models - Curated llama-swap model ids exposed
+/// to clients for the llamacpp backend. `models` is copied verbatim from
+/// app state (documented as `LLAMA_SWAP_ALLOWED_MODELS`); no live call to
+/// llama-swap is made. `configured` and `default_model` reflect whether a
+/// llamacpp client exists and, if so, its `primary_model`.
+#[get("/insights/llamacpp/models")]
+pub async fn get_llamacpp_models_handler(
+    _claims: Claims, // extracted but unused; presumably gates the route on auth — confirm
+    app_state: web::Data<crate::state::AppState>,
+) -> impl Responder {
+    let configured = app_state.llamacpp.is_some();
+    let default_model = app_state
+        .llamacpp
+        .as_ref()
+        .map(|c| c.primary_model.clone());
+    let response = LlamaCppModelsResponse {
+        models: app_state.llamacpp_allowed_models.clone(),
+        default_model,
+        configured,
+    };
+    HttpResponse::Ok().json(response)
+}
+
 /// POST /insights/rate - Rate an insight (thumbs up/down for training data)
 #[post("/insights/rate")]
 pub async fn rate_insight_handler(
diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs
index b2a7af8..4e87d52 100644
--- a/src/ai/insight_chat.rs
+++ b/src/ai/insight_chat.rs
@@ -9,6 +9,7 @@ use tokio::sync::Mutex as TokioMutex;
 use crate::ai::insight_generator::InsightGenerator;
 use crate::ai::llm_client::{ChatMessage, LlmClient, LlmStreamEvent, Tool};
 use crate::ai::ollama::OllamaClient;
+use crate::ai::llamacpp::LlamaCppClient;
 use crate::ai::openrouter::OpenRouterClient;
 use crate::database::InsightDao;
 use crate::database::models::InsertPhotoInsight;
@@ -93,6 +94,7 @@ pub struct InsightChatService {
     generator: Arc<InsightGenerator>,
     ollama: OllamaClient,
     openrouter: Option<Arc<OpenRouterClient>>,
+    llamacpp: Option<Arc<LlamaCppClient>>,
     insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
     chat_locks: ChatLockMap,
 }
@@ -102,6 +104,7 @@ impl InsightChatService {
         generator: Arc<InsightGenerator>,
         ollama: OllamaClient,
         openrouter: Option<Arc<OpenRouterClient>>,
+        llamacpp: Option<Arc<LlamaCppClient>>,
         insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
         chat_locks: ChatLockMap,
     ) -> Self {
@@ -109,6 +112,7 @@ impl InsightChatService {
             generator,
             ollama,
             openrouter,
+            llamacpp,
             insight_dao,
             chat_locks,
         }
@@ -303,23 +307,15 @@ impl InsightChatService {
             .map(|s| s.trim().to_lowercase())
             .filter(|s| !s.is_empty())
             .unwrap_or_else(|| stored_backend.clone());
-        if !matches!(effective_backend.as_str(), "local" | "hybrid") {
-            bail!(
-                "unknown backend '{}'; expected 'local' or 'hybrid'",
-                effective_backend
-            );
-        }
-        if stored_backend == "local" && effective_backend == "hybrid" {
-            bail!(
-                "switching from local to hybrid mid-chat isn't supported yet; \
-                 regenerate the insight in hybrid mode if you want OpenRouter chat"
-            );
-        }
+        validate_cross_replay(&stored_backend, &effective_backend)?;
         let is_hybrid = effective_backend == "hybrid";
+        let is_llamacpp = effective_backend == "llamacpp";
+        let describes_then_inlines = is_hybrid || is_llamacpp;
         span.set_attribute(KeyValue::new("backend", effective_backend.clone()));
 
         // 4. Build the chat backend client. Ollama in local mode, a freshly
-        //    cloned OpenRouter client in hybrid mode (clone so per-request
+        //    cloned OpenRouter client in hybrid mode, a freshly cloned
+        //    LlamaCppClient in llamacpp mode (clone so per-request
         //    sampling/model overrides don't leak into shared state).
         let max_iterations = req
             .max_iterations
@@ -336,6 +332,7 @@ impl InsightChatService {
 
         let mut ollama_client = self.ollama.clone();
         let mut openrouter_client: Option<OpenRouterClient> = None;
+        let mut llamacpp_client: Option<LlamaCppClient> = None;
 
         if is_hybrid {
             let arc = self.openrouter.as_ref().ok_or_else(|| {
@@ -356,6 +353,25 @@ impl InsightChatService {
                 c.set_num_ctx(Some(ctx));
             }
             openrouter_client = Some(c);
+        } else if is_llamacpp {
+            let arc = self.llamacpp.as_ref().ok_or_else(|| {
+                anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
+            })?;
+            let mut c: LlamaCppClient = (**arc).clone();
+            if let Some(ref m) = custom_model {
+                c.primary_model = m.clone();
+            }
+            if req.temperature.is_some()
+                || req.top_p.is_some()
+                || req.top_k.is_some()
+                || req.min_p.is_some()
+            {
+                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
+            }
+            if let Some(ctx) = req.num_ctx {
+                c.set_num_ctx(Some(ctx));
+            }
+            llamacpp_client = Some(c);
         } else {
             // Local-mode model swap. Build a new client when the chat model
             // differs from the configured one (mirrors the agentic pattern).
@@ -381,7 +397,9 @@ impl InsightChatService {
             }
         }
 
-        let chat_backend: &dyn LlmClient = if let Some(ref c) = openrouter_client {
+        let chat_backend: &dyn LlmClient = if let Some(ref c) = llamacpp_client {
+            c
+        } else if let Some(ref c) = openrouter_client {
             c
         } else {
             &ollama_client
@@ -389,18 +407,19 @@ impl InsightChatService {
         let model_used = chat_backend.primary_model().to_string();
         span.set_attribute(KeyValue::new("model", model_used.clone()));
 
-        // 5. Decide vision + tool set. In hybrid we always omit
-        //    `describe_photo` (matches the original generation flow). In
-        //    local we trust the stored history's first-user shape: if it
-        //    carries `images`, the original model was vision-capable, and
-        //    we keep `describe_photo` available.
+        // 5. Decide vision + tool set. In describe-then-inline modes
+        //    (hybrid, llamacpp) we always omit `describe_photo` (matches the
+        //    original generation flow). In local we trust the stored
+        //    history's first-user shape: if it carries `images`, the
+        //    original model was vision-capable, and we keep `describe_photo`
+        //    available.
         let local_first_user_has_image = messages
             .iter()
             .find(|m| m.role == "user")
             .and_then(|m| m.images.as_ref())
             .map(|imgs| !imgs.is_empty())
             .unwrap_or(false);
-        let offer_describe_tool = !is_hybrid && local_first_user_has_image;
+        let offer_describe_tool = !describes_then_inlines && local_first_user_has_image;
         // current_gate_opts(has_vision) sets gate_opts.has_vision = has_vision
         // and probes the per-table presence flags. Pass `offer_describe_tool`
         // directly — the `!is_hybrid && local_first_user_has_image` decision
@@ -799,19 +818,10 @@ impl InsightChatService {
             .map(|s| s.trim().to_lowercase())
             .filter(|s| !s.is_empty())
             .unwrap_or_else(|| stored_backend.clone());
-        if !matches!(effective_backend.as_str(), "local" | "hybrid") {
-            bail!(
-                "unknown backend '{}'; expected 'local' or 'hybrid'",
-                effective_backend
-            );
-        }
-        if stored_backend == "local" && effective_backend == "hybrid" {
-            bail!(
-                "switching from local to hybrid mid-chat isn't supported yet; \
-                 regenerate the insight in hybrid mode if you want OpenRouter chat"
-            );
-        }
+        validate_cross_replay(&stored_backend, &effective_backend)?;
         let is_hybrid = effective_backend == "hybrid";
+        let is_llamacpp = effective_backend == "llamacpp";
+        let describes_then_inlines = is_hybrid || is_llamacpp;
 
         let max_iterations = req
             .max_iterations
@@ -826,20 +836,21 @@ impl InsightChatService {
             .filter(|m| !m.is_empty());
 
         let (chat_backend_holder, ollama_client) =
-            self.build_chat_clients(is_hybrid, custom_model.as_deref(), &req)?;
+            self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?;
         let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
         let model_used = chat_backend.primary_model().to_string();
 
         // Tool set — local mode + first user turn carries an image →
-        // offer describe_photo. Hybrid: visual description was inlined
-        // when the insight was bootstrapped, no describe tool needed.
+        // offer describe_photo. Describe-then-inline modes (hybrid /
+        // llamacpp): visual description was inlined when the insight was
+        // bootstrapped, no describe tool needed.
         let local_first_user_has_image = messages
             .iter()
             .find(|m| m.role == "user")
             .and_then(|m| m.images.as_ref())
             .map(|imgs| !imgs.is_empty())
             .unwrap_or(false);
-        let offer_describe_tool = !is_hybrid && local_first_user_has_image;
+        let offer_describe_tool = !describes_then_inlines && local_first_user_has_image;
         let gate_opts = self.generator.current_gate_opts_for_persona(
             offer_describe_tool,
             Some((req.user_id, &active_persona)),
@@ -976,6 +987,8 @@ impl InsightChatService {
             .unwrap_or_else(|| "default".to_string());
         let effective_backend = resolve_bootstrap_backend(req.backend.as_deref())?;
         let is_hybrid = effective_backend == "hybrid";
+        let is_llamacpp = effective_backend == "llamacpp";
+        let describes_then_inlines = is_hybrid || is_llamacpp;
 
         let max_iterations = req
             .max_iterations
@@ -984,7 +997,7 @@ impl InsightChatService {
 
         let custom_model = req.model.clone().filter(|m| !m.is_empty());
         let (chat_backend_holder, ollama_client) =
-            self.build_chat_clients(is_hybrid, custom_model.as_deref(), &req)?;
+            self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?;
         let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
         let model_used = chat_backend.primary_model().to_string();
 
@@ -1007,21 +1020,48 @@ impl InsightChatService {
                 _ => None,
             });
 
-        // Hybrid backend: pre-describe the image via local Ollama vision
-        // so OpenRouter chat models (which can't see images directly) get
-        // the visual description as text. Mirrors the same pre-describe
-        // pass that `generate_agentic_insight_for_photo` does for hybrid.
-        let visual_block = if is_hybrid {
+        // Describe-then-inline backends (hybrid, llamacpp): pre-describe the
+        // image so a text-only chat model gets the visual description inline.
+        // Vision source: llamacpp's vision slot in llamacpp mode; in hybrid
+        // mode Ollama by default, llamacpp via `HYBRID_VISION_BACKEND=llamacpp`.
+        let visual_block = if describes_then_inlines {
             match image_base64.as_deref() {
-                Some(b64) => match self.ollama.describe_image(b64).await {
-                    Ok(desc) => {
-                        format!("Visual description (from local vision model):\n{}\n", desc)
+                Some(b64) => {
+                    let use_llamacpp_vision = if is_llamacpp {
+                        true
+                    } else {
+                        matches!(
+                            std::env::var("HYBRID_VISION_BACKEND")
+                                .ok()
+                                .as_deref()
+                                .map(|s| s.trim().to_lowercase())
+                                .as_deref(),
+                            Some("llamacpp")
+                        )
+                    };
+                    let described = if use_llamacpp_vision {
+                        match self.llamacpp.as_ref() {
+                            Some(c) => c.describe_image(b64).await,
+                            None => {
+                                log::warn!(
+                                    "bootstrap: requested llamacpp vision but LLAMA_SWAP_URL unset; falling back to Ollama"
+                                );
+                                self.ollama.describe_image(b64).await
+                            }
+                        }
+                    } else {
+                        self.ollama.describe_image(b64).await
+                    };
+                    match described {
+                        Ok(desc) => {
+                            format!("Visual description (from local vision model):\n{}\n", desc)
+                        }
+                        Err(e) => {
+                            log::warn!("{} bootstrap: describe_image failed: {}", effective_backend, e);
+                            String::new()
+                        }
                     }
-                    Err(e) => {
-                        log::warn!("hybrid bootstrap: local describe_image failed: {}", e);
-                        String::new()
-                    }
-                },
+                }
                 None => String::new(),
             }
         } else {
@@ -1031,7 +1071,7 @@ impl InsightChatService {
         // Tool gates. Local + image present → expose describe_photo so
         // the chat model can re-look at the photo on demand. Hybrid:
         // already inlined, no tool needed.
-        let offer_describe_tool = !is_hybrid && image_base64.is_some();
+        let offer_describe_tool = !describes_then_inlines && image_base64.is_some();
         let gate_opts = self.generator.current_gate_opts_for_persona(
             offer_describe_tool,
             Some((req.user_id, &active_persona)),
@@ -1057,7 +1097,7 @@ impl InsightChatService {
         );
         let system_msg = ChatMessage::system(system_content);
         let mut user_msg = ChatMessage::user(req.user_message.clone());
-        if !is_hybrid && let Some(ref img) = image_base64 {
+        if !describes_then_inlines && let Some(ref img) = image_base64 {
             user_msg.images = Some(vec![img.clone()]);
         }
         let mut messages = vec![system_msg, user_msg];
@@ -1130,19 +1170,22 @@ impl InsightChatService {
         Ok(())
     }
 
-    /// Set up chat clients (Ollama + optional OpenRouter) shared by
-    /// bootstrap and continuation. Returns the chat-side backend client
-    /// (boxed because hybrid and local return different concrete types)
-    /// and the Ollama client used for describe-image / local tool calls.
+    /// Set up chat clients (Ollama + optional OpenRouter / LlamaCpp) shared
+    /// by bootstrap and continuation. Returns the chat-side backend client
+    /// (boxed because each backend has a different concrete type) and the
+    /// Ollama client used for describe-image / local tool calls.
+    ///
+    /// `effective_backend` must be one of `"local"`, `"hybrid"`, `"llamacpp"`
+    /// (validated upstream).
     fn build_chat_clients(
         &self,
-        is_hybrid: bool,
+        effective_backend: &str,
         custom_model: Option<&str>,
         req: &ChatTurnRequest,
     ) -> Result<(Box<dyn LlmClient>, OllamaClient)> {
         let mut ollama_client = self.ollama.clone();
 
-        if is_hybrid {
+        if effective_backend == "hybrid" {
             let arc = self.openrouter.as_ref().ok_or_else(|| {
                 anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
             })?;
@@ -1163,6 +1206,27 @@ impl InsightChatService {
             return Ok((Box::new(c), ollama_client));
         }
 
+        if effective_backend == "llamacpp" {
+            let arc = self.llamacpp.as_ref().ok_or_else(|| {
+                anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
+            })?;
+            let mut c: LlamaCppClient = (**arc).clone();
+            if let Some(m) = custom_model {
+                c.primary_model = m.to_string();
+            }
+            if req.temperature.is_some()
+                || req.top_p.is_some()
+                || req.top_k.is_some()
+                || req.min_p.is_some()
+            {
+                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
+            }
+            if let Some(ctx) = req.num_ctx {
+                c.set_num_ctx(Some(ctx));
+            }
+            return Ok((Box::new(c), ollama_client));
+        }
+
         if let Some(m) = custom_model
             && m != self.ollama.primary_model
         {
@@ -1459,6 +1523,49 @@ fn resolve_date_taken_for_context(
         .map(|dt| dt.format("%Y-%m-%d").to_string())
 }
 
+/// Validate a stored→effective backend transition for a chat continuation.
+/// Continuation replays a transcript that was generated with a specific
+/// backend; some transitions break the conversation shape. Errors when
+/// `effective` is not `local`, `hybrid`, or `llamacpp`, or for one of:
+///
+/// - `local → hybrid` — the stored transcript has images embedded in the
+///   first user message; the openrouter chat client surfaces them through
+///   the wire, but models routed via the hybrid path may not accept that
+///   shape consistently across providers. Reject to keep the
+///   `regenerate-in-hybrid-mode` workflow as the supported answer.
+/// - `llamacpp → hybrid` — the stored transcript already has an inlined
+///   visual description produced by llama-swap's vision slot. Switching
+///   to hybrid mid-conversation could mix description sources across
+///   subsequent turns (hybrid describes with Ollama unless
+///   `HYBRID_VISION_BACKEND=llamacpp` is set). Reject for consistency.
+///
+/// Every other transition is allowed, and `stored` itself is not checked
+/// against the known labels. `local ↔ llamacpp` works because
+/// LlamaCppClient passes image content-parts through to the chat slot —
+/// the caller must pick a vision-capable chat model in that case.
+/// `hybrid → llamacpp` works because both transcripts are text-only.
+fn validate_cross_replay(stored: &str, effective: &str) -> Result<()> {
+    if !matches!(effective, "local" | "hybrid" | "llamacpp") {
+        bail!(
+            "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
+            effective
+        );
+    }
+    if stored == "local" && effective == "hybrid" {
+        bail!(
+            "switching from local to hybrid mid-chat isn't supported yet; \
+             regenerate the insight in hybrid mode if you want OpenRouter chat"
+        );
+    }
+    if stored == "llamacpp" && effective == "hybrid" {
+        bail!(
+            "switching from llamacpp to hybrid mid-chat isn't supported yet; \
+             regenerate the insight in hybrid mode if you want OpenRouter chat"
+        );
+    }
+    Ok(())
+}
+
 /// Pick the backend label for bootstrap. Bootstrap has no stored insight
 /// to defer to (that's continuation's behaviour), so the default is
 /// `"local"`. Returns an error if the supplied label is non-empty but
@@ -1469,8 +1576,11 @@ fn resolve_bootstrap_backend(supplied: Option<&str>) -> Result<String> {
         .map(|s| s.trim().to_lowercase())
         .filter(|s| !s.is_empty())
         .unwrap_or_else(|| "local".to_string());
-    if !matches!(lower.as_str(), "local" | "hybrid") {
-        bail!("unknown backend '{}'; expected 'local' or 'hybrid'", lower);
+    if !matches!(lower.as_str(), "local" | "hybrid" | "llamacpp") {
+        bail!(
+            "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
+            lower
+        );
     }
     Ok(lower)
 }
@@ -2074,6 +2184,10 @@ mod tests {
     fn bootstrap_backend_accepts_local_and_hybrid_case_insensitively() {
         assert_eq!(resolve_bootstrap_backend(Some("LOCAL")).unwrap(), "local");
         assert_eq!(resolve_bootstrap_backend(Some("Hybrid")).unwrap(), "hybrid");
+        assert_eq!(
+            resolve_bootstrap_backend(Some("Llamacpp")).unwrap(),
+            "llamacpp"
+        );
         assert_eq!(
             resolve_bootstrap_backend(Some("  local  ")).unwrap(),
             "local"
@@ -2088,6 +2202,38 @@ mod tests {
         assert!(msg.contains("openrouter"));
     }
 
+    #[test]
+    fn cross_replay_rejects_local_to_hybrid() {
+        let err = validate_cross_replay("local", "hybrid").unwrap_err();
+        assert!(format!("{}", err).contains("local to hybrid")); // message names the transition
+    }
+
+    #[test]
+    fn cross_replay_rejects_llamacpp_to_hybrid() {
+        let err = validate_cross_replay("llamacpp", "hybrid").unwrap_err();
+        assert!(format!("{}", err).contains("llamacpp to hybrid")); // message names the transition
+    }
+
+    #[test]
+    fn cross_replay_allows_local_llamacpp_and_hybrid_llamacpp_transitions() {
+        // Local ↔ llamacpp: user is responsible for picking a vision-capable
+        // chat slot when the transcript has images.
+        assert!(validate_cross_replay("local", "llamacpp").is_ok());
+        assert!(validate_cross_replay("llamacpp", "local").is_ok());
+        // Hybrid → llamacpp only: the reverse direction is rejected.
+        assert!(validate_cross_replay("hybrid", "llamacpp").is_ok());
+        // Same-backend replays are always fine.
+        assert!(validate_cross_replay("local", "local").is_ok());
+        assert!(validate_cross_replay("hybrid", "hybrid").is_ok());
+        assert!(validate_cross_replay("llamacpp", "llamacpp").is_ok());
+    }
+
+    #[test]
+    fn cross_replay_rejects_unknown_effective() {
+        let err = validate_cross_replay("local", "openrouter").unwrap_err(); // not a backend label
+        assert!(format!("{}", err).contains("unknown backend"));
+    }
+
     #[test]
     fn bootstrap_system_message_includes_path_and_persona() {
         let out = build_bootstrap_system_message("you are helpful", "pics/IMG.jpg", None, None, "");
diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs
index 2e2da33..2a11e29 100644
--- a/src/ai/insight_generator.rs
+++ b/src/ai/insight_generator.rs
@@ -12,6 +12,7 @@ use std::sync::{Arc, Mutex};
 use crate::ai::apollo_client::{ApolloClient, ApolloPlace};
 use crate::ai::llm_client::LlmClient;
 use crate::ai::ollama::{ChatMessage, OllamaClient, Tool};
+use crate::ai::llamacpp::LlamaCppClient;
 use crate::ai::openrouter::OpenRouterClient;
 use crate::ai::sms_client::{SmsApiClient, SmsSearchHit, SmsSearchParams};
 use crate::ai::user_display_name;
@@ -68,6 +69,9 @@ pub struct InsightGenerator {
     /// Optional OpenRouter client, used when `backend=hybrid` is requested.
     /// `None` when `OPENROUTER_API_KEY` is not configured.
     openrouter: Option<Arc<OpenRouterClient>>,
+    /// Optional llama-swap client, used when `backend=llamacpp` is requested.
+    /// `None` when `LLAMA_SWAP_URL` is not configured.
+    llamacpp: Option<Arc<LlamaCppClient>>,
     sms_client: SmsApiClient,
     /// Optional integration with Apollo's user-defined Places. When the
     /// integration is disabled (`APOLLO_API_BASE_URL` unset), every
@@ -120,6 +124,7 @@ impl InsightGenerator {
     pub fn new(
         ollama: OllamaClient,
         openrouter: Option<Arc<OpenRouterClient>>,
+        llamacpp: Option<Arc<LlamaCppClient>>,
         sms_client: SmsApiClient,
         apollo_client: ApolloClient,
         insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
@@ -137,6 +142,7 @@ impl InsightGenerator {
         Self {
             ollama,
             openrouter,
+            llamacpp,
             sms_client,
             apollo_client,
             insight_dao,
@@ -3574,23 +3580,31 @@ Return ONLY the summary, nothing else."#,
             .map(|s| s.trim().to_lowercase())
             .filter(|s| !s.is_empty())
             .unwrap_or_else(|| "local".to_string());
-        if !matches!(backend_label.as_str(), "local" | "hybrid") {
+        if !matches!(backend_label.as_str(), "local" | "hybrid" | "llamacpp") {
             return Err(anyhow::anyhow!(
-                "unknown backend '{}'; expected 'local' or 'hybrid'",
+                "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
                 backend_label
             ));
         }
         span.set_attribute(KeyValue::new("backend", backend_label.clone()));
         let is_hybrid = backend_label == "hybrid";
+        let is_llamacpp = backend_label == "llamacpp";
+        // In hybrid + llamacpp modes the chat model never sees the image
+        // directly; we describe-then-inline locally before the agentic loop
+        // starts. Tracked as a single flag so vision/tool-gate logic doesn't
+        // have to branch twice.
+        let describes_then_inlines = is_hybrid || is_llamacpp;
 
         // 1b. Always build an Ollama client. In local mode it owns the chat
-        //     loop; in hybrid mode it still handles describe_image + any
-        //     tool-local calls (e.g. if a future tool needs embeddings).
-        //     Sampling overrides only apply in local mode — in hybrid the
-        //     user's params belong to the OpenRouter chat client.
-        let apply_sampling_to_ollama = !is_hybrid;
+        //     loop; in hybrid/llamacpp mode it still handles tool-local calls
+        //     (e.g. future embedding-backed tools). The chat backend is
+        //     selected separately below.
+        //     Sampling overrides only apply in local mode — in
+        //     hybrid/llamacpp the user's params belong to the alternate chat
+        //     client.
+        let apply_sampling_to_ollama = !describes_then_inlines;
         let mut ollama_client = if let Some(ref model) = custom_model
-            && !is_hybrid
+            && !describes_then_inlines
         {
             log::info!("Using custom model for agentic: {}", model);
             span.set_attribute(KeyValue::new("custom_model", model.clone()));
@@ -3601,7 +3615,7 @@ Return ONLY the summary, nothing else."#,
                 Some(model.clone()),
             )
         } else {
-            if !is_hybrid {
+            if !describes_then_inlines {
                 span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone()));
             }
             self.ollama.clone()
@@ -3674,6 +3688,44 @@ Return ONLY the summary, nothing else."#,
             None
         };
 
+        // 1d. In llamacpp mode, clone the configured LlamaCpp client and
+        //     apply per-request overrides (same shape as the openrouter
+        //     branch above). The clone drives only the chat loop; the vision
+        //     describe below goes through the shared `self.llamacpp` client.
+        let llamacpp_client: Option<LlamaCppClient> = if is_llamacpp {
+            let arc = self.llamacpp.as_ref().ok_or_else(|| {
+                anyhow::anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
+            })?;
+            let mut c: LlamaCppClient = (**arc).clone();
+            if let Some(ref m) = custom_model {
+                c.primary_model = m.clone();
+                span.set_attribute(KeyValue::new("custom_model", m.clone()));
+            }
+            span.set_attribute(KeyValue::new("llamacpp_model", c.primary_model.clone()));
+            if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() {
+                if let Some(t) = temperature {
+                    span.set_attribute(KeyValue::new("temperature", t as f64));
+                }
+                if let Some(p) = top_p {
+                    span.set_attribute(KeyValue::new("top_p", p as f64));
+                }
+                if let Some(k) = top_k {
+                    span.set_attribute(KeyValue::new("top_k", k as i64));
+                }
+                if let Some(m) = min_p {
+                    span.set_attribute(KeyValue::new("min_p", m as f64));
+                }
+                c.set_sampling_params(temperature, top_p, top_k, min_p);
+            }
+            if let Some(ctx) = num_ctx {
+                span.set_attribute(KeyValue::new("num_ctx", ctx as i64));
+                c.set_num_ctx(Some(ctx));
+            }
+            Some(c)
+        } else {
+            None
+        };
+
         let insight_cx = current_cx.with_span(span);
 
         // 2. Verify chat model supports tool calling.
@@ -3681,10 +3733,11 @@ Return ONLY the summary, nothing else."#,
         //    - hybrid: trust the operator's curated allowlist
         //      (OPENROUTER_ALLOWED_MODELS) — no live precheck. A bad model id
         //      surfaces as a chat-call error on the next step.
-        let has_vision = if is_hybrid {
-            // In hybrid mode the chat model never sees images directly — we
-            // describe-then-inject, so `has_vision` drives only whether we
-            // bother loading the image to describe it, which we always do.
+        let has_vision = if describes_then_inlines {
+            // In hybrid + llamacpp modes the chat model never sees images
+            // directly — we describe-then-inject, so `has_vision` drives only
+            // whether we bother loading the image to describe it, which we
+            // always do.
             true
         } else {
             if let Some(ref model_name) = custom_model {
@@ -3864,24 +3917,61 @@ Return ONLY the summary, nothing else."#,
             None
         };
 
-        let hybrid_visual_description: Option<String> = if is_hybrid {
+        // describe-then-inline path. In hybrid mode the vision backend
+        // defaults to Ollama but can be flipped to llamacpp via
+        // `HYBRID_VISION_BACKEND=llamacpp` (so chat goes to OpenRouter while
+        // vision/audio routes through llama-swap). In llamacpp mode we always
+        // use the llamacpp client's configured vision slot.
+        let inlined_visual_description: Option<String> = if describes_then_inlines {
             match image_base64.as_deref() {
-                Some(b64) => match self.ollama.describe_image(b64).await {
-                    Ok(desc) => {
-                        log::info!(
-                            "Hybrid: local vision describe succeeded ({} chars)",
-                            desc.len()
-                        );
-                        Some(desc)
+                Some(b64) => {
+                    let use_llamacpp_vision = if is_llamacpp {
+                        true
+                    } else {
+                        // is_hybrid branch — consult env switch
+                        matches!(
+                            std::env::var("HYBRID_VISION_BACKEND")
+                                .ok()
+                                .as_deref()
+                                .map(|s| s.trim().to_lowercase())
+                                .as_deref(),
+                            Some("llamacpp")
+                        )
+                    };
+
+                    let described = if use_llamacpp_vision {
+                        match self.llamacpp.as_ref() {
+                            Some(c) => c.describe_image(b64).await,
+                            None => {
+                                log::warn!(
+                                    "describe-then-inline: requested llamacpp vision but LLAMA_SWAP_URL is unset, falling back to Ollama"
+                                );
+                                self.ollama.describe_image(b64).await
+                            }
+                        }
+                    } else {
+                        self.ollama.describe_image(b64).await
+                    };
+
+                    match described {
+                        Ok(desc) => {
+                            log::info!(
+                                "{}: vision describe succeeded ({} chars)",
+                                backend_label,
+                                desc.len()
+                            );
+                            Some(desc)
+                        }
+                        Err(e) => {
+                            log::warn!(
+                                "{}: vision describe failed, continuing without: {}",
+                                backend_label,
+                                e
+                            );
+                            None
+                        }
                     }
-                    Err(e) => {
-                        log::warn!(
-                            "Hybrid: local vision describe failed, continuing without: {}",
-                            e
-                        );
-                        None
-                    }
-                },
+                }
                 None => None,
             }
         } else {
@@ -3934,7 +4024,7 @@ Return ONLY the summary, nothing else."#,
             .map(|c| format!("Contact/Person: {}", c))
             .unwrap_or_else(|| "Contact/Person: unknown".to_string());
 
-        let visual_block = hybrid_visual_description
+        let visual_block = inlined_visual_description
             .as_deref()
             .map(|d| format!("Visual description (from local vision model):\n{}\n\n", d))
             .unwrap_or_default();
@@ -3954,25 +4044,28 @@ Return ONLY the summary, nothing else."#,
         );
 
         // 10. Define tools. Gate flags computed from current data presence;
-        //     hybrid mode omits describe_photo since the chat model receives
-        //     the visual description inline (so we pass `false` for has_vision
-        //     in hybrid mode regardless of the model's actual capability).
-        let gate_opts = self.current_gate_opts(has_vision && !is_hybrid);
+        //     describe-then-inline modes (hybrid, llamacpp) omit describe_photo
+        //     since the chat model receives the visual description inline (so
+        //     we pass `false` for has_vision in those modes regardless of the
+        //     model's actual capability).
+        let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines);
         let tools = Self::build_tool_definitions(gate_opts);
 
-        // 11. Build initial messages. In hybrid mode images are never
-        //     attached to the wire message — the description is part of
-        //     `user_content`.
+        // 11. Build initial messages. In describe-then-inline modes images
+        //     are never attached to the wire message — the description is part
+        //     of `user_content`.
         let system_msg = ChatMessage::system(system_content);
         let mut user_msg = ChatMessage::user(user_content);
-        if !is_hybrid && let Some(ref img) = image_base64 {
+        if !describes_then_inlines && let Some(ref img) = image_base64 {
             user_msg.images = Some(vec![img.clone()]);
         }
 
         let mut messages = vec![system_msg, user_msg];
 
         // 12. Agentic loop — dispatch through the selected backend.
-        let chat_backend: &dyn LlmClient = if let Some(ref or_c) = openrouter_client {
+        let chat_backend: &dyn LlmClient = if let Some(ref lc_c) = llamacpp_client {
+            lc_c
+        } else if let Some(ref or_c) = openrouter_client {
             or_c
         } else {
             &ollama_client
diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs
new file mode 100644
index 0000000..100020c
--- /dev/null
+++ b/src/ai/llamacpp.rs
@@ -0,0 +1,978 @@
+//! LlamaCppClient — talks to a llama-swap proxy that fronts one or more
+//! llama-server processes. llama-swap exposes an OpenAI-compatible HTTP
+//! surface (`/v1/chat/completions`, `/v1/embeddings`, `/v1/models`), so the
+//! wire translation mirrors `OpenRouterClient` almost exactly.
+//!
+//! Differences from OpenRouter:
+//! - No bearer auth or attribution headers; llama-swap is LAN-only.
+//! - Three model slots (`primary_model` = chat, `vision_model`, `embedding_model`)
+//!   each map to a model id in the llama-swap config. `describe_image` and
+//!   `generate_embeddings` issue requests with the appropriate slot id in the
+//!   `model` field, which is how llama-swap selects which backend process to
+//!   run.
+//! - `/v1/models` returns only the configured slot ids — capabilities aren't
+//!   reported by the API, so `vision_models` is a config-time allowlist (env
+//!   `LLAMA_SWAP_VISION_MODELS`) used to set `has_vision` on responses.
+//!   `has_tool_calling` is assumed true for every slot, since llama-swap entries
+//!   default to launching llama-server with `--jinja`.
+
+// First consumer lands alongside the three-way backend dispatch in
+// insight_generator / insight_chat.
+#![allow(dead_code)]
+
+use anyhow::{Context, Result, anyhow, bail};
+use async_trait::async_trait;
+use reqwest::Client;
+use serde::Deserialize;
+use serde_json::{Value, json};
+use std::time::Duration;
+
+use crate::ai::llm_client::{
+    ChatMessage, LlmClient, LlmStreamEvent, ModelCapabilities, Tool, ToolCall, ToolCallFunction,
+};
+use futures::stream::{BoxStream, StreamExt};
+
+const DEFAULT_BASE_URL: &str = "http://localhost:9292/v1"; // used when `new` gets no base_url
+const DEFAULT_PRIMARY_MODEL: &str = "chat"; // chat slot id when `new` gets no primary_model
+const DEFAULT_VISION_MODEL: &str = "vision"; // initial vision slot; see `set_vision_model`
+const DEFAULT_EMBEDDING_MODEL: &str = "embed"; // initial embed slot; see `set_embedding_model`
+const DEFAULT_REQUEST_TIMEOUT_SECS: u64 = 180; // env: LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS
+
+/// OpenAI-compatible client targeting a llama-swap proxy in front of one or
+/// more llama-server processes. See the module header comment for the slot model.
+#[derive(Clone)]
+pub struct LlamaCppClient {
+    client: Client, // HTTP client built once in `new`
+    pub base_url: String, // prefix that `/chat/completions` is appended to
+    /// Chat model slot id (e.g. `"chat"`). Used for `generate` /
+    /// `chat_with_tools` / `chat_with_tools_stream`.
+    pub primary_model: String,
+    /// Embedding model slot id (e.g. `"embed"`). Used for
+    /// `generate_embeddings`.
+    pub embedding_model: String,
+    /// Vision model slot id (e.g. `"vision"`). Used for `describe_image`.
+    /// Meant to report `has_vision = true` even when the env allowlist is
+    /// empty. NOTE(review): `new` and the setters never add it to
+    /// `vision_models`, so the capability lookup must special-case it — confirm.
+    pub vision_model: String,
+    /// Operator-curated set of slot ids known to be multimodal. Drives the
+    /// `has_vision` field in `list_models` / `model_capabilities`, since
+    /// llama-swap's `/v1/models` doesn't report modality. Empty allowlist
+    /// still marks `vision_model` as vision-capable.
+    pub vision_models: Vec<String>,
+    num_ctx: Option<i32>, // stored only; `build_options` never sends it
+    temperature: Option<f32>, // sampling overrides: `None` leaves the key out of the request
+    top_p: Option<f32>,
+    top_k: Option<i32>,
+    min_p: Option<f32>,
+}
+
+impl LlamaCppClient {
+    pub fn new(base_url: Option<String>, primary_model: Option<String>) -> Self {
+        let timeout_secs = std::env::var("LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS")
+            .ok()
+            .and_then(|v| v.parse::<u64>().ok()) // values that don't parse as u64 are ignored
+            .unwrap_or(DEFAULT_REQUEST_TIMEOUT_SECS);
+        Self {
+            client: Client::builder()
+                .connect_timeout(Duration::from_secs(10))
+                .timeout(Duration::from_secs(timeout_secs))
+                .build()
+                .unwrap_or_else(|_| Client::new()), // build error: fall back to a default client
+            base_url: base_url.unwrap_or_else(|| DEFAULT_BASE_URL.to_string()),
+            primary_model: primary_model.unwrap_or_else(|| DEFAULT_PRIMARY_MODEL.to_string()),
+            embedding_model: DEFAULT_EMBEDDING_MODEL.to_string(),
+            vision_model: DEFAULT_VISION_MODEL.to_string(),
+            vision_models: Vec::new(), // allowlist starts empty; fill via `set_vision_models`
+            num_ctx: None, // every per-request override starts unset
+            temperature: None,
+            top_p: None,
+            top_k: None,
+            min_p: None,
+        }
+    }
+    /// Replace the embedding slot id (initially `DEFAULT_EMBEDDING_MODEL`).
+    pub fn set_embedding_model(&mut self, model: String) {
+        self.embedding_model = model;
+    }
+    /// Replace the vision slot id; the `vision_models` allowlist is left untouched.
+    pub fn set_vision_model(&mut self, model: String) {
+        self.vision_model = model;
+    }
+    /// Replace the whole vision allowlist with `models` (nothing is merged or appended).
+    pub fn set_vision_models(&mut self, models: Vec<String>) {
+        self.vision_models = models;
+    }
+    /// Store a context-size override. It is not forwarded: `build_options` drops it.
+    pub fn set_num_ctx(&mut self, num_ctx: Option<i32>) {
+        self.num_ctx = num_ctx;
+    }
+
+    /// Replace all four sampling overrides at once; `None` clears that override.
+    pub fn set_sampling_params(
+        &mut self,
+        temperature: Option<f32>,
+        top_p: Option<f32>,
+        top_k: Option<i32>,
+        min_p: Option<f32>,
+    ) {
+        // Plain overwrite: whatever was stored before is discarded.
+        (self.temperature, self.top_p) = (temperature, top_p);
+        (self.top_k, self.min_p) = (top_k, min_p);
+    }
+
+    /// Translate canonical messages to the OpenAI-compatible wire shape.
+    /// Behaviorally identical to `OpenRouterClient::messages_to_openai` —
+    /// stringify tool-call arguments, rewrite images into content-parts, attach
+    /// `tool_call_id` to `role=tool` messages based on the preceding assistant
+    /// turn's tool calls.
+    fn messages_to_openai(messages: &[ChatMessage]) -> Vec<Value> {
+        let mut out = Vec::with_capacity(messages.len());
+        // Ids of the latest assistant tool calls, and how many results have consumed one.
+        let mut last_tool_call_ids: Vec<String> = Vec::new();
+        let mut next_tool_result_idx: usize = 0;
+        for msg in messages {
+            let mut obj = serde_json::Map::new();
+            obj.insert("role".into(), Value::String(msg.role.clone()));
+            // With images, content is a parts array (text first, if any); else a plain string.
+            match &msg.images {
+                Some(images) if !images.is_empty() => {
+                    let mut parts: Vec<Value> = Vec::new();
+                    if !msg.content.is_empty() {
+                        parts.push(json!({"type": "text", "text": msg.content}));
+                    }
+                    for img in images {
+                        let url = image_to_data_url(img);
+                        parts.push(json!({
+                            "type": "image_url",
+                            "image_url": { "url": url }
+                        }));
+                    }
+                    obj.insert("content".into(), Value::Array(parts));
+                }
+                _ => {
+                    obj.insert("content".into(), Value::String(msg.content.clone()));
+                }
+            }
+            // Only assistant turns emit tool_calls; a missing id becomes "call_<position>".
+            if let Some(tcs) = &msg.tool_calls
+                && msg.role == "assistant"
+            {
+                let converted: Vec<Value> = tcs
+                    .iter()
+                    .enumerate()
+                    .map(|(i, call)| {
+                        let id = call.id.clone().unwrap_or_else(|| format!("call_{}", i));
+                        let args_str = serde_json::to_string(&call.function.arguments)
+                            .unwrap_or_else(|_| "{}".to_string());
+                        json!({
+                            "id": id,
+                            "type": "function",
+                            "function": {
+                                "name": call.function.name,
+                                "arguments": args_str,
+                            }
+                        })
+                    })
+                    .collect();
+                last_tool_call_ids = converted
+                    .iter()
+                    .filter_map(|v| v.get("id").and_then(|x| x.as_str()).map(String::from))
+                    .collect();
+                next_tool_result_idx = 0;
+                obj.insert("tool_calls".into(), Value::Array(converted));
+            }
+            // Tool results take ids in order; "call_0" is used when none exists at that index.
+            if msg.role == "tool" {
+                let id = last_tool_call_ids
+                    .get(next_tool_result_idx)
+                    .cloned()
+                    .unwrap_or_else(|| "call_0".to_string());
+                obj.insert("tool_call_id".into(), Value::String(id));
+                next_tool_result_idx += 1;
+            }
+
+            out.push(Value::Object(obj));
+        }
+
+        out
+    }
+
+    /// Parse an OpenAI-compatible assistant message back into canonical shape.
+    /// llama.cpp emits `reasoning_content` on thinking models; we drop it for
+    /// parity with OpenRouter (which also strips upstream reasoning fields).
+    fn openai_message_to_chat(msg: &Value) -> Result<ChatMessage> {
+        let obj = msg
+            .as_object()
+            .ok_or_else(|| anyhow!("response message is not an object"))?;
+        let role = obj
+            .get("role")
+            .and_then(|v| v.as_str())
+            .unwrap_or("assistant") // absent or non-string role
+            .to_string();
+        let content = obj
+            .get("content")
+            .and_then(|v| v.as_str())
+            .unwrap_or("") // absent or non-string content
+            .to_string();
+        // Arguments may be a JSON-encoded string or an object; anything else becomes `{}`.
+        let tool_calls = if let Some(tcs) = obj.get("tool_calls").and_then(|v| v.as_array()) {
+            let mut parsed = Vec::with_capacity(tcs.len());
+            for tc in tcs {
+                let id = tc.get("id").and_then(|v| v.as_str()).map(String::from);
+                let function = tc
+                    .get("function")
+                    .ok_or_else(|| anyhow!("tool_call missing function field"))?;
+                let name = function
+                    .get("name")
+                    .and_then(|v| v.as_str())
+                    .unwrap_or_default()
+                    .to_string();
+                let args_value = match function.get("arguments") {
+                    Some(Value::String(s)) => {
+                        serde_json::from_str::<Value>(s).unwrap_or_else(|_| json!({}))
+                    }
+                    Some(v @ Value::Object(_)) => v.clone(),
+                    _ => json!({}),
+                };
+                parsed.push(ToolCall {
+                    id,
+                    function: ToolCallFunction {
+                        name,
+                        arguments: args_value,
+                    },
+                });
+            }
+            Some(parsed)
+        } else {
+            None
+        };
+
+        Ok(ChatMessage {
+            role,
+            content,
+            tool_calls,
+            images: None, // images are never read back from a response
+        })
+    }
+
+    /// Gather the sampling overrides that are currently set as
+    /// `(request-body key, JSON value)` pairs, always in the order
+    /// temperature, top_p, top_k, min_p. Unset parameters are left out of
+    /// the returned list entirely.
+    fn build_options(&self) -> Vec<(&'static str, Value)> {
+        // num_ctx isn't an OpenAI param; llama-server bakes ctx in at launch
+        // via -c, so we silently drop the override here. The config.yaml
+        // entry is the source of truth for context size.
+        let _ = self.num_ctx;
+
+        [
+            ("temperature", self.temperature.map(|t| json!(t))),
+            ("top_p", self.top_p.map(|p| json!(p))),
+            ("top_k", self.top_k.map(|k| json!(k))),
+            ("min_p", self.min_p.map(|m| json!(m))),
+        ]
+        .into_iter()
+        .filter_map(|(key, value)| value.map(|json_value| (key, json_value)))
+        .collect()
+    }
+
+    /// Issue a chat request with an explicit model id override. Used by
+    /// `describe_image` to route through the vision slot without mutating
+    /// `self.primary_model`.
+    async fn chat_completion_with_model(
+        &self,
+        model: &str,
+        messages: Vec<ChatMessage>,
+        tools: Vec<Tool>,
+    ) -> Result<(ChatMessage, Option<i32>, Option<i32>)> {
+        let url = format!("{}/chat/completions", self.base_url);
+        let mut body = serde_json::Map::new();
+        body.insert("model".into(), Value::String(model.to_string()));
+        body.insert(
+            "messages".into(),
+            Value::Array(Self::messages_to_openai(&messages)),
+        );
+        body.insert("stream".into(), Value::Bool(false));
+        if !tools.is_empty() {
+            body.insert(
+                "tools".into(),
+                serde_json::to_value(&tools).context("serializing tools")?,
+            );
+        }
+        for (k, v) in self.build_options() {
+            body.insert(k.into(), v);
+        }
+        // POST the JSON body; a transport failure is reported together with the target URL.
+        let resp = self
+            .client
+            .post(&url)
+            .json(&Value::Object(body))
+            .send()
+            .await
+            .with_context(|| format!("POST {} failed", url))?;
+        // Non-success status: the error carries the status plus the raw response body.
+        if !resp.status().is_success() {
+            let status = resp.status();
+            let body = resp.text().await.unwrap_or_default();
+            bail!("llama-swap chat request failed: {} — {}", status, body);
+        }
+        // Only choices[0] is read; `extract_error_detail` supplies text for the error message.
+        let parsed: Value = resp.json().await.context("parsing chat response")?;
+        let choice = parsed
+            .get("choices")
+            .and_then(|v| v.as_array())
+            .and_then(|a| a.first())
+            .ok_or_else(|| {
+                anyhow!(
+                    "response missing choices[0]: {}",
+                    extract_error_detail(&parsed)
+                )
+            })?;
+        let msg = choice.get("message").ok_or_else(|| {
+            anyhow!(
+                "choices[0] missing message: {}",
+                extract_error_detail(&parsed)
+            )
+        })?;
+        let chat_msg = Self::openai_message_to_chat(msg)?;
+        // Token usage is optional: a missing `usage` object or field yields `None`.
+        let usage = parsed.get("usage");
+        let prompt_tokens = usage
+            .and_then(|u| u.get("prompt_tokens"))
+            .and_then(|v| v.as_i64())
+            .map(|n| n as i32);
+        let completion_tokens = usage
+            .and_then(|u| u.get("completion_tokens"))
+            .and_then(|v| v.as_i64())
+            .map(|n| n as i32);
+
+        Ok((chat_msg, prompt_tokens, completion_tokens))
+    }
+}
+
+#[async_trait]
+impl LlmClient for LlamaCppClient {
+    /// One-shot completion: optional system turn plus a user prompt (with images), no tools.
+    async fn generate(
+        &self,
+        prompt: &str,
+        system: Option<&str>,
+        images: Option<Vec<String>>,
+    ) -> Result<String> {
+        let mut user_turn = ChatMessage::user(prompt);
+        user_turn.images = images;
+        // The system prompt, when supplied, precedes the user turn.
+        let conversation = match system {
+            Some(sys) => vec![ChatMessage::system(sys), user_turn],
+            None => vec![user_turn],
+        };
+        let (answer, _, _) = self.chat_with_tools(conversation, Vec::new()).await?;
+        Ok(answer.content)
+    }
+
+    /// Non-streaming chat against the `primary_model` slot; returns reply plus token counts.
+    async fn chat_with_tools(
+        &self,
+        messages: Vec<ChatMessage>,
+        tools: Vec<Tool>,
+    ) -> Result<(ChatMessage, Option<i32>, Option<i32>)> {
+        log::info!(
+            "llama-swap chat_with_tools: model={} messages={} tools={}",
+            self.primary_model,
+            messages.len(),
+            tools.len()
+        );
+        self.chat_completion_with_model(&self.primary_model, messages, tools).await
+    }
+
+    async fn chat_with_tools_stream(
+        &self,
+        messages: Vec<ChatMessage>,
+        tools: Vec<Tool>,
+    ) -> Result<BoxStream<'static, Result<LlmStreamEvent>>> {
+        let url = format!("{}/chat/completions", self.base_url);
+        let mut body = serde_json::Map::new();
+        body.insert(
+            "model".into(),
+            Value::String(self.primary_model.clone()),
+        );
+        body.insert(
+            "messages".into(),
+            Value::Array(Self::messages_to_openai(&messages)),
+        );
+        body.insert("stream".into(), Value::Bool(true));
+        body.insert(
+            "stream_options".into(),
+            serde_json::json!({ "include_usage": true }),
+        );
+        if !tools.is_empty() {
+            body.insert(
+                "tools".into(),
+                serde_json::to_value(&tools).context("serializing tools")?,
+            );
+        }
+        for (k, v) in self.build_options() {
+            body.insert(k.into(), v);
+        }
+
+        let resp = self
+            .client
+            .post(&url)
+            .json(&Value::Object(body))
+            .send()
+            .await
+            .with_context(|| format!("POST {} failed", url))?;
+
+        if !resp.status().is_success() {
+            let status = resp.status();
+            let body = resp.text().await.unwrap_or_default();
+            bail!("llama-swap stream request failed: {} — {}", status, body);
+        }
+
+        let byte_stream = resp.bytes_stream();
+        let stream = async_stream::stream! {
+            let mut byte_stream = byte_stream;
+            let mut buf: Vec<u8> = Vec::new();
+            let mut accumulated_content = String::new();
+            let mut tool_state: std::collections::BTreeMap<
+                usize,
+                (Option<String>, Option<String>, String),
+            > = std::collections::BTreeMap::new();
+            let mut role = "assistant".to_string();
+            let mut prompt_tokens: Option<i32> = None;
+            let mut completion_tokens: Option<i32> = None;
+            let mut done_seen = false;
+
+            while let Some(chunk) = byte_stream.next().await {
+                let chunk = match chunk {
+                    Ok(b) => b,
+                    Err(e) => {
+                        yield Err(anyhow!("stream read failed: {}", e));
+                        return;
+                    }
+                };
+                buf.extend_from_slice(&chunk);
+
+                while let Some(sep) = find_double_newline(&buf) {
+                    let frame = buf.drain(..sep + 2).collect::<Vec<_>>();
+                    let frame_str = match std::str::from_utf8(&frame) {
+                        Ok(s) => s,
+                        Err(_) => continue,
+                    };
+                    for line in frame_str.lines() {
+                        let line = line.trim_end_matches('\r');
+                        let payload = match line.strip_prefix("data: ") {
+                            Some(p) => p,
+                            None => continue,
+                        };
+                        if payload == "[DONE]" {
+                            done_seen = true;
+                            break;
+                        }
+                        let v: Value = match serde_json::from_str(payload) {
+                            Ok(v) => v,
+                            Err(e) => {
+                                log::warn!(
+                                    "malformed llama-swap SSE frame: {} ({})",
+                                    payload,
+                                    e
+                                );
+                                continue;
+                            }
+                        };
+
+                        if let Some(usage) = v.get("usage") {
+                            prompt_tokens = usage
+                                .get("prompt_tokens")
+                                .and_then(|n| n.as_i64())
+                                .map(|n| n as i32);
+                            completion_tokens = usage
+                                .get("completion_tokens")
+                                .and_then(|n| n.as_i64())
+                                .map(|n| n as i32);
+                        }
+
+                        let Some(choices) = v.get("choices").and_then(|c| c.as_array())
+                        else {
+                            continue;
+                        };
+                        let Some(choice) = choices.first() else { continue };
+                        let delta = match choice.get("delta") {
+                            Some(d) => d,
+                            None => continue,
+                        };
+                        if let Some(r) = delta.get("role").and_then(|v| v.as_str()) {
+                            role = r.to_string();
+                        }
+                        if let Some(content) =
+                            delta.get("content").and_then(|v| v.as_str())
+                            && !content.is_empty()
+                        {
+                            accumulated_content.push_str(content);
+                            yield Ok(LlmStreamEvent::TextDelta(content.to_string()));
+                        }
+                        if let Some(tcs) = delta.get("tool_calls").and_then(|v| v.as_array()) {
+                            for tc_delta in tcs {
+                                let idx = tc_delta
+                                    .get("index")
+                                    .and_then(|n| n.as_u64())
+                                    .unwrap_or(0) as usize;
+                                let entry = tool_state
+                                    .entry(idx)
+                                    .or_insert((None, None, String::new()));
+                                if let Some(id) =
+                                    tc_delta.get("id").and_then(|v| v.as_str())
+                                {
+                                    entry.0 = Some(id.to_string());
+                                }
+                                if let Some(func) = tc_delta.get("function") {
+                                    if let Some(name) =
+                                        func.get("name").and_then(|v| v.as_str())
+                                    {
+                                        entry.1 = Some(name.to_string());
+                                    }
+                                    if let Some(args) =
+                                        func.get("arguments").and_then(|v| v.as_str())
+                                    {
+                                        entry.2.push_str(args);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                    if done_seen {
+                        break;
+                    }
+                }
+                if done_seen {
+                    break;
+                }
+            }
+
+            let tool_calls: Option<Vec<ToolCall>> = if tool_state.is_empty() {
+                None
+            } else {
+                let mut v = Vec::with_capacity(tool_state.len());
+                for (_idx, (id, name, args)) in tool_state {
+                    let arguments: Value = if args.trim().is_empty() {
+                        Value::Object(Default::default())
+                    } else {
+                        serde_json::from_str(&args).unwrap_or_else(|_| {
+                            Value::Object(Default::default())
+                        })
+                    };
+                    v.push(ToolCall {
+                        id,
+                        function: ToolCallFunction {
+                            name: name.unwrap_or_default(),
+                            arguments,
+                        },
+                    });
+                }
+                Some(v)
+            };
+
+            let message = ChatMessage {
+                role,
+                content: accumulated_content,
+                tool_calls,
+                images: None,
+            };
+            yield Ok(LlmStreamEvent::Done {
+                message,
+                prompt_eval_count: prompt_tokens,
+                eval_count: completion_tokens,
+            });
+        };
+
+        Ok(Box::pin(stream))
+    }
+
+    /// Embeds `texts` through the OpenAI-compatible `/embeddings` endpoint.
+    ///
+    /// Vectors come back in input order: entries are re-sorted by the `index`
+    /// the server reports. An empty `texts` slice returns without any request.
+    async fn generate_embeddings(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>> {
+        if texts.is_empty() {
+            return Ok(Vec::new());
+        }
+        let url = format!("{}/embeddings", self.base_url);
+        let body = json!({ "model": self.embedding_model, "input": texts });
+        let sent = self.client.post(&url).json(&body).send().await;
+        let resp = sent.with_context(|| format!("POST {} failed", url))?;
+        if !resp.status().is_success() {
+            let status = resp.status();
+            let body = resp.text().await.unwrap_or_default();
+            bail!("llama-swap embedding request failed: {} — {}", status, body);
+        }
+
+        #[derive(Deserialize)]
+        struct EmbedResponse {
+            data: Vec<EmbedItem>,
+        }
+        #[derive(Deserialize)]
+        struct EmbedItem {
+            embedding: Vec<f32>,
+            index: Option<usize>,
+        }
+
+        let mut parsed: EmbedResponse = resp.json().await.context("parsing embed response")?;
+        // Stable sort: when `index` is absent everywhere, wire order is kept.
+        parsed.data.sort_by_key(|item| item.index);
+        Ok(parsed.data.into_iter().map(|item| item.embedding).collect())
+    }
+
+    /// Sends one base64-encoded image to `self.vision_model` with a fixed
+    /// "describe briefly" prompt and returns the reply's text content.
+    async fn describe_image(&self, image_base64: &str) -> Result<String> {
+        let system = "You are a scene description assistant. Be concise and factual.";
+        let prompt = "Briefly describe what you see in this image in 1-2 sentences. \
+                      Focus on the people, location, and activity.";
+
+        let user_turn = ChatMessage {
+            role: "user".to_string(),
+            content: prompt.to_string(),
+            tool_calls: None,
+            images: Some(vec![image_base64.to_string()]),
+        };
+        let messages = vec![ChatMessage::system(system), user_turn];
+
+        let (reply, _, _) = self
+            .chat_completion_with_model(&self.vision_model, messages, Vec::new())
+            .await?;
+        Ok(reply.content)
+    }
+
+    /// Fetches `{base_url}/models` and maps each entry of the response's `data`
+    /// array through `parse_model_capabilities`.
+    ///
+    /// Errors on a failed request, a non-success status, a body that does not
+    /// parse as JSON, or a body without a `data` array.
+    async fn list_models(&self) -> Result<Vec<ModelCapabilities>> {
+        let url = format!("{}/models", self.base_url);
+        let sent = self.client.get(&url).send().await;
+        let resp = sent.with_context(|| format!("GET {} failed", url))?;
+
+        let status = resp.status();
+        if !status.is_success() {
+            let body = resp.text().await.unwrap_or_default();
+            bail!("llama-swap list_models failed: {} — {}", status, body);
+        }
+
+        let parsed: Value = resp.json().await.context("parsing models response")?;
+        let Some(entries) = parsed.get("data").and_then(Value::as_array) else {
+            bail!("models response missing data[]");
+        };
+
+        // Capabilities are inferred per entry; nothing is filtered out here.
+        let mut caps = Vec::with_capacity(entries.len());
+        for entry in entries {
+            caps.push(self.parse_model_capabilities(entry));
+        }
+        Ok(caps)
+    }
+
+    /// Looks up `model` by exact name in a fresh `list_models` result.
+    /// Errors if the listing fails or no entry has that name.
+    async fn model_capabilities(&self, model: &str) -> Result<ModelCapabilities> {
+        let hit = self.list_models().await?.into_iter().find(|caps| caps.name == model);
+        hit.ok_or_else(|| anyhow!("model '{}' not found on llama-swap", model))
+    }
+
+    fn primary_model(&self) -> &str {
+        &self.primary_model // borrow of the stored `primary_model` field; no copy made
+    }
+}
+
+impl LlamaCppClient {
+    /// Derives capabilities for one `/models` entry from its `id` field.
+    ///
+    /// Vision is reported when the id equals `vision_model` or appears in
+    /// `vision_models`; an entry without a string `id` gets an empty name.
+    fn parse_model_capabilities(&self, m: &Value) -> ModelCapabilities {
+        let id = m.get("id").and_then(Value::as_str).unwrap_or_default();
+        let allowlisted = self.vision_models.iter().any(|slot| slot == id);
+        // Tool calling is the default for llama-swap entries we configure
+        // (--jinja flag); no negative-list mechanism yet, so report true.
+        ModelCapabilities {
+            has_vision: id == self.vision_model || allowlisted,
+            name: id.to_string(),
+            has_tool_calling: true,
+        }
+    }
+}
+
+/// Builds a short diagnostic string for a llama-swap / llama-server response
+/// that lacks the expected `{choices: [...]}` shape. llama-server reports
+/// errors as `{"error": {"message": "...", "code": N, "type": "..."}}`, while
+/// llama-swap itself sometimes wraps subprocess failures in a flat
+/// `{"error": "..."}`. Either form is surfaced with its text capped at 240
+/// chars; anything else falls back to the first 300 chars of the raw JSON.
+fn extract_error_detail(parsed: &Value) -> String {
+    fn clip(text: &str, limit: usize) -> String {
+        text.chars().take(limit).collect()
+    }
+
+    match parsed.get("error") {
+        // llama-server style: structured error object.
+        Some(Value::Object(fields)) => {
+            let message = fields
+                .get("message")
+                .and_then(Value::as_str)
+                .unwrap_or("(no message)");
+            // String codes print bare; any other JSON value uses its JSON text.
+            let code = match fields.get("code") {
+                Some(Value::String(text)) => text.clone(),
+                Some(other) => other.to_string(),
+                None => "?".to_string(),
+            };
+            format!("error code={} message=\"{}\"", code, clip(message, 240))
+        }
+        // llama-swap style: flat error string.
+        Some(Value::String(text)) => {
+            format!("error=\"{}\"", clip(text, 240))
+        }
+        // Missing or differently-typed `error`: truncated raw-JSON view.
+        _ => clip(&parsed.to_string(), 300),
+    }
+}
+
+/// Finds the first blank-line separator in `buf`.
+///
+/// An `\n\n` pair yields the index of its first `\n`; a `\r\n\r\n` run yields
+/// the index of its first `\n` (one past the leading `\r`). `None` otherwise.
+fn find_double_newline(buf: &[u8]) -> Option<usize> {
+    (0..buf.len()).find_map(|start| {
+        let rest = &buf[start..];
+        if rest.starts_with(b"\n\n") {
+            Some(start)
+        } else if rest.starts_with(b"\r\n\r\n") {
+            Some(start + 1)
+        } else {
+            None
+        }
+    })
+}
+
+/// Wraps raw base64 in a JPEG data URL; `data:` URLs pass through unchanged.
+fn image_to_data_url(img: &str) -> String {
+    match img.strip_prefix("data:") {
+        Some(_) => img.to_owned(),
+        None => format!("data:image/jpeg;base64,{}", img),
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn tool_call_arguments_stringified_on_send() {
+        let msg = ChatMessage {
+            role: "assistant".into(),
+            content: String::new(),
+            tool_calls: Some(vec![ToolCall {
+                id: Some("call_abc".into()),
+                function: ToolCallFunction {
+                    name: "search_sms".into(),
+                    arguments: json!({"query": "hello", "limit": 5}),
+                },
+            }]),
+            images: None,
+        };
+        // On the wire, `function.arguments` must be a JSON-encoded string.
+        let wire = LlamaCppClient::messages_to_openai(&[msg]);
+        let tcs = wire[0]
+            .get("tool_calls")
+            .and_then(|v| v.as_array())
+            .expect("tool_calls present");
+        let args = tcs[0]
+            .get("function")
+            .and_then(|f| f.get("arguments"))
+            .and_then(|a| a.as_str())
+            .expect("arguments stringified");
+        let parsed: Value = serde_json::from_str(args).unwrap();
+        assert_eq!(parsed["query"], "hello");
+        assert_eq!(parsed["limit"], 5);
+    }
+
+    #[test]
+    fn tool_call_arguments_parsed_on_receive() {
+        let response_msg = json!({
+            "role": "assistant",
+            "content": "",
+            "tool_calls": [{
+                "id": "call_xyz",
+                "type": "function",
+                "function": {
+                    "name": "get_weather",
+                    "arguments": "{\"city\":\"Boston\",\"units\":\"celsius\"}"
+                }
+            }]
+        });
+        let parsed = LlamaCppClient::openai_message_to_chat(&response_msg).unwrap();
+        let tcs = parsed.tool_calls.unwrap();
+        assert_eq!(tcs.len(), 1);
+        assert_eq!(tcs[0].function.name, "get_weather");
+        assert_eq!(tcs[0].function.arguments["city"], "Boston");
+        assert_eq!(tcs[0].function.arguments["units"], "celsius");
+        assert_eq!(tcs[0].id.as_deref(), Some("call_xyz"));
+    }
+
+    #[test]
+    fn tool_call_arguments_accept_native_json_on_receive() {
+        // Some llama.cpp builds emit arguments as a JSON object directly when
+        // jinja's tool-output strict-string rule isn't applied — accept both.
+        let response_msg = json!({
+            "role": "assistant",
+            "content": "",
+            "tool_calls": [{
+                "id": "call_1",
+                "type": "function",
+                "function": {
+                    "name": "foo",
+                    "arguments": {"nested": {"k": 1}}
+                }
+            }]
+        });
+        let parsed = LlamaCppClient::openai_message_to_chat(&response_msg).unwrap();
+        let tc = &parsed.tool_calls.unwrap()[0];
+        assert_eq!(tc.function.arguments["nested"]["k"], 1);
+    }
+
+    #[test]
+    fn images_become_content_parts() {
+        let mut msg = ChatMessage::user("What is in this photo?");
+        msg.images = Some(vec!["BASE64DATA".into()]);
+        // Expect a two-part content array: the text first, then the image URL.
+        let wire = LlamaCppClient::messages_to_openai(&[msg]);
+        let content = wire[0].get("content").and_then(|v| v.as_array()).unwrap();
+        assert_eq!(content.len(), 2);
+        assert_eq!(content[0]["type"], "text");
+        assert_eq!(content[0]["text"], "What is in this photo?");
+        assert_eq!(content[1]["type"], "image_url");
+        assert_eq!(
+            content[1]["image_url"]["url"],
+            "data:image/jpeg;base64,BASE64DATA"
+        );
+    }
+
+    #[test]
+    fn data_url_images_pass_through_unchanged() {
+        let mut msg = ChatMessage::user("");
+        msg.images = Some(vec!["data:image/png;base64,ABCDEF".into()]);
+        let wire = LlamaCppClient::messages_to_openai(&[msg]);
+        let content = wire[0].get("content").and_then(|v| v.as_array()).unwrap();
+        assert_eq!(content.len(), 1);
+        assert_eq!(
+            content[0]["image_url"]["url"],
+            "data:image/png;base64,ABCDEF"
+        );
+    }
+
+    #[test]
+    fn text_only_message_stays_string() {
+        let msg = ChatMessage::user("hello");
+        let wire = LlamaCppClient::messages_to_openai(&[msg]);
+        assert_eq!(wire[0]["content"], "hello");
+        assert!(wire[0]["content"].as_str().is_some());
+    }
+
+    #[test]
+    fn tool_result_inherits_tool_call_id_from_prior_assistant() {
+        let assistant = ChatMessage {
+            role: "assistant".into(),
+            content: String::new(),
+            tool_calls: Some(vec![ToolCall {
+                id: Some("call_42".into()),
+                function: ToolCallFunction {
+                    name: "lookup".into(),
+                    arguments: json!({}),
+                },
+            }]),
+            images: None,
+        };
+        let tool_result = ChatMessage::tool_result("found it");
+        // The tool result is built from content only; it should pick up "call_42".
+        let wire = LlamaCppClient::messages_to_openai(&[assistant, tool_result]);
+        assert_eq!(wire[1]["role"], "tool");
+        assert_eq!(wire[1]["tool_call_id"], "call_42");
+    }
+
+    #[test]
+    fn multiple_tool_results_map_to_sequential_call_ids() {
+        let assistant = ChatMessage {
+            role: "assistant".into(),
+            content: String::new(),
+            tool_calls: Some(vec![
+                ToolCall {
+                    id: Some("call_A".into()),
+                    function: ToolCallFunction {
+                        name: "a".into(),
+                        arguments: json!({}),
+                    },
+                },
+                ToolCall {
+                    id: Some("call_B".into()),
+                    function: ToolCallFunction {
+                        name: "b".into(),
+                        arguments: json!({}),
+                    },
+                },
+            ]),
+            images: None,
+        };
+        let r1 = ChatMessage::tool_result("a result");
+        let r2 = ChatMessage::tool_result("b result");
+        // Results are matched to call ids by position: first -> A, second -> B.
+        let wire = LlamaCppClient::messages_to_openai(&[assistant, r1, r2]);
+        assert_eq!(wire[1]["tool_call_id"], "call_A");
+        assert_eq!(wire[2]["tool_call_id"], "call_B");
+    }
+
+    #[test]
+    fn missing_tool_call_id_gets_synthetic_fallback() {
+        let assistant = ChatMessage {
+            role: "assistant".into(),
+            content: String::new(),
+            tool_calls: Some(vec![ToolCall {
+                id: None,
+                function: ToolCallFunction {
+                    name: "noid".into(),
+                    arguments: json!({}),
+                },
+            }]),
+            images: None,
+        };
+        let wire = LlamaCppClient::messages_to_openai(&[assistant]);
+        let tcs = wire[0]
+            .get("tool_calls")
+            .and_then(|v| v.as_array())
+            .unwrap();
+        assert_eq!(tcs[0]["id"], "call_0");
+    }
+
+    #[test]
+    fn capability_inference_uses_vision_model_and_allowlist() {
+        let mut c = LlamaCppClient::new(None, Some("chat".into()));
+        c.set_vision_model("vision".into());
+        c.set_vision_models(vec!["qwen-vl".into()]);
+        // One entry per case: primary model, vision slot, allowlisted id, neither.
+        let m_chat = json!({ "id": "chat" });
+        let m_vision = json!({ "id": "vision" });
+        let m_qwen = json!({ "id": "qwen-vl" });
+        let m_other = json!({ "id": "embed" });
+        // Run each listing entry through capability inference.
+        let chat = c.parse_model_capabilities(&m_chat);
+        let vision = c.parse_model_capabilities(&m_vision);
+        let qwen = c.parse_model_capabilities(&m_qwen);
+        let other = c.parse_model_capabilities(&m_other);
+        // Vision only for the vision slot and the allowlisted id; chat still gets tools.
+        assert!(!chat.has_vision);
+        assert!(chat.has_tool_calling);
+        assert!(vision.has_vision);
+        assert!(qwen.has_vision);
+        assert!(!other.has_vision);
+    }
+}
diff --git a/src/ai/mod.rs b/src/ai/mod.rs
index 3468325..204da04 100644
--- a/src/ai/mod.rs
+++ b/src/ai/mod.rs
@@ -5,6 +5,7 @@ pub mod face_client;
 pub mod handlers;
 pub mod insight_chat;
 pub mod insight_generator;
+pub mod llamacpp;
 pub mod llm_client;
 pub mod ollama;
 pub mod openrouter;
@@ -20,7 +21,8 @@ pub use handlers::{
     chat_history_handler, chat_rewind_handler, chat_stream_handler, chat_turn_handler,
     delete_insight_handler, export_training_data_handler, generate_agentic_insight_handler,
     generate_insight_handler, get_all_insights_handler, get_available_models_handler,
-    get_insight_handler, get_openrouter_models_handler, rate_insight_handler,
+    get_insight_handler, get_llamacpp_models_handler, get_openrouter_models_handler,
+    rate_insight_handler,
 };
 pub use insight_generator::InsightGenerator;
 #[allow(unused_imports)]
diff --git a/src/bin/populate_knowledge.rs b/src/bin/populate_knowledge.rs
index 29945d7..71f2f8a 100644
--- a/src/bin/populate_knowledge.rs
+++ b/src/bin/populate_knowledge.rs
@@ -195,6 +195,7 @@ async fn main() -> anyhow::Result<()> {
     let generator = InsightGenerator::new(
         ollama,
         None,
+        None,
         sms_client,
         apollo_client,
         insight_dao.clone(),
diff --git a/src/main.rs b/src/main.rs
index 63013ce..3bd3656 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -313,6 +313,7 @@ fn main() -> std::io::Result<()> {
                 .service(ai::get_all_insights_handler)
                 .service(ai::get_available_models_handler)
                 .service(ai::get_openrouter_models_handler)
+                .service(ai::get_llamacpp_models_handler)
                 .service(ai::chat_turn_handler)
                 .service(ai::chat_stream_handler)
                 .service(ai::chat_history_handler)
diff --git a/src/state.rs b/src/state.rs
index 8f1bd4e..96d1c22 100644
--- a/src/state.rs
+++ b/src/state.rs
@@ -2,6 +2,7 @@ use crate::ai::apollo_client::ApolloClient;
 use crate::ai::clip_client::ClipClient;
 use crate::ai::face_client::FaceClient;
 use crate::ai::insight_chat::{ChatLockMap, InsightChatService};
+use crate::ai::llamacpp::LlamaCppClient;
 use crate::ai::openrouter::OpenRouterClient;
 use crate::ai::{InsightGenerator, OllamaClient, SmsApiClient};
 use crate::database::{
@@ -62,6 +63,16 @@ pub struct AppState {
     /// Curated list of OpenRouter model ids exposed to clients. Sourced from
     /// `OPENROUTER_ALLOWED_MODELS` (comma-separated). Empty when unset.
     pub openrouter_allowed_models: Vec<String>,
+    /// `None` when `LLAMA_SWAP_URL` is not configured. Consulted only when a
+    /// request explicitly opts into `backend=llamacpp`. Same shape as the
+    /// `openrouter` slot — present here so handlers can route to it without
+    /// threading through the generator.
+    #[allow(dead_code)]
+    pub llamacpp: Option<Arc<LlamaCppClient>>,
+    /// Curated list of llama-swap model ids exposed to clients. Sourced from
+    /// `LLAMA_SWAP_ALLOWED_MODELS` (comma-separated). Empty when unset; the
+    /// server then falls back to `LLAMA_SWAP_PRIMARY_MODEL`.
+    pub llamacpp_allowed_models: Vec<String>,
     pub sms_client: SmsApiClient,
     pub insight_generator: InsightGenerator,
     /// Chat continuation service. Hold an Arc so handlers can clone cheaply.
@@ -105,6 +116,8 @@ impl AppState {
         ollama: OllamaClient,
         openrouter: Option<Arc<OpenRouterClient>>,
         openrouter_allowed_models: Vec<String>,
+        llamacpp: Option<Arc<LlamaCppClient>>,
+        llamacpp_allowed_models: Vec<String>,
         sms_client: SmsApiClient,
         insight_generator: InsightGenerator,
         insight_chat: Arc<InsightChatService>,
@@ -145,6 +158,8 @@ impl AppState {
             ollama,
             openrouter,
             openrouter_allowed_models,
+            llamacpp,
+            llamacpp_allowed_models,
             sms_client,
             insight_generator,
             insight_chat,
@@ -186,6 +201,9 @@ impl Default for AppState {
         let openrouter = build_openrouter_from_env();
         let openrouter_allowed_models = parse_openrouter_allowed_models();
 
+        let llamacpp = build_llamacpp_from_env();
+        let llamacpp_allowed_models = parse_llamacpp_allowed_models();
+
         let sms_api_url =
             env::var("SMS_API_URL").unwrap_or_else(|_| "http://localhost:8000".to_string());
         let sms_api_token = env::var("SMS_API_TOKEN").ok();
@@ -250,6 +268,7 @@ impl Default for AppState {
         let insight_generator = InsightGenerator::new(
             ollama.clone(),
             openrouter.clone(),
+            llamacpp.clone(),
             sms_client.clone(),
             apollo_client.clone(),
             insight_dao.clone(),
@@ -273,6 +292,7 @@ impl Default for AppState {
             Arc::new(insight_generator.clone()),
             ollama.clone(),
             openrouter.clone(),
+            llamacpp.clone(),
             insight_dao.clone(),
             chat_locks,
         ));
@@ -294,6 +314,8 @@ impl Default for AppState {
             ollama,
             openrouter,
             openrouter_allowed_models,
+            llamacpp,
+            llamacpp_allowed_models,
             sms_client,
             insight_generator,
             insight_chat,
@@ -335,6 +357,50 @@ fn parse_openrouter_allowed_models() -> Vec<String> {
         .collect()
 }
 
+/// Constructs the optional llama-swap client from the process environment.
+/// Yields `None` when `LLAMA_SWAP_URL` is unset (the llamacpp backend is then
+/// unavailable and requests for it return a clear error). The chat, embedding
+/// and vision slot ids are overridden only when their env vars are set; the
+/// defaults match the bundled `llama-swap/config.yaml` (`chat` / `vision` /
+/// `embed`), so a minimal deploy only needs `LLAMA_SWAP_URL`.
+fn build_llamacpp_from_env() -> Option<Arc<LlamaCppClient>> {
+    let url = env::var("LLAMA_SWAP_URL").ok()?;
+    let mut llama = LlamaCppClient::new(Some(url), env::var("LLAMA_SWAP_PRIMARY_MODEL").ok());
+    if let Ok(slot) = env::var("LLAMA_SWAP_EMBEDDING_MODEL") {
+        llama.set_embedding_model(slot);
+    }
+    if let Ok(slot) = env::var("LLAMA_SWAP_VISION_MODEL") {
+        llama.set_vision_model(slot);
+    }
+    llama.set_vision_models(parse_llamacpp_vision_models());
+    Some(Arc::new(llama))
+}
+
+/// Splits `LLAMA_SWAP_ALLOWED_MODELS` on commas for `/insights/llamacpp/models`.
+/// Entries are trimmed and blanks dropped; an unset variable gives an empty vec.
+fn parse_llamacpp_allowed_models() -> Vec<String> {
+    let raw = env::var("LLAMA_SWAP_ALLOWED_MODELS").unwrap_or_default();
+    raw.split(',')
+        .map(str::trim)
+        .filter(|id| !id.is_empty())
+        .map(String::from)
+        .collect()
+}
+
+/// Reads `LLAMA_SWAP_VISION_MODELS` (comma-separated) into a vec.
+///
+/// These are the slot ids that capability lookups report with
+/// `has_vision = true`. The configured `vision_model` (default `vision`) always
+/// counts as vision-capable whether or not it is listed here, so a deploy that
+/// only uses the default vision slot can leave the variable unset.
+fn parse_llamacpp_vision_models() -> Vec<String> {
+    let raw = env::var("LLAMA_SWAP_VISION_MODELS").unwrap_or_default();
+    // Surrounding whitespace is ignored and empty segments (such as the one
+    // after a trailing comma) are skipped.
+    let ids = raw.split(',').map(str::trim).filter(|id| !id.is_empty());
+    ids.map(String::from).collect()
+}
+
 #[cfg(test)]
 impl AppState {
     /// Creates an AppState instance for testing with temporary directories
@@ -397,6 +463,7 @@ impl AppState {
         let insight_generator = InsightGenerator::new(
             ollama.clone(),
             None,
+            None,
             sms_client.clone(),
             apollo_client.clone(),
             insight_dao.clone(),
@@ -418,6 +485,7 @@ impl AppState {
             Arc::new(insight_generator.clone()),
             ollama.clone(),
             None,
+            None,
             insight_dao.clone(),
             chat_locks,
         ));
@@ -445,6 +513,8 @@ impl AppState {
             ollama,
             None,
             Vec::new(),
+            None,
+            Vec::new(),
             sms_client,
             insight_generator,
             insight_chat,