diff --git a/src/ai/backend.rs b/src/ai/backend.rs
new file mode 100644
index 0000000..0fe7747
--- /dev/null
+++ b/src/ai/backend.rs
@@ -0,0 +1,118 @@
+use anyhow::{Result, anyhow};
+
+use crate::ai::llm_client::LlmClient;
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum BackendKind {
+    Local,
+    Hybrid,
+}
+
+impl BackendKind {
+    pub fn parse(s: &str) -> Result<Self> {
+        match s.trim().to_lowercase().as_str() {
+            "local" | "" => Ok(Self::Local),
+            "hybrid" => Ok(Self::Hybrid),
+            other => Err(anyhow!("unknown backend '{}'; expected 'local' or 'hybrid'", other)),
+        }
+    }
+
+    pub fn as_str(&self) -> &'static str {
+        match self {
+            Self::Local => "local",
+            Self::Hybrid => "hybrid",
+        }
+    }
+}
+
+impl std::fmt::Display for BackendKind {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.write_str(self.as_str())
+    }
+}
+
+pub struct SamplingOverrides {
+    pub model: Option<String>,
+    pub num_ctx: Option<i32>,
+    pub temperature: Option<f32>,
+    pub top_p: Option<f32>,
+    pub top_k: Option<i32>,
+    pub min_p: Option<f32>,
+}
+
+impl SamplingOverrides {
+    pub fn has_sampling(&self) -> bool {
+        self.temperature.is_some()
+            || self.top_p.is_some()
+            || self.top_k.is_some()
+            || self.min_p.is_some()
+    }
+}
+
+pub struct ResolvedBackend {
+    chat: Box<dyn LlmClient>,
+    local: Box<dyn LlmClient>,
+    pub kind: BackendKind,
+    /// `true` when the chat model receives images directly (Ollama with
+    /// vision, or llamacpp). `false` for hybrid where we describe-then-inline.
+    pub images_inline: bool,
+}
+
+impl ResolvedBackend {
+    pub fn new(
+        chat: Box<dyn LlmClient>,
+        local: Box<dyn LlmClient>,
+        kind: BackendKind,
+        images_inline: bool,
+    ) -> Self {
+        Self { chat, local, kind, images_inline }
+    }
+
+    pub fn chat(&self) -> &dyn LlmClient {
+        self.chat.as_ref()
+    }
+
+    pub fn local(&self) -> &dyn LlmClient {
+        self.local.as_ref()
+    }
+
+    pub fn model(&self) -> &str {
+        self.chat.primary_model()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn parse_backend_kind() {
+        assert_eq!(BackendKind::parse("local").unwrap(), BackendKind::Local);
+        assert_eq!(BackendKind::parse("hybrid").unwrap(), BackendKind::Hybrid);
+        assert_eq!(BackendKind::parse("  Local ").unwrap(), BackendKind::Local);
+        assert_eq!(BackendKind::parse("HYBRID").unwrap(), BackendKind::Hybrid);
+        assert_eq!(BackendKind::parse("").unwrap(), BackendKind::Local);
+        assert!(BackendKind::parse("vllm").is_err());
+    }
+
+    #[test]
+    fn backend_kind_as_str_roundtrips() {
+        assert_eq!(BackendKind::parse(BackendKind::Local.as_str()).unwrap(), BackendKind::Local);
+        assert_eq!(BackendKind::parse(BackendKind::Hybrid.as_str()).unwrap(), BackendKind::Hybrid);
+    }
+
+    #[test]
+    fn sampling_overrides_has_sampling() {
+        let empty = SamplingOverrides {
+            model: None, num_ctx: None, temperature: None,
+            top_p: None, top_k: None, min_p: None,
+        };
+        assert!(!empty.has_sampling());
+
+        let with_temp = SamplingOverrides {
+            model: None, num_ctx: Some(4096), temperature: Some(0.7),
+            top_p: None, top_k: None, min_p: None,
+        };
+        assert!(with_temp.has_sampling());
+    }
+}
diff --git a/src/ai/handlers.rs b/src/ai/handlers.rs
index c86a5c8..a7a3720 100644
--- a/src/ai/handlers.rs
+++ b/src/ai/handlers.rs
@@ -495,7 +495,7 @@ pub async fn get_available_models_handler(
             .iter()
             .map(|name| ModelCapabilities {
                 name: name.clone(),
-                has_vision: name == &lc.vision_model,
+                has_vision: true,
                 has_tool_calling: true,
             })
             .collect();
diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs
index b549dab..54350df 100644
--- a/src/ai/insight_chat.rs
+++ b/src/ai/insight_chat.rs
@@ -311,7 +311,7 @@ impl InsightChatService {
         let is_hybrid = effective_backend == "hybrid";
         let local_via_llamacpp =
             crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
-        let describes_then_inlines = is_hybrid || local_via_llamacpp;
+        let describes_then_inlines = is_hybrid;
         span.set_attribute(KeyValue::new("backend", effective_backend.clone()));
 
         // 4. Build the chat backend client. Hybrid → OpenRouter; local with
@@ -408,12 +408,11 @@ impl InsightChatService {
         let model_used = chat_backend.primary_model().to_string();
         span.set_attribute(KeyValue::new("model", model_used.clone()));
 
-        // 5. Decide vision + tool set. In describe-then-inline modes
-        //    (hybrid, llamacpp) we always omit `describe_photo` (matches the
-        //    original generation flow). In local we trust the stored
-        //    history's first-user shape: if it carries `images`, the
-        //    original model was vision-capable, and we keep `describe_photo`
-        //    available.
+        // 5. Decide vision + tool set. In describe-then-inline mode
+        //    (hybrid only) we omit `describe_photo`. In local and llamacpp
+        //    we trust the stored history's first-user shape: if it carries
+        //    `images`, the original model was vision-capable, and we keep
+        //    `describe_photo` available.
         let local_first_user_has_image = messages
             .iter()
             .find(|m| m.role == "user")
@@ -821,9 +820,7 @@ impl InsightChatService {
             .unwrap_or_else(|| stored_backend.clone());
         validate_cross_replay(&stored_backend, &effective_backend)?;
         let is_hybrid = effective_backend == "hybrid";
-        let local_via_llamacpp =
-            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
-        let describes_then_inlines = is_hybrid || local_via_llamacpp;
+        let describes_then_inlines = is_hybrid;
 
         let max_iterations = req
             .max_iterations
@@ -842,10 +839,9 @@ impl InsightChatService {
         let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
         let model_used = chat_backend.primary_model().to_string();
 
-        // Tool set — local mode + first user turn carries an image →
-        // offer describe_photo. Describe-then-inline modes (hybrid OR
-        // local_via_llamacpp): visual description was inlined when the
-        // insight was bootstrapped, no describe tool needed.
+        // Tool set — local/llamacpp mode + first user turn carries an image →
+        // offer describe_photo. Describe-then-inline mode (hybrid only):
+        // visual description was inlined at bootstrap, no describe tool needed.
         let local_first_user_has_image = messages
             .iter()
             .find(|m| m.role == "user")
@@ -991,7 +987,7 @@ impl InsightChatService {
         let is_hybrid = effective_backend == "hybrid";
         let local_via_llamacpp =
             crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
-        let describes_then_inlines = is_hybrid || local_via_llamacpp;
+        let describes_then_inlines = is_hybrid;
 
         let max_iterations = req
             .max_iterations
@@ -1023,10 +1019,9 @@ impl InsightChatService {
                 _ => None,
             });
 
-        // Describe-then-inline (hybrid OR local_via_llamacpp): pre-describe
-        // the image so a text-only chat model gets the visual description
-        // inline. Vision source follows `LLM_BACKEND`: llama-swap when
-        // `local_via_llamacpp`, else Ollama.
+        // Describe-then-inline (hybrid only): pre-describe the image so a
+        // text-only chat model gets the visual description inline. llamacpp
+        // sends images directly to the chat model.
         let visual_block = if describes_then_inlines {
             match image_base64.as_deref() {
                 Some(b64) => {
diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs
index 08fcfef..075001a 100644
--- a/src/ai/insight_generator.rs
+++ b/src/ai/insight_generator.rs
@@ -10,6 +10,7 @@ use std::io::Cursor;
 use std::sync::{Arc, Mutex};
 
 use crate::ai::apollo_client::{ApolloClient, ApolloPlace};
+use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides};
 use crate::ai::llm_client::LlmClient;
 use crate::ai::ollama::{ChatMessage, OllamaClient, Tool};
 use crate::ai::llamacpp::LlamaCppClient;
@@ -1781,14 +1782,18 @@ Return ONLY the summary, nothing else."#,
         );
 
         let started = std::time::Instant::now();
-        let response = ollama
-            .generate_no_think(
-                &prompt,
-                Some(
-                    "You are a terse relevance ranker. You output only numbers separated by commas.",
-                ),
-            )
-            .await?;
+        let system = Some(
+            "You are a terse relevance ranker. You output only numbers separated by commas.",
+        );
+        let response = if crate::ai::local_backend_is_llamacpp() {
+            if let Some(ref lc) = self.llamacpp {
+                lc.generate(&prompt, system, None).await?
+            } else {
+                ollama.generate_no_think(&prompt, system).await?
+            }
+        } else {
+            ollama.generate_no_think(&prompt, system).await?
+        };
         log::info!(
             "rerank: finished in {} ms (prompt={} chars)",
             started.elapsed().as_millis(),
@@ -2360,7 +2365,8 @@ Return ONLY the summary, nothing else."#,
         out
     }
 
-    /// Tool: describe_photo — generate a visual description of the photo
+    /// Tool: describe_photo — generate a visual description of the photo.
+    /// Routes through llama-swap when `LLM_BACKEND=llamacpp`, Ollama otherwise.
     async fn tool_describe_photo(
         &self,
         ollama: &OllamaClient,
@@ -2369,10 +2375,21 @@ Return ONLY the summary, nothing else."#,
         log::info!("tool_describe_photo: generating visual description");
 
         match image_base64 {
-            Some(img) => match ollama.generate_photo_description(img).await {
-                Ok(desc) => desc,
-                Err(e) => format!("Error describing photo: {}", e),
-            },
+            Some(img) => {
+                let result = if crate::ai::local_backend_is_llamacpp() {
+                    if let Some(ref lc) = self.llamacpp {
+                        lc.describe_image(img).await
+                    } else {
+                        ollama.generate_photo_description(img).await
+                    }
+                } else {
+                    ollama.generate_photo_description(img).await
+                };
+                match result {
+                    Ok(desc) => desc,
+                    Err(e) => format!("Error describing photo: {}", e),
+                }
+            }
             None => "No image available for description.".to_string(),
         }
     }
@@ -3560,6 +3577,177 @@ Return ONLY the summary, nothing else."#,
         out
     }
 
+    /// Consolidate client construction for the agentic insight loop.
+    ///
+    /// Returns a [`ResolvedBackend`] containing the **chat** client (the model
+    /// that drives the agent loop), the **local** client (always the configured
+    /// local backend — Ollama or llama-swap — for utility calls like
+    /// describe_image, rerank, embeddings), the backend kind, and whether the
+    /// chat model receives images inline.
+    pub async fn resolve_backend(
+        &self,
+        kind: BackendKind,
+        overrides: &SamplingOverrides,
+    ) -> Result<ResolvedBackend> {
+        let local_via_llamacpp =
+            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
+        let is_hybrid = kind == BackendKind::Hybrid;
+
+        // ── chat client ────────────────────────────────────────────────
+        let chat: Box<dyn LlmClient> = if is_hybrid {
+            // Hybrid: chat through OpenRouter.
+            let arc = self.openrouter.as_ref().ok_or_else(|| {
+                anyhow::anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
+            })?;
+            let mut c: OpenRouterClient = (**arc).clone();
+            if let Some(ref m) = overrides.model {
+                c.primary_model = m.clone();
+            }
+            if overrides.has_sampling() {
+                c.set_sampling_params(
+                    overrides.temperature,
+                    overrides.top_p,
+                    overrides.top_k,
+                    overrides.min_p,
+                );
+            }
+            if let Some(ctx) = overrides.num_ctx {
+                c.set_num_ctx(Some(ctx));
+            }
+            Box::new(c)
+        } else if local_via_llamacpp {
+            // Local via llama-swap.
+            let arc = self.llamacpp.as_ref().ok_or_else(|| {
+                anyhow::anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured")
+            })?;
+            let mut c: LlamaCppClient = (**arc).clone();
+            if let Some(ref m) = overrides.model {
+                c.primary_model = m.clone();
+            }
+            if overrides.has_sampling() {
+                c.set_sampling_params(
+                    overrides.temperature,
+                    overrides.top_p,
+                    overrides.top_k,
+                    overrides.min_p,
+                );
+            }
+            if let Some(ctx) = overrides.num_ctx {
+                c.set_num_ctx(Some(ctx));
+            }
+            Box::new(c)
+        } else {
+            // Pure Ollama local.
+            let mut ollama_client = if let Some(ref model) = overrides.model {
+                OllamaClient::new(
+                    self.ollama.primary_url.clone(),
+                    self.ollama.fallback_url.clone(),
+                    model.clone(),
+                    Some(model.clone()),
+                )
+            } else {
+                self.ollama.clone()
+            };
+            if overrides.has_sampling() {
+                ollama_client.set_sampling_params(
+                    overrides.temperature,
+                    overrides.top_p,
+                    overrides.top_k,
+                    overrides.min_p,
+                );
+            }
+            if let Some(ctx) = overrides.num_ctx {
+                ollama_client.set_num_ctx(Some(ctx));
+            }
+            Box::new(ollama_client)
+        };
+
+        // ── local client (utility calls: rerank, describe_image, etc.) ─
+        let local: Box<dyn LlmClient> = if local_via_llamacpp {
+            Box::new(self.llamacpp.as_ref().unwrap().as_ref().clone())
+        } else {
+            Box::new(self.ollama.clone())
+        };
+
+        // ── images_inline ──────────────────────────────────────────────
+        let images_inline = if is_hybrid {
+            // Hybrid: chat model never sees images — describe-then-inject.
+            false
+        } else if local_via_llamacpp {
+            // llama-swap models receive images directly via OpenAI content
+            // parts. Capability probing isn't available (no `/api/show`),
+            // so assume vision support; a misconfigured model surfaces as
+            // a chat-call error.
+            true
+        } else {
+            // Pure Ollama: probe model capabilities.
+            let ollama_for_caps = if let Some(ref model) = overrides.model {
+                // Verify custom model is available on at least one server.
+                let available_on_primary =
+                    OllamaClient::is_model_available(&self.ollama.primary_url, model)
+                        .await
+                        .unwrap_or(false);
+
+                let available_on_fallback =
+                    if let Some(ref fallback_url) = self.ollama.fallback_url {
+                        OllamaClient::is_model_available(fallback_url, model)
+                            .await
+                            .unwrap_or(false)
+                    } else {
+                        false
+                    };
+
+                if !available_on_primary && !available_on_fallback {
+                    anyhow::bail!(
+                        "model not available: '{}' not found on any configured server",
+                        model
+                    );
+                }
+                model.as_str()
+            } else {
+                self.ollama.primary_model.as_str()
+            };
+
+            let capabilities = match OllamaClient::check_model_capabilities(
+                &self.ollama.primary_url,
+                ollama_for_caps,
+            )
+            .await
+            {
+                Ok(caps) => caps,
+                Err(_) => {
+                    let fallback_url =
+                        self.ollama.fallback_url.as_deref().ok_or_else(|| {
+                            anyhow::anyhow!(
+                                "Failed to check model capabilities for '{}': model not found on primary server and no fallback configured",
+                                ollama_for_caps
+                            )
+                        })?;
+                    OllamaClient::check_model_capabilities(fallback_url, ollama_for_caps)
+                        .await
+                        .map_err(|e| {
+                            anyhow::anyhow!(
+                                "Failed to check model capabilities for '{}': {}",
+                                ollama_for_caps,
+                                e
+                            )
+                        })?
+                }
+            };
+
+            if !capabilities.has_tool_calling {
+                anyhow::bail!(
+                    "tool calling not supported by model '{}'",
+                    ollama_for_caps
+                );
+            }
+
+            capabilities.has_vision
+        };
+
+        Ok(ResolvedBackend::new(chat, local, kind, images_inline))
+    }
+
     pub async fn generate_agentic_insight_for_photo(
         &self,
         file_path: &str,
@@ -3602,26 +3790,22 @@ Return ONLY the summary, nothing else."#,
         span.set_attribute(KeyValue::new("backend", backend_label.clone()));
         let is_hybrid = backend_label == "hybrid";
         // `LLM_BACKEND=llamacpp` swaps Ollama out for llama-swap as the
-        // "local" stack — chat + vision describe + embeddings all route
-        // through llama-swap. In hybrid mode this still applies to vision
-        // describe (chat continues to go to OpenRouter). The chat slot is
-        // text-only in either case, so we describe-then-inline.
+        // "local" stack — chat + embeddings route through llama-swap.
+        // llamacpp models receive images directly (vision-capable); only
+        // hybrid mode (OpenRouter chat) uses describe-then-inline.
         let local_via_llamacpp =
             crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
-        // Describe-then-inline: hybrid (chat is OpenRouter, text-only) or
-        // any path where chat goes through llama-swap (chat slot is text-only).
-        let describes_then_inlines = is_hybrid || local_via_llamacpp;
+        let describes_then_inlines = is_hybrid;
+        let ollama_is_chat = !is_hybrid && !local_via_llamacpp;
 
         // 1b. Always build an Ollama client. In local mode it owns the chat
         //     loop; in hybrid/llamacpp mode it still handles tool-local calls
         //     (e.g. future embedding-backed tools). The chat backend is
         //     selected separately below.
-        //     Sampling overrides only apply in local mode — in
-        //     hybrid/llamacpp the user's params belong to the alternate chat
-        //     client.
-        let apply_sampling_to_ollama = !describes_then_inlines;
+        //     Sampling overrides only apply when Ollama is the chat backend.
+        let apply_sampling_to_ollama = ollama_is_chat;
         let mut ollama_client = if let Some(ref model) = custom_model
-            && !describes_then_inlines
+            && ollama_is_chat
         {
             log::info!("Using custom model for agentic: {}", model);
             span.set_attribute(KeyValue::new("custom_model", model.clone()));
@@ -3632,7 +3816,7 @@ Return ONLY the summary, nothing else."#,
                 Some(model.clone()),
             )
         } else {
-            if !describes_then_inlines {
+            if ollama_is_chat {
                 span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone()));
             }
             self.ollama.clone()
@@ -3752,10 +3936,13 @@ Return ONLY the summary, nothing else."#,
         //      (OPENROUTER_ALLOWED_MODELS) — no live precheck. A bad model id
         //      surfaces as a chat-call error on the next step.
         let has_vision = if describes_then_inlines {
-            // In hybrid + llamacpp modes the chat model never sees images
-            // directly — we describe-then-inject, so `has_vision` drives only
-            // whether we bother loading the image to describe it, which we
-            // always do.
+            // Hybrid: chat model never sees images — describe-then-inject.
+            true
+        } else if local_via_llamacpp {
+            // llama-swap models receive images directly via OpenAI content
+            // parts. Capability probing isn't available (no `/api/show`),
+            // so assume vision support; a misconfigured model surfaces as
+            // a chat-call error.
             true
         } else {
             if let Some(ref model_name) = custom_model {
@@ -3935,10 +4122,9 @@ Return ONLY the summary, nothing else."#,
             None
         };
 
-        // describe-then-inline path. Vision describe routes through whichever
-        // `LLM_BACKEND` is configured — llama-swap when `local_via_llamacpp`
-        // is set (even in hybrid mode, since chat is OpenRouter but vision
-        // stays on the local stack), otherwise Ollama.
+        // describe-then-inline path (hybrid only). Vision describe routes
+        // through whichever local backend is configured — llama-swap when
+        // `local_via_llamacpp`, otherwise Ollama.
         let inlined_visual_description: Option<String> = if describes_then_inlines {
             match image_base64.as_deref() {
                 Some(b64) => {
@@ -4043,10 +4229,10 @@ Return ONLY the summary, nothing else."#,
         );
 
         // 10. Define tools. Gate flags computed from current data presence;
-        //     describe-then-inline modes (hybrid OR local_via_llamacpp) omit
-        //     describe_photo since the chat model receives the visual
-        //     description inline (so we pass `false` for has_vision in
-        //     those modes regardless of the model's actual capability).
+        //     hybrid mode omits describe_photo since the chat model receives
+        //     the visual description inline (so we pass `false` for
+        //     has_vision in that mode regardless of the model's actual
+        //     capability).
         let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines);
         let tools = Self::build_tool_definitions(gate_opts);
 
diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs
index d23fad5..2a03aed 100644
--- a/src/ai/llamacpp.rs
+++ b/src/ai/llamacpp.rs
@@ -50,9 +50,9 @@ pub struct LlamaCppClient {
     /// Embedding model slot id (e.g. `"embed"`). Used for
     /// `generate_embeddings`.
     pub embedding_model: String,
-    /// Vision model slot id (e.g. `"vision"`). Used for `describe_image`,
-    /// and the only slot that reports `has_vision = true` in capability
-    /// lookups (llama-swap's `/v1/models` doesn't surface modality).
+    /// Vision model slot id. Used for `describe_image` routing. Defaults
+    /// to `primary_model` so describe_image works out of the box; override
+    /// via `LLAMA_SWAP_VISION_MODEL` for a dedicated vision slot.
     pub vision_model: String,
     num_ctx: Option<i32>,
     temperature: Option<f32>,
@@ -67,6 +67,7 @@ impl LlamaCppClient {
             .ok()
             .and_then(|v| v.parse::<u64>().ok())
             .unwrap_or(DEFAULT_REQUEST_TIMEOUT_SECS);
+        let pm = primary_model.unwrap_or_else(|| DEFAULT_PRIMARY_MODEL.to_string());
         Self {
             client: Client::builder()
                 .connect_timeout(Duration::from_secs(10))
@@ -74,9 +75,9 @@ impl LlamaCppClient {
                 .build()
                 .unwrap_or_else(|_| Client::new()),
             base_url: base_url.unwrap_or_else(|| DEFAULT_BASE_URL.to_string()),
-            primary_model: primary_model.unwrap_or_else(|| DEFAULT_PRIMARY_MODEL.to_string()),
+            primary_model: pm.clone(),
             embedding_model: DEFAULT_EMBEDDING_MODEL.to_string(),
-            vision_model: DEFAULT_VISION_MODEL.to_string(),
+            vision_model: pm,
             num_ctx: None,
             temperature: None,
             top_p: None,
@@ -124,6 +125,13 @@ impl LlamaCppClient {
             let mut obj = serde_json::Map::new();
             obj.insert("role".into(), Value::String(msg.role.clone()));
 
+            // Assistant messages with tool_calls must emit `content: null`
+            // (not `""`) — some Jinja templates (Mistral-family) treat
+            // empty-string content as a regular message rather than a
+            // tool-calling turn, breaking role-alternation validation.
+            let has_tool_calls = msg.role == "assistant"
+                && msg.tool_calls.as_ref().is_some_and(|tcs| !tcs.is_empty());
+
             match &msg.images {
                 Some(images) if !images.is_empty() => {
                     let mut parts: Vec<Value> = Vec::new();
@@ -139,6 +147,9 @@ impl LlamaCppClient {
                     }
                     obj.insert("content".into(), Value::Array(parts));
                 }
+                _ if has_tool_calls && msg.content.is_empty() => {
+                    obj.insert("content".into(), Value::Null);
+                }
                 _ => {
                     obj.insert("content".into(), Value::String(msg.content.clone()));
                 }
@@ -276,6 +287,13 @@ impl LlamaCppClient {
         tools: Vec<Tool>,
     ) -> Result<(ChatMessage, Option<i32>, Option<i32>)> {
         let url = format!("{}/chat/completions", self.base_url);
+        let roles: Vec<&str> = messages.iter().map(|m| m.role.as_str()).collect();
+        log::debug!(
+            "llama-swap chat_completion: model={} roles={:?} tools={}",
+            model,
+            roles,
+            tools.len()
+        );
         let mut body = serde_json::Map::new();
         body.insert("model".into(), Value::String(model.to_string()));
         body.insert(
@@ -381,6 +399,13 @@ impl LlmClient for LlamaCppClient {
         tools: Vec<Tool>,
     ) -> Result<BoxStream<'static, Result<LlmStreamEvent>>> {
         let url = format!("{}/chat/completions", self.base_url);
+        let roles: Vec<&str> = messages.iter().map(|m| m.role.as_str()).collect();
+        log::debug!(
+            "llama-swap stream: model={} roles={:?} tools={}",
+            self.primary_model,
+            roles,
+            tools.len()
+        );
         let mut body = serde_json::Map::new();
         body.insert(
             "model".into(),
@@ -681,12 +706,12 @@ impl LlamaCppClient {
             .and_then(|v| v.as_str())
             .unwrap_or_default()
             .to_string();
-        let has_vision = name == self.vision_model;
-        // Tool calling is the default for llama-swap entries we configure
-        // (--jinja flag); no negative-list mechanism yet, so report true.
+        // llama-swap doesn't expose per-model modality flags. Assume all
+        // configured models support vision and tool calling; a model
+        // without multimodal support surfaces as a chat-call error.
         ModelCapabilities {
             name,
-            has_vision,
+            has_vision: true,
             has_tool_calling: true,
         }
     }
@@ -943,7 +968,7 @@ mod tests {
     }
 
     #[test]
-    fn capability_inference_marks_only_vision_slot() {
+    fn all_models_report_vision_capable() {
         let mut c = LlamaCppClient::new(None, Some("chat".into()));
         c.set_vision_model("vision".into());
 
@@ -955,9 +980,9 @@ mod tests {
         let vision = c.parse_model_capabilities(&m_vision);
         let other = c.parse_model_capabilities(&m_other);
 
-        assert!(!chat.has_vision);
+        assert!(chat.has_vision);
         assert!(chat.has_tool_calling);
         assert!(vision.has_vision);
-        assert!(!other.has_vision);
+        assert!(other.has_vision);
     }
 }
diff --git a/src/ai/mod.rs b/src/ai/mod.rs
index 8d634fd..09cbfda 100644
--- a/src/ai/mod.rs
+++ b/src/ai/mod.rs
@@ -1,4 +1,5 @@
 pub mod apollo_client;
+pub mod backend;
 pub mod clip_client;
 pub mod daily_summary_job;
 pub mod face_client;