From a8a661f70a8022f304851985575bc58193b9354d Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Sun, 24 May 2026 15:00:50 -0400
Subject: [PATCH] ai: extract ResolvedBackend, remove ~480 lines of duplicated
 dispatch
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Replace 5 copies of the ~80-line backend resolution pattern with a
single InsightGenerator::resolve_backend() builder that returns a
ResolvedBackend (chat + local clients, BackendKind enum, images_inline
flag). Tool dispatch now takes &ResolvedBackend instead of
&OllamaClient + model + backend strings.

Remove duplicated ollama/openrouter/llamacpp fields from
InsightChatService — InsightGenerator owns them and resolve_backend
uses them. Delete build_chat_clients (replaced by resolve_backend).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/ai/insight_chat.rs      | 362 +++++++-----------------------
 src/ai/insight_generator.rs | 430 +++++++-----------------------------
 src/state.rs                |   6 -
 3 files changed, 158 insertions(+), 640 deletions(-)
diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs
index 54350df..6d52f8b 100644
--- a/src/ai/insight_chat.rs
+++ b/src/ai/insight_chat.rs
@@ -6,11 +6,9 @@ use std::collections::HashMap;
 use std::sync::{Arc, Mutex};
 use tokio::sync::Mutex as TokioMutex;
 
+use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides};
 use crate::ai::insight_generator::InsightGenerator;
-use crate::ai::llm_client::{ChatMessage, LlmClient, LlmStreamEvent, Tool};
-use crate::ai::ollama::OllamaClient;
-use crate::ai::llamacpp::LlamaCppClient;
-use crate::ai::openrouter::OpenRouterClient;
+use crate::ai::llm_client::{ChatMessage, LlmStreamEvent, Tool};
 use crate::database::InsightDao;
 use crate::database::models::InsertPhotoInsight;
 use crate::otel::global_tracer;
@@ -92,9 +90,6 @@ pub struct ChatTurnResult {
 #[derive(Clone)]
 pub struct InsightChatService {
     generator: Arc<InsightGenerator>,
-    ollama: OllamaClient,
-    openrouter: Option<Arc<OpenRouterClient>>,
-    llamacpp: Option<Arc<LlamaCppClient>>,
     insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
     chat_locks: ChatLockMap,
 }
@@ -102,17 +97,11 @@ pub struct InsightChatService {
 impl InsightChatService {
     pub fn new(
         generator: Arc<InsightGenerator>,
-        ollama: OllamaClient,
-        openrouter: Option<Arc<OpenRouterClient>>,
-        llamacpp: Option<Arc<LlamaCppClient>>,
         insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
         chat_locks: ChatLockMap,
     ) -> Self {
         Self {
             generator,
-            ollama,
-            openrouter,
-            llamacpp,
             insight_dao,
             chat_locks,
         }
@@ -308,16 +297,9 @@ impl InsightChatService {
             .filter(|s| !s.is_empty())
             .unwrap_or_else(|| stored_backend.clone());
         validate_cross_replay(&stored_backend, &effective_backend)?;
-        let is_hybrid = effective_backend == "hybrid";
-        let local_via_llamacpp =
-            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
-        let describes_then_inlines = is_hybrid;
-        span.set_attribute(KeyValue::new("backend", effective_backend.clone()));
+        let kind = BackendKind::parse(&effective_backend)?;
+        span.set_attribute(KeyValue::new("backend", kind.as_str()));
 
-        // 4. Build the chat backend client. Hybrid → OpenRouter; local with
-        //    `LLM_BACKEND=llamacpp` → llama-swap; otherwise Ollama. Clones
-        //    so per-request sampling/model overrides don't leak into shared
-        //    state.
         let max_iterations = req
             .max_iterations
             .unwrap_or(DEFAULT_MAX_ITERATIONS)
@@ -325,113 +307,36 @@ impl InsightChatService {
         span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64));
 
         let stored_model = insight.model_version.clone();
-        let custom_model = req
-            .model
-            .clone()
-            .or_else(|| Some(stored_model.clone()))
-            .filter(|m| !m.is_empty());
-
-        let mut ollama_client = self.ollama.clone();
-        let mut openrouter_client: Option<OpenRouterClient> = None;
-        let mut llamacpp_client: Option<LlamaCppClient> = None;
-
-        if is_hybrid {
-            let arc = self.openrouter.as_ref().ok_or_else(|| {
-                anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
-            })?;
-            let mut c: OpenRouterClient = (**arc).clone();
-            if let Some(ref m) = custom_model {
-                c.primary_model = m.clone();
-            }
-            if req.temperature.is_some()
-                || req.top_p.is_some()
-                || req.top_k.is_some()
-                || req.min_p.is_some()
-            {
-                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
-            }
-            if let Some(ctx) = req.num_ctx {
-                c.set_num_ctx(Some(ctx));
-            }
-            openrouter_client = Some(c);
-        } else if local_via_llamacpp {
-            let arc = self.llamacpp.as_ref().ok_or_else(|| {
-                anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured")
-            })?;
-            let mut c: LlamaCppClient = (**arc).clone();
-            if let Some(ref m) = custom_model {
-                c.primary_model = m.clone();
-            }
-            if req.temperature.is_some()
-                || req.top_p.is_some()
-                || req.top_k.is_some()
-                || req.min_p.is_some()
-            {
-                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
-            }
-            if let Some(ctx) = req.num_ctx {
-                c.set_num_ctx(Some(ctx));
-            }
-            llamacpp_client = Some(c);
-        } else {
-            // Pure local (Ollama): model swap. Build a new client when the
-            // chat model differs from the configured one.
-            if let Some(ref m) = custom_model
-                && m != &self.ollama.primary_model
-            {
-                ollama_client = OllamaClient::new(
-                    self.ollama.primary_url.clone(),
-                    self.ollama.fallback_url.clone(),
-                    m.clone(),
-                    Some(m.clone()),
-                );
-            }
-            if req.temperature.is_some()
-                || req.top_p.is_some()
-                || req.top_k.is_some()
-                || req.min_p.is_some()
-            {
-                ollama_client.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
-            }
-            if let Some(ctx) = req.num_ctx {
-                ollama_client.set_num_ctx(Some(ctx));
-            }
-        }
-
-        let chat_backend: &dyn LlmClient = if let Some(ref c) = llamacpp_client {
-            c
-        } else if let Some(ref c) = openrouter_client {
-            c
-        } else {
-            &ollama_client
+        let overrides = SamplingOverrides {
+            model: req.model.clone()
+                .or_else(|| Some(stored_model.clone()))
+                .filter(|m| !m.is_empty()),
+            num_ctx: req.num_ctx,
+            temperature: req.temperature,
+            top_p: req.top_p,
+            top_k: req.top_k,
+            min_p: req.min_p,
         };
-        let model_used = chat_backend.primary_model().to_string();
+        let backend = self.generator.resolve_backend(kind, &overrides).await?;
+        let model_used = backend.model().to_string();
         span.set_attribute(KeyValue::new("model", model_used.clone()));
 
-        // 5. Decide vision + tool set. In describe-then-inline mode
-        //    (hybrid only) we omit `describe_photo`. In local and llamacpp
-        //    we trust the stored history's first-user shape: if it carries
-        //    `images`, the original model was vision-capable, and we keep
-        //    `describe_photo` available.
+        // 5. Decide vision + tool set. In hybrid (describe-then-inline) mode
+        //    we omit `describe_photo`. Otherwise trust the stored history:
+        //    if the first user message carries images, describe_photo stays.
         let local_first_user_has_image = messages
             .iter()
             .find(|m| m.role == "user")
             .and_then(|m| m.images.as_ref())
             .map(|imgs| !imgs.is_empty())
             .unwrap_or(false);
-        let offer_describe_tool = !describes_then_inlines && local_first_user_has_image;
-        // current_gate_opts(has_vision) sets gate_opts.has_vision = has_vision
-        // and probes the per-table presence flags. Pass `offer_describe_tool`
-        // directly — the `!is_hybrid && local_first_user_has_image` decision
-        // is the chat-path's vision predicate.
+        let offer_describe_tool = backend.images_inline && local_first_user_has_image;
         let gate_opts = self.generator.current_gate_opts_for_persona(
             offer_describe_tool,
             Some((req.user_id, &active_persona)),
         );
         let tools = InsightGenerator::build_tool_definitions(gate_opts);
 
-        // Image base64 only needed when describe_photo is on the menu. Load
-        // lazily to avoid disk IO when the loop never invokes it.
         let image_base64: Option<String> = if offer_describe_tool {
             self.generator.load_image_as_base64(&normalized).ok()
         } else {
@@ -480,13 +385,13 @@ impl InsightChatService {
             iterations_used = iteration + 1;
             log::info!("Chat iteration {}/{}", iterations_used, max_iterations);
 
-            let (response, prompt_tokens, eval_tokens) = chat_backend
+            let (response, prompt_tokens, eval_tokens) = backend
+                .chat()
                 .chat_with_tools(messages.clone(), tools.clone())
                 .await?;
             last_prompt_eval_count = prompt_tokens;
             last_eval_count = eval_tokens;
 
-            // Ollama rejects non-object tool-call arguments on replay.
             let mut response = response;
             if let Some(ref mut tcs) = response.tool_calls {
                 for tc in tcs.iter_mut() {
@@ -514,13 +419,11 @@ impl InsightChatService {
                         .execute_tool(
                             &tool_call.function.name,
                             &tool_call.function.arguments,
-                            &ollama_client,
+                            &backend,
                             &image_base64,
                             &normalized,
                             req.user_id,
                             &active_persona,
-                            &model_used,
-                            &effective_backend,
                             &loop_cx,
                         )
                         .await;
@@ -534,8 +437,6 @@ impl InsightChatService {
         }
 
         if final_content.is_empty() {
-            // The model never produced a final answer; ask once more without
-            // tools to force a textual reply.
             log::info!(
                 "Chat loop exhausted after {} iterations, requesting final answer",
                 iterations_used
@@ -543,7 +444,8 @@ impl InsightChatService {
             messages.push(ChatMessage::user(
                 "Please write your final answer now without calling any more tools.",
             ));
-            let (final_response, prompt_tokens, eval_tokens) = chat_backend
+            let (final_response, prompt_tokens, eval_tokens) = backend
+                .chat()
                 .chat_with_tools(messages.clone(), vec![])
                 .await?;
             last_prompt_eval_count = prompt_tokens;
@@ -579,7 +481,8 @@ impl InsightChatService {
                  Capture the key moment or theme. Return ONLY the title, nothing else.",
                 final_content
             );
-            let title_raw = chat_backend
+            let title_raw = backend
+                .chat()
                 .generate(
                     &title_prompt,
                     Some(
@@ -604,7 +507,7 @@ impl InsightChatService {
                 model_version: model_used.clone(),
                 is_current: true,
                 training_messages: Some(json),
-                backend: effective_backend.clone(),
+                backend: kind.as_str().to_string(),
                 fewshot_source_ids: None,
                 content_hash: None,
             };
@@ -629,7 +532,7 @@ impl InsightChatService {
             prompt_eval_count: last_prompt_eval_count,
             eval_count: last_eval_count,
             amended_insight_id,
-            backend_used: effective_backend,
+            backend_used: kind.as_str().to_string(),
             model_used,
         })
     }
@@ -818,9 +721,8 @@ impl InsightChatService {
             .map(|s| s.trim().to_lowercase())
             .filter(|s| !s.is_empty())
             .unwrap_or_else(|| stored_backend.clone());
-        validate_cross_replay(&stored_backend, &effective_backend)?;
-        let is_hybrid = effective_backend == "hybrid";
-        let describes_then_inlines = is_hybrid;
+        let kind = BackendKind::parse(&effective_backend)?;
+        validate_cross_replay(&stored_backend, kind.as_str())?;
 
         let max_iterations = req
             .max_iterations
@@ -828,18 +730,20 @@ impl InsightChatService {
             .clamp(1, env_max_iterations());
 
         let stored_model = insight.model_version.clone();
-        let custom_model = req
-            .model
-            .clone()
-            .or_else(|| Some(stored_model.clone()))
-            .filter(|m| !m.is_empty());
+        let overrides = SamplingOverrides {
+            model: req.model.clone()
+                .or_else(|| Some(stored_model.clone()))
+                .filter(|m| !m.is_empty()),
+            num_ctx: req.num_ctx,
+            temperature: req.temperature,
+            top_p: req.top_p,
+            top_k: req.top_k,
+            min_p: req.min_p,
+        };
+        let backend = self.generator.resolve_backend(kind, &overrides).await?;
+        let model_used = backend.model().to_string();
 
-        let (chat_backend_holder, ollama_client) =
-            self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?;
-        let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
-        let model_used = chat_backend.primary_model().to_string();
-
-        // Tool set — local/llamacpp mode + first user turn carries an image →
+        // Tool set — images_inline mode + first user turn carries an image →
         // offer describe_photo. Describe-then-inline mode (hybrid only):
         // visual description was inlined at bootstrap, no describe tool needed.
         let local_first_user_has_image = messages
@@ -848,7 +752,7 @@ impl InsightChatService {
             .and_then(|m| m.images.as_ref())
             .map(|imgs| !imgs.is_empty())
             .unwrap_or(false);
-        let offer_describe_tool = !describes_then_inlines && local_first_user_has_image;
+        let offer_describe_tool = backend.images_inline && local_first_user_has_image;
         let gate_opts = self.generator.current_gate_opts_for_persona(
             offer_describe_tool,
             Some((req.user_id, &active_persona)),
@@ -879,16 +783,13 @@ impl InsightChatService {
 
         let outcome = self
             .run_streaming_agentic_loop(
-                chat_backend,
-                &ollama_client,
+                &backend,
                 &mut messages,
                 tools,
                 &image_base64,
                 &normalized,
                 req.user_id,
                 &active_persona,
-                &model_used,
-                &effective_backend,
                 max_iterations,
                 &tx,
             )
@@ -916,7 +817,7 @@ impl InsightChatService {
 
         let mut amended_insight_id: Option<i32> = None;
         if req.amend {
-            let title = self.generate_title(chat_backend, &final_content).await?;
+            let title = self.generate_title(&backend, &final_content).await?;
 
             // Amended rows intentionally do not inherit the parent's
             // `fewshot_source_ids`. The parent's few-shot influence is still
@@ -932,7 +833,7 @@ impl InsightChatService {
                 model_version: model_used.clone(),
                 is_current: true,
                 training_messages: Some(json),
-                backend: effective_backend.clone(),
+                backend: kind.as_str().to_string(),
                 fewshot_source_ids: None,
                 content_hash: None,
             };
@@ -958,7 +859,7 @@ impl InsightChatService {
                 eval_tokens: last_eval_count,
                 num_ctx: req.num_ctx,
                 amended_insight_id,
-                backend_used: effective_backend,
+                backend_used: kind.as_str().to_string(),
                 model_used,
             })
             .await;
@@ -984,21 +885,23 @@ impl InsightChatService {
             .filter(|s| !s.trim().is_empty())
             .unwrap_or_else(|| "default".to_string());
         let effective_backend = resolve_bootstrap_backend(req.backend.as_deref())?;
-        let is_hybrid = effective_backend == "hybrid";
-        let local_via_llamacpp =
-            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
-        let describes_then_inlines = is_hybrid;
+        let kind = BackendKind::parse(&effective_backend)?;
 
         let max_iterations = req
             .max_iterations
             .unwrap_or(DEFAULT_MAX_ITERATIONS)
             .clamp(1, env_max_iterations());
 
-        let custom_model = req.model.clone().filter(|m| !m.is_empty());
-        let (chat_backend_holder, ollama_client) =
-            self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?;
-        let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
-        let model_used = chat_backend.primary_model().to_string();
+        let overrides = SamplingOverrides {
+            model: req.model.clone().filter(|m| !m.is_empty()),
+            num_ctx: req.num_ctx,
+            temperature: req.temperature,
+            top_p: req.top_p,
+            top_k: req.top_k,
+            min_p: req.min_p,
+        };
+        let backend = self.generator.resolve_backend(kind, &overrides).await?;
+        let model_used = backend.model().to_string();
 
         // Load image bytes once. RAW preview fallback is handled inside
         // load_image_as_base64. Errors degrade silently — a chat that
@@ -1020,26 +923,17 @@ impl InsightChatService {
             });
 
         // Describe-then-inline (hybrid only): pre-describe the image so a
-        // text-only chat model gets the visual description inline. llamacpp
-        // sends images directly to the chat model.
-        let visual_block = if describes_then_inlines {
+        // text-only chat model gets the visual description inline.
+        // images_inline backends send images directly to the chat model.
+        let visual_block = if !backend.images_inline {
             match image_base64.as_deref() {
                 Some(b64) => {
-                    let described = if local_via_llamacpp {
-                        self.llamacpp
-                            .as_ref()
-                            .expect("local_via_llamacpp guarantees Some")
-                            .describe_image(b64)
-                            .await
-                    } else {
-                        self.ollama.describe_image(b64).await
-                    };
-                    match described {
+                    match backend.local().describe_image(b64).await {
                         Ok(desc) => {
                             format!("Visual description (from local vision model):\n{}\n", desc)
                         }
                         Err(e) => {
-                            log::warn!("{} bootstrap: describe_image failed: {}", effective_backend, e);
+                            log::warn!("{} bootstrap: describe_image failed: {}", kind.as_str(), e);
                             String::new()
                         }
                     }
@@ -1050,10 +944,10 @@ impl InsightChatService {
             String::new()
         };
 
-        // Tool gates. Local + image present → expose describe_photo so
-        // the chat model can re-look at the photo on demand. Hybrid:
+        // Tool gates. images_inline + image present → expose describe_photo so
+        // the chat model can re-look at the photo on demand. Otherwise the description is
         // already inlined, no tool needed.
-        let offer_describe_tool = !describes_then_inlines && image_base64.is_some();
+        let offer_describe_tool = backend.images_inline && image_base64.is_some();
         let gate_opts = self.generator.current_gate_opts_for_persona(
             offer_describe_tool,
             Some((req.user_id, &active_persona)),
@@ -1079,23 +973,22 @@ impl InsightChatService {
         );
         let system_msg = ChatMessage::system(system_content);
         let mut user_msg = ChatMessage::user(req.user_message.clone());
-        if !describes_then_inlines && let Some(ref img) = image_base64 {
-            user_msg.images = Some(vec![img.clone()]);
+        if backend.images_inline {
+            if let Some(ref img) = image_base64 {
+                user_msg.images = Some(vec![img.clone()]);
+            }
         }
         let mut messages = vec![system_msg, user_msg];
 
         let outcome = self
             .run_streaming_agentic_loop(
-                chat_backend,
-                &ollama_client,
+                &backend,
                 &mut messages,
                 tools,
                 &image_base64,
                 &normalized,
                 req.user_id,
                 &active_persona,
-                &model_used,
-                &effective_backend,
                 max_iterations,
                 &tx,
             )
@@ -1108,7 +1001,7 @@ impl InsightChatService {
             final_content,
         } = outcome;
 
-        let title = self.generate_title(chat_backend, &final_content).await?;
+        let title = self.generate_title(&backend, &final_content).await?;
 
         let json = serde_json::to_string(&messages)
             .map_err(|e| anyhow!("failed to serialize chat history: {}", e))?;
@@ -1121,7 +1014,7 @@ impl InsightChatService {
             model_version: model_used.clone(),
             is_current: true,
             training_messages: Some(json),
-            backend: effective_backend.clone(),
+            backend: kind.as_str().to_string(),
             fewshot_source_ids: None,
             content_hash: None,
         };
@@ -1144,7 +1037,7 @@ impl InsightChatService {
                 eval_tokens: last_eval_count,
                 num_ctx: req.num_ctx,
                 amended_insight_id: Some(stored.id),
-                backend_used: effective_backend,
+                backend_used: kind.as_str().to_string(),
                 model_used,
             })
             .await;
@@ -1152,95 +1045,12 @@ impl InsightChatService {
         Ok(())
     }
 
-    /// Set up chat clients (Ollama + optional OpenRouter / LlamaCpp) shared
-    /// by bootstrap and continuation. Returns the chat-side backend client
-    /// (boxed because each backend has a different concrete type) and the
-    /// Ollama client used for describe-image / local tool calls.
-    ///
-    /// `effective_backend` must be one of `"local"` or `"hybrid"` (validated
-    /// upstream). Hybrid → OpenRouter; local with `LLM_BACKEND=llamacpp` →
-    /// llama-swap; pure local → Ollama. Returns the dispatched chat client
-    /// plus the (possibly per-request) Ollama client that the caller uses
-    /// for non-chat helpers (image describe in non-llamacpp mode, tool ops).
-    fn build_chat_clients(
-        &self,
-        effective_backend: &str,
-        custom_model: Option<&str>,
-        req: &ChatTurnRequest,
-    ) -> Result<(Box<dyn LlmClient>, OllamaClient)> {
-        let mut ollama_client = self.ollama.clone();
-
-        if effective_backend == "hybrid" {
-            let arc = self.openrouter.as_ref().ok_or_else(|| {
-                anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
-            })?;
-            let mut c: OpenRouterClient = (**arc).clone();
-            if let Some(m) = custom_model {
-                c.primary_model = m.to_string();
-            }
-            if req.temperature.is_some()
-                || req.top_p.is_some()
-                || req.top_k.is_some()
-                || req.min_p.is_some()
-            {
-                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
-            }
-            if let Some(ctx) = req.num_ctx {
-                c.set_num_ctx(Some(ctx));
-            }
-            return Ok((Box::new(c), ollama_client));
-        }
-
-        // Local mode — env switch decides between Ollama and llama-swap.
-        if crate::ai::local_backend_is_llamacpp()
-            && let Some(arc) = self.llamacpp.as_ref()
-        {
-            let mut c: LlamaCppClient = (**arc).clone();
-            if let Some(m) = custom_model {
-                c.primary_model = m.to_string();
-            }
-            if req.temperature.is_some()
-                || req.top_p.is_some()
-                || req.top_k.is_some()
-                || req.min_p.is_some()
-            {
-                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
-            }
-            if let Some(ctx) = req.num_ctx {
-                c.set_num_ctx(Some(ctx));
-            }
-            return Ok((Box::new(c), ollama_client));
-        }
-
-        if let Some(m) = custom_model
-            && m != self.ollama.primary_model
-        {
-            ollama_client = OllamaClient::new(
-                self.ollama.primary_url.clone(),
-                self.ollama.fallback_url.clone(),
-                m.to_string(),
-                Some(m.to_string()),
-            );
-        }
-        if req.temperature.is_some()
-            || req.top_p.is_some()
-            || req.top_k.is_some()
-            || req.min_p.is_some()
-        {
-            ollama_client.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
-        }
-        if let Some(ctx) = req.num_ctx {
-            ollama_client.set_num_ctx(Some(ctx));
-        }
-        Ok((Box::new(ollama_client.clone()), ollama_client))
-    }
-
     /// Generate a short title via the same chat backend so voice stays
     /// consistent with the body. Mirrors generate_agentic_insight_for_photo's
     /// titling pass.
     async fn generate_title(
         &self,
-        chat_backend: &dyn LlmClient,
+        backend: &ResolvedBackend,
         final_content: &str,
     ) -> Result<String> {
         let title_prompt = format!(
@@ -1248,7 +1058,8 @@ impl InsightChatService {
              Capture the key moment or theme. Return ONLY the title, nothing else.",
             final_content
         );
-        let title_raw = chat_backend
+        let title_raw = backend
+            .chat()
             .generate(
                 &title_prompt,
                 Some(
@@ -1266,18 +1077,13 @@ impl InsightChatService {
     /// final assistant content.
     async fn run_streaming_agentic_loop(
         &self,
-        chat_backend: &dyn LlmClient,
-        ollama_client: &OllamaClient,
+        backend: &ResolvedBackend,
         messages: &mut Vec<ChatMessage>,
         tools: Vec<Tool>,
         image_base64: &Option<String>,
         normalized: &str,
         user_id: i32,
         active_persona: &str,
-        // Provenance — stamped onto any store_fact tool call made
-        // during this loop. Mirrors the non-streaming chat path.
-        model_used: &str,
-        effective_backend: &str,
         max_iterations: usize,
         tx: &tokio::sync::mpsc::Sender<ChatStreamEvent>,
     ) -> Result<AgenticLoopOutcome> {
@@ -1296,7 +1102,8 @@ impl InsightChatService {
                 })
                 .await;
 
-            let mut stream = chat_backend
+            let mut stream = backend
+                .chat()
                 .chat_with_tools_stream(messages.clone(), tools.clone())
                 .await?;
 
@@ -1353,13 +1160,11 @@ impl InsightChatService {
                         .execute_tool(
                             &tool_call.function.name,
                             &tool_call.function.arguments,
-                            ollama_client,
+                            backend,
                             image_base64,
                             normalized,
                             user_id,
                             active_persona,
-                            model_used,
-                            effective_backend,
                             &cx,
                         )
                         .await;
@@ -1394,7 +1199,8 @@ impl InsightChatService {
             messages.push(ChatMessage::user(
                 "Please write your final answer now without calling any more tools.",
             ));
-            let mut stream = chat_backend
+            let mut stream = backend
+                .chat()
                 .chat_with_tools_stream(messages.clone(), vec![])
                 .await?;
             let mut final_message: Option<ChatMessage> = None;
diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs
index 075001a..f95c6dc 100644
--- a/src/ai/insight_generator.rs
+++ b/src/ai/insight_generator.rs
@@ -1594,29 +1594,24 @@ Return ONLY the summary, nothing else."#,
         &self,
         tool_name: &str,
         arguments: &serde_json::Value,
-        ollama: &OllamaClient,
+        backend: &ResolvedBackend,
         image_base64: &Option<String>,
         file_path: &str,
         user_id: i32,
         persona_id: &str,
-        // Provenance — written into entity_facts.created_by_* when
-        // the loop calls store_fact. The caller knows the actual
-        // chat-runtime model and backend (which may differ from
-        // ollama.primary_model in hybrid mode where chat lives on
-        // OpenRouter while Ollama still handles vision).
-        model: &str,
-        backend: &str,
         cx: &opentelemetry::Context,
     ) -> String {
+        let model = backend.model();
+        let backend_label = backend.kind.as_str();
         let result = match tool_name {
-            "search_rag" => self.tool_search_rag(arguments, ollama, cx).await,
+            "search_rag" => self.tool_search_rag(arguments, backend.local(), cx).await,
             "search_messages" => self.tool_search_messages(arguments, cx).await,
             "get_sms_messages" => self.tool_get_sms_messages(arguments, cx).await,
             "get_calendar_events" => self.tool_get_calendar_events(arguments, cx).await,
             "get_location_history" => self.tool_get_location_history(arguments, cx).await,
             "get_file_tags" => self.tool_get_file_tags(arguments, cx).await,
             "get_faces_in_photo" => self.tool_get_faces_in_photo(arguments, cx).await,
-            "describe_photo" => self.tool_describe_photo(ollama, image_base64).await,
+            "describe_photo" => self.tool_describe_photo(backend.local(), image_base64).await,
             "reverse_geocode" => self.tool_reverse_geocode(arguments).await,
             "get_personal_place_at" => self.tool_get_personal_place_at(arguments).await,
             "recall_entities" => self.tool_recall_entities(arguments, cx).await,
@@ -1624,19 +1619,19 @@ Return ONLY the summary, nothing else."#,
                 self.tool_recall_facts_for_photo(arguments, user_id, persona_id, cx)
                     .await
             }
-            "store_entity" => self.tool_store_entity(arguments, ollama, cx).await,
+            "store_entity" => self.tool_store_entity(arguments, cx).await,
             "store_fact" => {
                 self.tool_store_fact(
-                    arguments, file_path, user_id, persona_id, model, backend, cx,
+                    arguments, file_path, user_id, persona_id, model, backend_label, cx,
                 )
                 .await
             }
             "update_fact" => {
-                self.tool_update_fact(arguments, user_id, persona_id, model, backend, cx)
+                self.tool_update_fact(arguments, user_id, persona_id, model, backend_label, cx)
                     .await
             }
             "supersede_fact" => {
-                self.tool_supersede_fact(arguments, user_id, persona_id, model, backend, cx)
+                self.tool_supersede_fact(arguments, user_id, persona_id, model, backend_label, cx)
                     .await
             }
             "get_current_datetime" => Self::tool_get_current_datetime(),
@@ -1654,7 +1649,7 @@ Return ONLY the summary, nothing else."#,
     async fn tool_search_rag(
         &self,
         args: &serde_json::Value,
-        ollama: &OllamaClient,
+        local: &dyn LlmClient,
         _cx: &opentelemetry::Context,
     ) -> String {
         let query = match args.get("query").and_then(|v| v.as_str()) {
@@ -1718,7 +1713,7 @@ Return ONLY the summary, nothing else."#,
         };
 
         let final_results = if rerank_enabled && results.len() > limit {
-            match self.rerank_with_llm(&query, &results, limit, ollama).await {
+            match self.rerank_with_llm(&query, &results, limit, local).await {
                 Ok(reordered) => reordered,
                 Err(e) => {
                     log::warn!("rerank failed, using vector order: {}", e);
@@ -1744,7 +1739,7 @@ Return ONLY the summary, nothing else."#,
         query: &str,
         candidates: &[String],
         limit: usize,
-        ollama: &OllamaClient,
+        local: &dyn LlmClient,
     ) -> Result<Vec<String>> {
         let query_preview: String = query.chars().take(60).collect();
         log::info!(
@@ -1785,15 +1780,7 @@ Return ONLY the summary, nothing else."#,
         let system = Some(
             "You are a terse relevance ranker. You output only numbers separated by commas.",
         );
-        let response = if crate::ai::local_backend_is_llamacpp() {
-            if let Some(ref lc) = self.llamacpp {
-                lc.generate(&prompt, system, None).await?
-            } else {
-                ollama.generate_no_think(&prompt, system).await?
-            }
-        } else {
-            ollama.generate_no_think(&prompt, system).await?
-        };
+        let response = local.generate(&prompt, system, None).await?;
         log::info!(
             "rerank: finished in {} ms (prompt={} chars)",
             started.elapsed().as_millis(),
@@ -2365,31 +2352,17 @@ Return ONLY the summary, nothing else."#,
         out
     }
 
-    /// Tool: describe_photo — generate a visual description of the photo.
-    /// Routes through llama-swap when `LLM_BACKEND=llamacpp`, Ollama otherwise.
     async fn tool_describe_photo(
         &self,
-        ollama: &OllamaClient,
+        local: &dyn LlmClient,
         image_base64: &Option<String>,
     ) -> String {
         log::info!("tool_describe_photo: generating visual description");
-
         match image_base64 {
-            Some(img) => {
-                let result = if crate::ai::local_backend_is_llamacpp() {
-                    if let Some(ref lc) = self.llamacpp {
-                        lc.describe_image(img).await
-                    } else {
-                        ollama.generate_photo_description(img).await
-                    }
-                } else {
-                    ollama.generate_photo_description(img).await
-                };
-                match result {
-                    Ok(desc) => desc,
-                    Err(e) => format!("Error describing photo: {}", e),
-                }
-            }
+            Some(img) => match local.describe_image(img).await {
+                Ok(desc) => desc,
+                Err(e) => format!("Error describing photo: {}", e),
+            },
             None => "No image available for description.".to_string(),
         }
     }
@@ -2635,7 +2608,6 @@ Return ONLY the summary, nothing else."#,
     async fn tool_store_entity(
         &self,
         args: &serde_json::Value,
-        _ollama: &OllamaClient,
         cx: &opentelemetry::Context,
     ) -> String {
         use crate::database::models::InsertEntity;
@@ -3775,243 +3747,25 @@ Return ONLY the summary, nothing else."#,
         span.set_attribute(KeyValue::new("file_path", file_path.clone()));
         span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64));
 
-        // 1a. Resolve backend label (defaults to "local").
-        let backend_label = backend
-            .as_deref()
-            .map(|s| s.trim().to_lowercase())
-            .filter(|s| !s.is_empty())
-            .unwrap_or_else(|| "local".to_string());
-        if !matches!(backend_label.as_str(), "local" | "hybrid") {
-            return Err(anyhow::anyhow!(
-                "unknown backend '{}'; expected 'local' or 'hybrid'",
-                backend_label
-            ));
-        }
-        span.set_attribute(KeyValue::new("backend", backend_label.clone()));
-        let is_hybrid = backend_label == "hybrid";
-        // `LLM_BACKEND=llamacpp` swaps Ollama out for llama-swap as the
-        // "local" stack — chat + embeddings route through llama-swap.
-        // llamacpp models receive images directly (vision-capable); only
-        // hybrid mode (OpenRouter chat) uses describe-then-inline.
-        let local_via_llamacpp =
-            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
-        let describes_then_inlines = is_hybrid;
-        let ollama_is_chat = !is_hybrid && !local_via_llamacpp;
-
-        // 1b. Always build an Ollama client. In local mode it owns the chat
-        //     loop; in hybrid/llamacpp mode it still handles tool-local calls
-        //     (e.g. future embedding-backed tools). The chat backend is
-        //     selected separately below.
-        //     Sampling overrides only apply when Ollama is the chat backend.
-        let apply_sampling_to_ollama = ollama_is_chat;
-        let mut ollama_client = if let Some(ref model) = custom_model
-            && ollama_is_chat
-        {
-            log::info!("Using custom model for agentic: {}", model);
-            span.set_attribute(KeyValue::new("custom_model", model.clone()));
-            OllamaClient::new(
-                self.ollama.primary_url.clone(),
-                self.ollama.fallback_url.clone(),
-                model.clone(),
-                Some(model.clone()),
-            )
-        } else {
-            if ollama_is_chat {
-                span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone()));
-            }
-            self.ollama.clone()
-        };
-
-        if apply_sampling_to_ollama {
-            if let Some(ctx) = num_ctx {
-                log::info!("Using custom context size: {}", ctx);
-                span.set_attribute(KeyValue::new("num_ctx", ctx as i64));
-                ollama_client.set_num_ctx(Some(ctx));
-            }
-
-            if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() {
-                log::info!(
-                    "Using sampling params — temperature: {:?}, top_p: {:?}, top_k: {:?}, min_p: {:?}",
-                    temperature,
-                    top_p,
-                    top_k,
-                    min_p
-                );
-                if let Some(t) = temperature {
-                    span.set_attribute(KeyValue::new("temperature", t as f64));
-                }
-                if let Some(p) = top_p {
-                    span.set_attribute(KeyValue::new("top_p", p as f64));
-                }
-                if let Some(k) = top_k {
-                    span.set_attribute(KeyValue::new("top_k", k as i64));
-                }
-                if let Some(m) = min_p {
-                    span.set_attribute(KeyValue::new("min_p", m as f64));
-                }
-                ollama_client.set_sampling_params(temperature, top_p, top_k, min_p);
-            }
-        }
-
-        // 1c. In hybrid mode, clone the configured OpenRouter client and
-        //     apply per-request overrides.
-        let openrouter_client: Option<OpenRouterClient> = if is_hybrid {
-            let arc = self.openrouter.as_ref().ok_or_else(|| {
-                anyhow::anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
-            })?;
-            let mut c: OpenRouterClient = (**arc).clone();
-            if let Some(ref m) = custom_model {
-                c.primary_model = m.clone();
-                span.set_attribute(KeyValue::new("custom_model", m.clone()));
-            }
-            span.set_attribute(KeyValue::new("openrouter_model", c.primary_model.clone()));
-            if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() {
-                if let Some(t) = temperature {
-                    span.set_attribute(KeyValue::new("temperature", t as f64));
-                }
-                if let Some(p) = top_p {
-                    span.set_attribute(KeyValue::new("top_p", p as f64));
-                }
-                if let Some(k) = top_k {
-                    span.set_attribute(KeyValue::new("top_k", k as i64));
-                }
-                if let Some(m) = min_p {
-                    span.set_attribute(KeyValue::new("min_p", m as f64));
-                }
-                c.set_sampling_params(temperature, top_p, top_k, min_p);
-            }
-            if let Some(ctx) = num_ctx {
-                span.set_attribute(KeyValue::new("num_ctx", ctx as i64));
-                c.set_num_ctx(Some(ctx));
-            }
-            Some(c)
-        } else {
-            None
-        };
-
-        // 1d. When `LLM_BACKEND=llamacpp` and we're in local mode (not
-        //     hybrid — hybrid keeps chat on OpenRouter), clone the llamacpp
-        //     client and apply per-request overrides. Same shape as the
-        //     openrouter branch above; describe_image will route through
-        //     the vision slot configured on the client.
-        let llamacpp_client: Option<LlamaCppClient> = if local_via_llamacpp && !is_hybrid {
-            let arc = self.llamacpp.as_ref().ok_or_else(|| {
-                anyhow::anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured")
-            })?;
-            let mut c: LlamaCppClient = (**arc).clone();
-            if let Some(ref m) = custom_model {
-                c.primary_model = m.clone();
-                span.set_attribute(KeyValue::new("custom_model", m.clone()));
-            }
-            span.set_attribute(KeyValue::new("llamacpp_model", c.primary_model.clone()));
-            if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() {
-                if let Some(t) = temperature {
-                    span.set_attribute(KeyValue::new("temperature", t as f64));
-                }
-                if let Some(p) = top_p {
-                    span.set_attribute(KeyValue::new("top_p", p as f64));
-                }
-                if let Some(k) = top_k {
-                    span.set_attribute(KeyValue::new("top_k", k as i64));
-                }
-                if let Some(m) = min_p {
-                    span.set_attribute(KeyValue::new("min_p", m as f64));
-                }
-                c.set_sampling_params(temperature, top_p, top_k, min_p);
-            }
-            if let Some(ctx) = num_ctx {
-                span.set_attribute(KeyValue::new("num_ctx", ctx as i64));
-                c.set_num_ctx(Some(ctx));
-            }
-            Some(c)
-        } else {
-            None
+        // 1. Resolve backend + build clients.
+        let kind = BackendKind::parse(
+            backend.as_deref().unwrap_or("local"),
+        )?;
+        span.set_attribute(KeyValue::new("backend", kind.as_str()));
+        let overrides = SamplingOverrides {
+            model: custom_model,
+            num_ctx,
+            temperature,
+            top_p,
+            top_k,
+            min_p,
         };
+        let backend = self.resolve_backend(kind, &overrides).await?;
+        span.set_attribute(KeyValue::new("model", backend.model().to_string()));
+        span.set_attribute(KeyValue::new("images_inline", backend.images_inline));
 
         let insight_cx = current_cx.with_span(span);
 
-        // 2. Verify chat model supports tool calling.
-        //    - local: existing Ollama model availability + capability check.
-        //    - hybrid: trust the operator's curated allowlist
-        //      (OPENROUTER_ALLOWED_MODELS) — no live precheck. A bad model id
-        //      surfaces as a chat-call error on the next step.
-        let has_vision = if describes_then_inlines {
-            // Hybrid: chat model never sees images — describe-then-inject.
-            true
-        } else if local_via_llamacpp {
-            // llama-swap models receive images directly via OpenAI content
-            // parts. Capability probing isn't available (no `/api/show`),
-            // so assume vision support; a misconfigured model surfaces as
-            // a chat-call error.
-            true
-        } else {
-            if let Some(ref model_name) = custom_model {
-                let available_on_primary =
-                    OllamaClient::is_model_available(&ollama_client.primary_url, model_name)
-                        .await
-                        .unwrap_or(false);
-
-                let available_on_fallback =
-                    if let Some(ref fallback_url) = ollama_client.fallback_url {
-                        OllamaClient::is_model_available(fallback_url, model_name)
-                            .await
-                            .unwrap_or(false)
-                    } else {
-                        false
-                    };
-
-                if !available_on_primary && !available_on_fallback {
-                    anyhow::bail!(
-                        "model not available: '{}' not found on any configured server",
-                        model_name
-                    );
-                }
-            }
-
-            let model_name_for_caps = &ollama_client.primary_model;
-            let capabilities = match OllamaClient::check_model_capabilities(
-                &ollama_client.primary_url,
-                model_name_for_caps,
-            )
-            .await
-            {
-                Ok(caps) => caps,
-                Err(_) => {
-                    let fallback_url = ollama_client.fallback_url.as_deref().ok_or_else(|| {
-                        anyhow::anyhow!(
-                            "Failed to check model capabilities for '{}': model not found on primary server and no fallback configured",
-                            model_name_for_caps
-                        )
-                    })?;
-                    OllamaClient::check_model_capabilities(fallback_url, model_name_for_caps)
-                        .await
-                        .map_err(|e| {
-                            anyhow::anyhow!(
-                                "Failed to check model capabilities for '{}': {}",
-                                model_name_for_caps,
-                                e
-                            )
-                        })?
-                }
-            };
-
-            if !capabilities.has_tool_calling {
-                return Err(anyhow::anyhow!(
-                    "tool calling not supported by model '{}'",
-                    ollama_client.primary_model
-                ));
-            }
-
-            insight_cx
-                .span()
-                .set_attribute(KeyValue::new("model_has_vision", capabilities.has_vision));
-            insight_cx
-                .span()
-                .set_attribute(KeyValue::new("model_has_tool_calling", true));
-
-            capabilities.has_vision
-        };
-
         // 3. Fetch EXIF
         let exif = {
             let mut exif_dao = self.exif_dao.lock().expect("Unable to lock ExifDao");
@@ -4103,60 +3857,33 @@ Return ONLY the summary, nothing else."#,
             }
         };
 
-        // 7. Load image if vision capable.
-        //    In hybrid mode we ALSO describe it locally now so the
-        //    description can be inlined as text — the OpenRouter chat model
-        //    never receives the base64 image directly.
-        let image_base64 = if has_vision {
-            match self.load_image_as_base64(&file_path) {
-                Ok(b64) => {
-                    log::info!("Loaded image for vision-capable agentic model");
-                    Some(b64)
-                }
-                Err(e) => {
-                    log::warn!("Failed to load image for agentic vision: {}", e);
-                    None
-                }
+        // 7. Load image. Always attempted — when `backend.images_inline` the
+        //    base64 goes on the user message; otherwise it is described locally as text.
+        let image_base64 = match self.load_image_as_base64(&file_path) {
+            Ok(b64) => {
+                log::info!("Loaded image for agentic model");
+                Some(b64)
+            }
+            Err(e) => {
+                log::warn!("Failed to load image for agentic: {}", e);
+                None
             }
-        } else {
-            None
         };
 
-        // describe-then-inline path (hybrid only). Vision describe routes
-        // through whichever local backend is configured — llama-swap when
-        // `local_via_llamacpp`, otherwise Ollama.
-        let inlined_visual_description: Option<String> = if describes_then_inlines {
+        // Describe-then-inline (hybrid only). Vision describe routes through
+        // the local backend so non-text work stays off OpenRouter.
+        let inlined_visual_description: Option<String> = if !backend.images_inline {
             match image_base64.as_deref() {
-                Some(b64) => {
-                    let described = if local_via_llamacpp {
-                        self.llamacpp
-                            .as_ref()
-                            .expect("local_via_llamacpp guarantees Some")
-                            .describe_image(b64)
-                            .await
-                    } else {
-                        self.ollama.describe_image(b64).await
-                    };
-
-                    match described {
-                        Ok(desc) => {
-                            log::info!(
-                                "{}: vision describe succeeded ({} chars)",
-                                backend_label,
-                                desc.len()
-                            );
-                            Some(desc)
-                        }
-                        Err(e) => {
-                            log::warn!(
-                                "{}: vision describe failed, continuing without: {}",
-                                backend_label,
-                                e
-                            );
-                            None
-                        }
+                Some(b64) => match backend.local().describe_image(b64).await {
+                    Ok(desc) => {
+                        log::info!("{}: vision describe succeeded ({} chars)", kind, desc.len());
+                        Some(desc)
                     }
-                }
+                    Err(e) => {
+                        log::warn!("{}: vision describe failed, continuing without: {}", kind, e);
+                        None
+                    }
+                },
                 None => None,
             }
         } else {
@@ -4228,34 +3955,24 @@ Return ONLY the summary, nothing else."#,
             date = date_taken.format("%B %d, %Y"),
         );
 
-        // 10. Define tools. Gate flags computed from current data presence;
-        //     hybrid mode omits describe_photo since the chat model receives
-        //     the visual description inline (so we pass `false` for
-        //     has_vision in that mode regardless of the model's actual
-        //     capability).
-        let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines);
+        // 10. Define tools. describe_photo offered only when the chat model
+        //     sees images directly (images_inline); in hybrid mode the visual
+        //     description is already inlined as text.
+        let gate_opts = self.current_gate_opts(backend.images_inline);
         let tools = Self::build_tool_definitions(gate_opts);
 
-        // 11. Build initial messages. In describe-then-inline modes images
-        //     are never attached to the wire message — the description is part
-        //     of `user_content`.
+        // 11. Build initial messages. images_inline → attach base64 to the
+        //     user message; describe-then-inline → text was already injected.
         let system_msg = ChatMessage::system(system_content);
         let mut user_msg = ChatMessage::user(user_content);
-        if !describes_then_inlines && let Some(ref img) = image_base64 {
-            user_msg.images = Some(vec![img.clone()]);
+        if backend.images_inline {
+            if let Some(ref img) = image_base64 {
+                user_msg.images = Some(vec![img.clone()]);
+            }
         }
 
         let mut messages = vec![system_msg, user_msg];
 
-        // 12. Agentic loop — dispatch through the selected backend.
-        let chat_backend: &dyn LlmClient = if let Some(ref lc_c) = llamacpp_client {
-            lc_c
-        } else if let Some(ref or_c) = openrouter_client {
-            or_c
-        } else {
-            &ollama_client
-        };
-
         let loop_span = tracer.start_with_context("ai.agentic.loop", &insight_cx);
         let loop_cx = insight_cx.with_span(loop_span);
 
@@ -4268,7 +3985,8 @@ Return ONLY the summary, nothing else."#,
             iterations_used = iteration + 1;
             log::info!("Agentic iteration {}/{}", iteration + 1, max_iterations);
 
-            let (response, prompt_tokens, eval_tokens) = chat_backend
+            let (response, prompt_tokens, eval_tokens) = backend
+                .chat()
                 .chat_with_tools(messages.clone(), tools.clone())
                 .await?;
 
@@ -4308,13 +4026,11 @@ Return ONLY the summary, nothing else."#,
                         .execute_tool(
                             &tool_call.function.name,
                             &tool_call.function.arguments,
-                            &ollama_client,
+                            &backend,
                             &image_base64,
                             &file_path,
                             user_id,
                             &persona_id,
-                            chat_backend.primary_model(),
-                            &backend_label,
                             &loop_cx,
                         )
                         .await;
@@ -4338,7 +4054,8 @@ Return ONLY the summary, nothing else."#,
                 "Based on the context gathered, please write the final photo insight: a title and a detailed personal summary. Write in first person as {}.",
                 user_display_name()
             )));
-            let (final_response, prompt_tokens, eval_tokens) = chat_backend
+            let (final_response, prompt_tokens, eval_tokens) = backend
+                .chat()
                 .chat_with_tools(messages.clone(), vec![])
                 .await?;
             last_prompt_eval_count = prompt_tokens;
@@ -4360,7 +4077,8 @@ Return ONLY the summary, nothing else."#,
         let title_system = custom_system_prompt.as_deref().unwrap_or(
             "You are my long term memory assistant. Use only the information provided. Do not invent details.",
         );
-        let title_raw = chat_backend
+        let title_raw = backend
+            .chat()
             .generate(&title_prompt, Some(title_system), None)
             .await?;
         let title = title_raw.trim().trim_matches('"').to_string();
@@ -4383,7 +4101,7 @@ Return ONLY the summary, nothing else."#,
         };
 
         // 15. Store insight (returns the persisted row including its new id)
-        let model_version = chat_backend.primary_model().to_string();
+        let model_version = backend.model().to_string();
         let fewshot_source_ids_json = if fewshot_source_ids.is_empty() {
             None
         } else {
@@ -4398,7 +4116,7 @@ Return ONLY the summary, nothing else."#,
             model_version,
             is_current: true,
             training_messages,
-            backend: backend_label.clone(),
+            backend: kind.as_str().to_string(),
             fewshot_source_ids: fewshot_source_ids_json,
             content_hash: None,
         };
diff --git a/src/state.rs b/src/state.rs
index c4f810a..8cfccbb 100644
--- a/src/state.rs
+++ b/src/state.rs
@@ -290,9 +290,6 @@ impl Default for AppState {
             Arc::new(tokio::sync::Mutex::new(std::collections::HashMap::new()));
         let insight_chat = Arc::new(InsightChatService::new(
             Arc::new(insight_generator.clone()),
-            ollama.clone(),
-            openrouter.clone(),
-            llamacpp.clone(),
             insight_dao.clone(),
             chat_locks,
         ));
@@ -470,9 +467,6 @@ impl AppState {
             Arc::new(tokio::sync::Mutex::new(std::collections::HashMap::new()));
         let insight_chat = Arc::new(InsightChatService::new(
             Arc::new(insight_generator.clone()),
-            ollama.clone(),
-            None,
-            None,
             insight_dao.clone(),
             chat_locks,
         ));