ai: extract ResolvedBackend, remove ~480 lines of duplicated dispatch

Replace 5 copies of the ~80-line backend resolution pattern with a single InsightGenerator::resolve_backend() builder that returns a ResolvedBackend (chat + local clients, BackendKind enum, images_inline flag).

Tool dispatch now takes &ResolvedBackend instead of &OllamaClient + model + backend strings.

Remove duplicated ollama/openrouter/llamacpp fields from InsightChatService — InsightGenerator owns them and resolve_backend uses them. Delete build_chat_clients (replaced by resolve_backend).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-24 15:00:50 -04:00
parent 0631820fbf
commit a8a661f70a
3 changed files with 158 additions and 640 deletions
@@ -6,11 +6,9 @@ use std::collections::HashMap;
 use std::sync::{Arc, Mutex};
 use tokio::sync::Mutex as TokioMutex;

+use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides};
 use crate::ai::insight_generator::InsightGenerator;
-use crate::ai::llm_client::{ChatMessage, LlmClient, LlmStreamEvent, Tool};
-use crate::ai::ollama::OllamaClient;
-use crate::ai::llamacpp::LlamaCppClient;
-use crate::ai::openrouter::OpenRouterClient;
+use crate::ai::llm_client::{ChatMessage, LlmStreamEvent, Tool};
 use crate::database::InsightDao;
 use crate::database::models::InsertPhotoInsight;
 use crate::otel::global_tracer;
@@ -92,9 +90,6 @@ pub struct ChatTurnResult {
 #[derive(Clone)]
 pub struct InsightChatService {
    generator: Arc<InsightGenerator>,
-    ollama: OllamaClient,
-    openrouter: Option<Arc<OpenRouterClient>>,
-    llamacpp: Option<Arc<LlamaCppClient>>,
    insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
    chat_locks: ChatLockMap,
 }
@@ -102,17 +97,11 @@ pub struct InsightChatService {
 impl InsightChatService {
    pub fn new(
        generator: Arc<InsightGenerator>,
-        ollama: OllamaClient,
-        openrouter: Option<Arc<OpenRouterClient>>,
-        llamacpp: Option<Arc<LlamaCppClient>>,
        insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
        chat_locks: ChatLockMap,
    ) -> Self {
        Self {
            generator,
-            ollama,
-            openrouter,
-            llamacpp,
            insight_dao,
            chat_locks,
        }
@@ -308,16 +297,9 @@ impl InsightChatService {
            .filter(|s| !s.is_empty())
            .unwrap_or_else(|| stored_backend.clone());
        validate_cross_replay(&stored_backend, &effective_backend)?;
-        let is_hybrid = effective_backend == "hybrid";
-        let local_via_llamacpp =
-            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
-        let describes_then_inlines = is_hybrid;
-        span.set_attribute(KeyValue::new("backend", effective_backend.clone()));
+        let kind = BackendKind::parse(&effective_backend)?;
+        span.set_attribute(KeyValue::new("backend", kind.as_str()));

-        // 4. Build the chat backend client. Hybrid → OpenRouter; local with
-        //    `LLM_BACKEND=llamacpp` → llama-swap; otherwise Ollama. Clones
-        //    so per-request sampling/model overrides don't leak into shared
-        //    state.
        let max_iterations = req
            .max_iterations
            .unwrap_or(DEFAULT_MAX_ITERATIONS)
@@ -325,113 +307,36 @@ impl InsightChatService {
        span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64));

        let stored_model = insight.model_version.clone();
-        let custom_model = req
-            .model
-            .clone()
-            .or_else(|| Some(stored_model.clone()))
-            .filter(|m| !m.is_empty());
-
-        let mut ollama_client = self.ollama.clone();
-        let mut openrouter_client: Option<OpenRouterClient> = None;
-        let mut llamacpp_client: Option<LlamaCppClient> = None;
-
-        if is_hybrid {
-            let arc = self.openrouter.as_ref().ok_or_else(|| {
-                anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
-            })?;
-            let mut c: OpenRouterClient = (**arc).clone();
-            if let Some(ref m) = custom_model {
-                c.primary_model = m.clone();
-            }
-            if req.temperature.is_some()
-                || req.top_p.is_some()
-                || req.top_k.is_some()
-                || req.min_p.is_some()
-            {
-                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
-            }
-            if let Some(ctx) = req.num_ctx {
-                c.set_num_ctx(Some(ctx));
-            }
-            openrouter_client = Some(c);
-        } else if local_via_llamacpp {
-            let arc = self.llamacpp.as_ref().ok_or_else(|| {
-                anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured")
-            })?;
-            let mut c: LlamaCppClient = (**arc).clone();
-            if let Some(ref m) = custom_model {
-                c.primary_model = m.clone();
-            }
-            if req.temperature.is_some()
-                || req.top_p.is_some()
-                || req.top_k.is_some()
-                || req.min_p.is_some()
-            {
-                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
-            }
-            if let Some(ctx) = req.num_ctx {
-                c.set_num_ctx(Some(ctx));
-            }
-            llamacpp_client = Some(c);
-        } else {
-            // Pure local (Ollama): model swap. Build a new client when the
-            // chat model differs from the configured one.
-            if let Some(ref m) = custom_model
-                && m != &self.ollama.primary_model
-            {
-                ollama_client = OllamaClient::new(
-                    self.ollama.primary_url.clone(),
-                    self.ollama.fallback_url.clone(),
-                    m.clone(),
-                    Some(m.clone()),
-                );
-            }
-            if req.temperature.is_some()
-                || req.top_p.is_some()
-                || req.top_k.is_some()
-                || req.min_p.is_some()
-            {
-                ollama_client.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
-            }
-            if let Some(ctx) = req.num_ctx {
-                ollama_client.set_num_ctx(Some(ctx));
-            }
-        }
-
-        let chat_backend: &dyn LlmClient = if let Some(ref c) = llamacpp_client {
-            c
-        } else if let Some(ref c) = openrouter_client {
-            c
-        } else {
-            &ollama_client
+        let overrides = SamplingOverrides {
+            model: req.model.clone()
+                .or_else(|| Some(stored_model.clone()))
+                .filter(|m| !m.is_empty()),
+            num_ctx: req.num_ctx,
+            temperature: req.temperature,
+            top_p: req.top_p,
+            top_k: req.top_k,
+            min_p: req.min_p,
        };
-        let model_used = chat_backend.primary_model().to_string();
+        let backend = self.generator.resolve_backend(kind, &overrides).await?;
+        let model_used = backend.model().to_string();
        span.set_attribute(KeyValue::new("model", model_used.clone()));

-        // 5. Decide vision + tool set. In describe-then-inline mode
-        //    (hybrid only) we omit `describe_photo`. In local and llamacpp
-        //    we trust the stored history's first-user shape: if it carries
-        //    `images`, the original model was vision-capable, and we keep
-        //    `describe_photo` available.
+        // 5. Decide vision + tool set. In hybrid (describe-then-inline) mode
+        //    we omit `describe_photo`. Otherwise trust the stored history:
+        //    if the first user message carries images, describe_photo stays.
        let local_first_user_has_image = messages
            .iter()
            .find(|m| m.role == "user")
            .and_then(|m| m.images.as_ref())
            .map(|imgs| !imgs.is_empty())
            .unwrap_or(false);
-        let offer_describe_tool = !describes_then_inlines && local_first_user_has_image;
-        // current_gate_opts(has_vision) sets gate_opts.has_vision = has_vision
-        // and probes the per-table presence flags. Pass `offer_describe_tool`
-        // directly — the `!is_hybrid && local_first_user_has_image` decision
-        // is the chat-path's vision predicate.
+        let offer_describe_tool = backend.images_inline && local_first_user_has_image;
        let gate_opts = self.generator.current_gate_opts_for_persona(
            offer_describe_tool,
            Some((req.user_id, &active_persona)),
        );
        let tools = InsightGenerator::build_tool_definitions(gate_opts);

-        // Image base64 only needed when describe_photo is on the menu. Load
-        // lazily to avoid disk IO when the loop never invokes it.
        let image_base64: Option<String> = if offer_describe_tool {
            self.generator.load_image_as_base64(&normalized).ok()
        } else {
@@ -480,13 +385,13 @@ impl InsightChatService {
            iterations_used = iteration + 1;
            log::info!("Chat iteration {}/{}", iterations_used, max_iterations);

-            let (response, prompt_tokens, eval_tokens) = chat_backend
+            let (response, prompt_tokens, eval_tokens) = backend
+                .chat()
                .chat_with_tools(messages.clone(), tools.clone())
                .await?;
            last_prompt_eval_count = prompt_tokens;
            last_eval_count = eval_tokens;

-            // Ollama rejects non-object tool-call arguments on replay.
            let mut response = response;
            if let Some(ref mut tcs) = response.tool_calls {
                for tc in tcs.iter_mut() {
@@ -514,13 +419,11 @@ impl InsightChatService {
                        .execute_tool(
                            &tool_call.function.name,
                            &tool_call.function.arguments,
-                            &ollama_client,
+                            &backend,
                            &image_base64,
                            &normalized,
                            req.user_id,
                            &active_persona,
-                            &model_used,
-                            &effective_backend,
                            &loop_cx,
                        )
                        .await;
@@ -534,8 +437,6 @@ impl InsightChatService {
        }

        if final_content.is_empty() {
-            // The model never produced a final answer; ask once more without
-            // tools to force a textual reply.
            log::info!(
                "Chat loop exhausted after {} iterations, requesting final answer",
                iterations_used
@@ -543,7 +444,8 @@ impl InsightChatService {
            messages.push(ChatMessage::user(
                "Please write your final answer now without calling any more tools.",
            ));
-            let (final_response, prompt_tokens, eval_tokens) = chat_backend
+            let (final_response, prompt_tokens, eval_tokens) = backend
+                .chat()
                .chat_with_tools(messages.clone(), vec![])
                .await?;
            last_prompt_eval_count = prompt_tokens;
@@ -579,7 +481,8 @@ impl InsightChatService {
                 Capture the key moment or theme. Return ONLY the title, nothing else.",
                final_content
            );
-            let title_raw = chat_backend
+            let title_raw = backend
+                .chat()
                .generate(
                    &title_prompt,
                    Some(
@@ -604,7 +507,7 @@ impl InsightChatService {
                model_version: model_used.clone(),
                is_current: true,
                training_messages: Some(json),
-                backend: effective_backend.clone(),
+                backend: kind.as_str().to_string(),
                fewshot_source_ids: None,
                content_hash: None,
            };
@@ -629,7 +532,7 @@ impl InsightChatService {
            prompt_eval_count: last_prompt_eval_count,
            eval_count: last_eval_count,
            amended_insight_id,
-            backend_used: effective_backend,
+            backend_used: kind.as_str().to_string(),
            model_used,
        })
    }
@@ -818,9 +721,8 @@ impl InsightChatService {
            .map(|s| s.trim().to_lowercase())
            .filter(|s| !s.is_empty())
            .unwrap_or_else(|| stored_backend.clone());
-        validate_cross_replay(&stored_backend, &effective_backend)?;
-        let is_hybrid = effective_backend == "hybrid";
-        let describes_then_inlines = is_hybrid;
+        let kind = BackendKind::parse(&effective_backend)?;
+        validate_cross_replay(&stored_backend, kind.as_str())?;

        let max_iterations = req
            .max_iterations
@@ -828,18 +730,20 @@ impl InsightChatService {
            .clamp(1, env_max_iterations());

        let stored_model = insight.model_version.clone();
-        let custom_model = req
-            .model
-            .clone()
-            .or_else(|| Some(stored_model.clone()))
-            .filter(|m| !m.is_empty());
+        let overrides = SamplingOverrides {
+            model: req.model.clone()
+                .or_else(|| Some(stored_model.clone()))
+                .filter(|m| !m.is_empty()),
+            num_ctx: req.num_ctx,
+            temperature: req.temperature,
+            top_p: req.top_p,
+            top_k: req.top_k,
+            min_p: req.min_p,
+        };
+        let backend = self.generator.resolve_backend(kind, &overrides).await?;
+        let model_used = backend.model().to_string();

-        let (chat_backend_holder, ollama_client) =
-            self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?;
-        let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
-        let model_used = chat_backend.primary_model().to_string();
-
-        // Tool set — local/llamacpp mode + first user turn carries an image →
+        // Tool set — images_inline mode + first user turn carries an image →
        // offer describe_photo. Describe-then-inline mode (hybrid only):
        // visual description was inlined at bootstrap, no describe tool needed.
        let local_first_user_has_image = messages
@@ -848,7 +752,7 @@ impl InsightChatService {
            .and_then(|m| m.images.as_ref())
            .map(|imgs| !imgs.is_empty())
            .unwrap_or(false);
-        let offer_describe_tool = !describes_then_inlines && local_first_user_has_image;
+        let offer_describe_tool = backend.images_inline && local_first_user_has_image;
        let gate_opts = self.generator.current_gate_opts_for_persona(
            offer_describe_tool,
            Some((req.user_id, &active_persona)),
@@ -879,16 +783,13 @@ impl InsightChatService {

        let outcome = self
            .run_streaming_agentic_loop(
-                chat_backend,
-                &ollama_client,
+                &backend,
                &mut messages,
                tools,
                &image_base64,
                &normalized,
                req.user_id,
                &active_persona,
-                &model_used,
-                &effective_backend,
                max_iterations,
                &tx,
            )
@@ -916,7 +817,7 @@ impl InsightChatService {

        let mut amended_insight_id: Option<i32> = None;
        if req.amend {
-            let title = self.generate_title(chat_backend, &final_content).await?;
+            let title = self.generate_title(&backend, &final_content).await?;

            // Amended rows intentionally do not inherit the parent's
            // `fewshot_source_ids`. The parent's few-shot influence is still
@@ -932,7 +833,7 @@ impl InsightChatService {
                model_version: model_used.clone(),
                is_current: true,
                training_messages: Some(json),
-                backend: effective_backend.clone(),
+                backend: kind.as_str().to_string(),
                fewshot_source_ids: None,
                content_hash: None,
            };
@@ -958,7 +859,7 @@ impl InsightChatService {
                eval_tokens: last_eval_count,
                num_ctx: req.num_ctx,
                amended_insight_id,
-                backend_used: effective_backend,
+                backend_used: kind.as_str().to_string(),
                model_used,
            })
            .await;
@@ -984,21 +885,23 @@ impl InsightChatService {
            .filter(|s| !s.trim().is_empty())
            .unwrap_or_else(|| "default".to_string());
        let effective_backend = resolve_bootstrap_backend(req.backend.as_deref())?;
-        let is_hybrid = effective_backend == "hybrid";
-        let local_via_llamacpp =
-            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
-        let describes_then_inlines = is_hybrid;
+        let kind = BackendKind::parse(&effective_backend)?;

        let max_iterations = req
            .max_iterations
            .unwrap_or(DEFAULT_MAX_ITERATIONS)
            .clamp(1, env_max_iterations());

-        let custom_model = req.model.clone().filter(|m| !m.is_empty());
-        let (chat_backend_holder, ollama_client) =
-            self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?;
-        let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
-        let model_used = chat_backend.primary_model().to_string();
+        let overrides = SamplingOverrides {
+            model: req.model.clone().filter(|m| !m.is_empty()),
+            num_ctx: req.num_ctx,
+            temperature: req.temperature,
+            top_p: req.top_p,
+            top_k: req.top_k,
+            min_p: req.min_p,
+        };
+        let backend = self.generator.resolve_backend(kind, &overrides).await?;
+        let model_used = backend.model().to_string();

        // Load image bytes once. RAW preview fallback is handled inside
        // load_image_as_base64. Errors degrade silently — a chat that
@@ -1020,26 +923,17 @@ impl InsightChatService {
            });

        // Describe-then-inline (hybrid only): pre-describe the image so a
-        // text-only chat model gets the visual description inline. llamacpp
-        // sends images directly to the chat model.
-        let visual_block = if describes_then_inlines {
+        // text-only chat model gets the visual description inline.
+        // images_inline backends send images directly to the chat model.
+        let visual_block = if !backend.images_inline {
            match image_base64.as_deref() {
                Some(b64) => {
-                    let described = if local_via_llamacpp {
-                        self.llamacpp
-                            .as_ref()
-                            .expect("local_via_llamacpp guarantees Some")
-                            .describe_image(b64)
-                            .await
-                    } else {
-                        self.ollama.describe_image(b64).await
-                    };
-                    match described {
+                    match backend.local().describe_image(b64).await {
                        Ok(desc) => {
                            format!("Visual description (from local vision model):\n{}\n", desc)
                        }
                        Err(e) => {
-                            log::warn!("{} bootstrap: describe_image failed: {}", effective_backend, e);
+                            log::warn!("{} bootstrap: describe_image failed: {}", kind.as_str(), e);
                            String::new()
                        }
                    }
@@ -1050,10 +944,10 @@ impl InsightChatService {
            String::new()
        };

-        // Tool gates. Local + image present → expose describe_photo so
-        // the chat model can re-look at the photo on demand. Hybrid:
+        // Tool gates. images_inline + image present → expose describe_photo so
+        // the chat model can re-look at the photo on demand. Non-inline:
        // already inlined, no tool needed.
-        let offer_describe_tool = !describes_then_inlines && image_base64.is_some();
+        let offer_describe_tool = backend.images_inline && image_base64.is_some();
        let gate_opts = self.generator.current_gate_opts_for_persona(
            offer_describe_tool,
            Some((req.user_id, &active_persona)),
@@ -1079,23 +973,22 @@ impl InsightChatService {
        );
        let system_msg = ChatMessage::system(system_content);
        let mut user_msg = ChatMessage::user(req.user_message.clone());
-        if !describes_then_inlines && let Some(ref img) = image_base64 {
-            user_msg.images = Some(vec![img.clone()]);
+        if backend.images_inline {
+            if let Some(ref img) = image_base64 {
+                user_msg.images = Some(vec![img.clone()]);
+            }
        }
        let mut messages = vec![system_msg, user_msg];

        let outcome = self
            .run_streaming_agentic_loop(
-                chat_backend,
-                &ollama_client,
+                &backend,
                &mut messages,
                tools,
                &image_base64,
                &normalized,
                req.user_id,
                &active_persona,
-                &model_used,
-                &effective_backend,
                max_iterations,
                &tx,
            )
@@ -1108,7 +1001,7 @@ impl InsightChatService {
            final_content,
        } = outcome;

-        let title = self.generate_title(chat_backend, &final_content).await?;
+        let title = self.generate_title(&backend, &final_content).await?;

        let json = serde_json::to_string(&messages)
            .map_err(|e| anyhow!("failed to serialize chat history: {}", e))?;
@@ -1121,7 +1014,7 @@ impl InsightChatService {
            model_version: model_used.clone(),
            is_current: true,
            training_messages: Some(json),
-            backend: effective_backend.clone(),
+            backend: kind.as_str().to_string(),
            fewshot_source_ids: None,
            content_hash: None,
        };
@@ -1144,7 +1037,7 @@ impl InsightChatService {
                eval_tokens: last_eval_count,
                num_ctx: req.num_ctx,
                amended_insight_id: Some(stored.id),
-                backend_used: effective_backend,
+                backend_used: kind.as_str().to_string(),
                model_used,
            })
            .await;
@@ -1152,95 +1045,12 @@ impl InsightChatService {
        Ok(())
    }

-    /// Set up chat clients (Ollama + optional OpenRouter / LlamaCpp) shared
-    /// by bootstrap and continuation. Returns the chat-side backend client
-    /// (boxed because each backend has a different concrete type) and the
-    /// Ollama client used for describe-image / local tool calls.
-    ///
-    /// `effective_backend` must be one of `"local"` or `"hybrid"` (validated
-    /// upstream). Hybrid → OpenRouter; local with `LLM_BACKEND=llamacpp` →
-    /// llama-swap; pure local → Ollama. Returns the dispatched chat client
-    /// plus the (possibly per-request) Ollama client that the caller uses
-    /// for non-chat helpers (image describe in non-llamacpp mode, tool ops).
-    fn build_chat_clients(
-        &self,
-        effective_backend: &str,
-        custom_model: Option<&str>,
-        req: &ChatTurnRequest,
-    ) -> Result<(Box<dyn LlmClient>, OllamaClient)> {
-        let mut ollama_client = self.ollama.clone();
-
-        if effective_backend == "hybrid" {
-            let arc = self.openrouter.as_ref().ok_or_else(|| {
-                anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
-            })?;
-            let mut c: OpenRouterClient = (**arc).clone();
-            if let Some(m) = custom_model {
-                c.primary_model = m.to_string();
-            }
-            if req.temperature.is_some()
-                || req.top_p.is_some()
-                || req.top_k.is_some()
-                || req.min_p.is_some()
-            {
-                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
-            }
-            if let Some(ctx) = req.num_ctx {
-                c.set_num_ctx(Some(ctx));
-            }
-            return Ok((Box::new(c), ollama_client));
-        }
-
-        // Local mode — env switch decides between Ollama and llama-swap.
-        if crate::ai::local_backend_is_llamacpp()
-            && let Some(arc) = self.llamacpp.as_ref()
-        {
-            let mut c: LlamaCppClient = (**arc).clone();
-            if let Some(m) = custom_model {
-                c.primary_model = m.to_string();
-            }
-            if req.temperature.is_some()
-                || req.top_p.is_some()
-                || req.top_k.is_some()
-                || req.min_p.is_some()
-            {
-                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
-            }
-            if let Some(ctx) = req.num_ctx {
-                c.set_num_ctx(Some(ctx));
-            }
-            return Ok((Box::new(c), ollama_client));
-        }
-
-        if let Some(m) = custom_model
-            && m != self.ollama.primary_model
-        {
-            ollama_client = OllamaClient::new(
-                self.ollama.primary_url.clone(),
-                self.ollama.fallback_url.clone(),
-                m.to_string(),
-                Some(m.to_string()),
-            );
-        }
-        if req.temperature.is_some()
-            || req.top_p.is_some()
-            || req.top_k.is_some()
-            || req.min_p.is_some()
-        {
-            ollama_client.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
-        }
-        if let Some(ctx) = req.num_ctx {
-            ollama_client.set_num_ctx(Some(ctx));
-        }
-        Ok((Box::new(ollama_client.clone()), ollama_client))
-    }
-
    /// Generate a short title via the same chat backend so voice stays
    /// consistent with the body. Mirrors generate_agentic_insight_for_photo's
    /// titling pass.
    async fn generate_title(
        &self,
-        chat_backend: &dyn LlmClient,
+        backend: &ResolvedBackend,
        final_content: &str,
    ) -> Result<String> {
        let title_prompt = format!(
@@ -1248,7 +1058,8 @@ impl InsightChatService {
             Capture the key moment or theme. Return ONLY the title, nothing else.",
            final_content
        );
-        let title_raw = chat_backend
+        let title_raw = backend
+            .chat()
            .generate(
                &title_prompt,
                Some(
@@ -1266,18 +1077,13 @@ impl InsightChatService {
    /// final assistant content.
    async fn run_streaming_agentic_loop(
        &self,
-        chat_backend: &dyn LlmClient,
-        ollama_client: &OllamaClient,
+        backend: &ResolvedBackend,
        messages: &mut Vec<ChatMessage>,
        tools: Vec<Tool>,
        image_base64: &Option<String>,
        normalized: &str,
        user_id: i32,
        active_persona: &str,
-        // Provenance — stamped onto any store_fact tool call made
-        // during this loop. Mirrors the non-streaming chat path.
-        model_used: &str,
-        effective_backend: &str,
        max_iterations: usize,
        tx: &tokio::sync::mpsc::Sender<ChatStreamEvent>,
    ) -> Result<AgenticLoopOutcome> {
@@ -1296,7 +1102,8 @@ impl InsightChatService {
                })
                .await;

-            let mut stream = chat_backend
+            let mut stream = backend
+                .chat()
                .chat_with_tools_stream(messages.clone(), tools.clone())
                .await?;

@@ -1353,13 +1160,11 @@ impl InsightChatService {
                        .execute_tool(
                            &tool_call.function.name,
                            &tool_call.function.arguments,
-                            ollama_client,
+                            backend,
                            image_base64,
                            normalized,
                            user_id,
                            active_persona,
-                            model_used,
-                            effective_backend,
                            &cx,
                        )
                        .await;
@@ -1394,7 +1199,8 @@ impl InsightChatService {
            messages.push(ChatMessage::user(
                "Please write your final answer now without calling any more tools.",
            ));
-            let mut stream = chat_backend
+            let mut stream = backend
+                .chat()
                .chat_with_tools_stream(messages.clone(), vec![])
                .await?;
            let mut final_message: Option<ChatMessage> = None;