ai: extract ResolvedBackend, remove ~480 lines of duplicated dispatch

Replace 5 copies of the ~80-line backend-resolution pattern with a single InsightGenerator::resolve_backend() builder that returns a ResolvedBackend (chat + local clients, BackendKind enum, images_inline flag). Tool dispatch now takes &ResolvedBackend instead of &OllamaClient plus separate model and backend strings. Remove the duplicated ollama/openrouter/llamacpp fields from InsightChatService — InsightGenerator owns them and resolve_backend uses them. Delete build_chat_clients (replaced by resolve_backend).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-24 15:00:50 -04:00
parent 0631820fbf
commit a8a661f70a
3 changed files with 158 additions and 640 deletions
--- a/src/ai/insight_chat.rs
+++ b/src/ai/insight_chat.rs
@@ -6,11 +6,9 @@ use std::collections::HashMap;
 use std::sync::{Arc, Mutex};
 use tokio::sync::Mutex as TokioMutex;
 use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides};
 use crate::ai::insight_generator::InsightGenerator;
-use crate::ai::llm_client::{ChatMessage, LlmClient, LlmStreamEvent, Tool};
+use crate::ai::llm_client::{ChatMessage, LlmStreamEvent, Tool};
 use crate::ai::ollama::OllamaClient;
 use crate::ai::llamacpp::LlamaCppClient;
 use crate::ai::openrouter::OpenRouterClient;
 use crate::database::InsightDao;
 use crate::database::models::InsertPhotoInsight;
 use crate::otel::global_tracer;
@@ -92,9 +90,6 @@ pub struct ChatTurnResult {
 #[derive(Clone)]
 pub struct InsightChatService {
    generator: Arc<InsightGenerator>,
    ollama: OllamaClient,
    openrouter: Option<Arc<OpenRouterClient>>,
    llamacpp: Option<Arc<LlamaCppClient>>,
    insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
    chat_locks: ChatLockMap,
 }
@@ -102,17 +97,11 @@ pub struct InsightChatService {
 impl InsightChatService {
    pub fn new(
        generator: Arc<InsightGenerator>,
        ollama: OllamaClient,
        openrouter: Option<Arc<OpenRouterClient>>,
        llamacpp: Option<Arc<LlamaCppClient>>,
        insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
        chat_locks: ChatLockMap,
    ) -> Self {
        Self {
            generator,
            ollama,
            openrouter,
            llamacpp,
            insight_dao,
            chat_locks,
        }
@@ -308,16 +297,9 @@ impl InsightChatService {
            .filter(|s| !s.is_empty())
            .unwrap_or_else(|| stored_backend.clone());
        validate_cross_replay(&stored_backend, &effective_backend)?;
-        let is_hybrid = effective_backend == "hybrid";
+        let kind = BackendKind::parse(&effective_backend)?;
-        let local_via_llamacpp =
+        span.set_attribute(KeyValue::new("backend", kind.as_str()));
            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
        let describes_then_inlines = is_hybrid;
        span.set_attribute(KeyValue::new("backend", effective_backend.clone()));
        // 4. Build the chat backend client. Hybrid → OpenRouter; local with
        //    `LLM_BACKEND=llamacpp` → llama-swap; otherwise Ollama. Clones
        //    so per-request sampling/model overrides don't leak into shared
        //    state.
        let max_iterations = req
            .max_iterations
            .unwrap_or(DEFAULT_MAX_ITERATIONS)
@@ -325,113 +307,36 @@ impl InsightChatService {
        span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64));
        let stored_model = insight.model_version.clone();
-        let custom_model = req
+        let overrides = SamplingOverrides {
-            .model
+            model: req.model.clone()
-            .clone()
+                .or_else(|| Some(stored_model.clone()))
-            .or_else(|| Some(stored_model.clone()))
+                .filter(|m| !m.is_empty()),
-            .filter(|m| !m.is_empty());
+            num_ctx: req.num_ctx,
-
+            temperature: req.temperature,
-        let mut ollama_client = self.ollama.clone();
+            top_p: req.top_p,
-        let mut openrouter_client: Option<OpenRouterClient> = None;
+            top_k: req.top_k,
-        let mut llamacpp_client: Option<LlamaCppClient> = None;
+            min_p: req.min_p,
        if is_hybrid {
            let arc = self.openrouter.as_ref().ok_or_else(|| {
                anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
            })?;
            let mut c: OpenRouterClient = (**arc).clone();
            if let Some(ref m) = custom_model {
                c.primary_model = m.clone();
            }
            if req.temperature.is_some()
                || req.top_p.is_some()
                || req.top_k.is_some()
                || req.min_p.is_some()
            {
                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
            }
            if let Some(ctx) = req.num_ctx {
                c.set_num_ctx(Some(ctx));
            }
            openrouter_client = Some(c);
        } else if local_via_llamacpp {
            let arc = self.llamacpp.as_ref().ok_or_else(|| {
                anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured")
            })?;
            let mut c: LlamaCppClient = (**arc).clone();
            if let Some(ref m) = custom_model {
                c.primary_model = m.clone();
            }
            if req.temperature.is_some()
                || req.top_p.is_some()
                || req.top_k.is_some()
                || req.min_p.is_some()
            {
                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
            }
            if let Some(ctx) = req.num_ctx {
                c.set_num_ctx(Some(ctx));
            }
            llamacpp_client = Some(c);
        } else {
            // Pure local (Ollama): model swap. Build a new client when the
            // chat model differs from the configured one.
            if let Some(ref m) = custom_model
                && m != &self.ollama.primary_model
            {
                ollama_client = OllamaClient::new(
                    self.ollama.primary_url.clone(),
                    self.ollama.fallback_url.clone(),
                    m.clone(),
                    Some(m.clone()),
                );
            }
            if req.temperature.is_some()
                || req.top_p.is_some()
                || req.top_k.is_some()
                || req.min_p.is_some()
            {
                ollama_client.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
            }
            if let Some(ctx) = req.num_ctx {
                ollama_client.set_num_ctx(Some(ctx));
            }
        }
        let chat_backend: &dyn LlmClient = if let Some(ref c) = llamacpp_client {
            c
        } else if let Some(ref c) = openrouter_client {
            c
        } else {
            &ollama_client
        };
-        let model_used = chat_backend.primary_model().to_string();
+        let backend = self.generator.resolve_backend(kind, &overrides).await?;
        let model_used = backend.model().to_string();
        span.set_attribute(KeyValue::new("model", model_used.clone()));
-        // 5. Decide vision + tool set. In describe-then-inline mode
+        // 5. Decide vision + tool set. In hybrid (describe-then-inline) mode
-        //    (hybrid only) we omit `describe_photo`. In local and llamacpp
+        //    we omit `describe_photo`. Otherwise trust the stored history:
-        //    we trust the stored history's first-user shape: if it carries
+        //    if the first user message carries images, describe_photo stays.
        //    `images`, the original model was vision-capable, and we keep
        //    `describe_photo` available.
        let local_first_user_has_image = messages
            .iter()
            .find(|m| m.role == "user")
            .and_then(|m| m.images.as_ref())
            .map(|imgs| !imgs.is_empty())
            .unwrap_or(false);
-        let offer_describe_tool = !describes_then_inlines && local_first_user_has_image;
+        let offer_describe_tool = backend.images_inline && local_first_user_has_image;
        // current_gate_opts(has_vision) sets gate_opts.has_vision = has_vision
        // and probes the per-table presence flags. Pass `offer_describe_tool`
        // directly — the `!is_hybrid && local_first_user_has_image` decision
        // is the chat-path's vision predicate.
        let gate_opts = self.generator.current_gate_opts_for_persona(
            offer_describe_tool,
            Some((req.user_id, &active_persona)),
        );
        let tools = InsightGenerator::build_tool_definitions(gate_opts);
        // Image base64 only needed when describe_photo is on the menu. Load
        // lazily to avoid disk IO when the loop never invokes it.
        let image_base64: Option<String> = if offer_describe_tool {
            self.generator.load_image_as_base64(&normalized).ok()
        } else {
@@ -480,13 +385,13 @@ impl InsightChatService {
            iterations_used = iteration + 1;
            log::info!("Chat iteration {}/{}", iterations_used, max_iterations);
-            let (response, prompt_tokens, eval_tokens) = chat_backend
+            let (response, prompt_tokens, eval_tokens) = backend
                .chat()
                .chat_with_tools(messages.clone(), tools.clone())
                .await?;
            last_prompt_eval_count = prompt_tokens;
            last_eval_count = eval_tokens;
            // Ollama rejects non-object tool-call arguments on replay.
            let mut response = response;
            if let Some(ref mut tcs) = response.tool_calls {
                for tc in tcs.iter_mut() {
@@ -514,13 +419,11 @@ impl InsightChatService {
                        .execute_tool(
                            &tool_call.function.name,
                            &tool_call.function.arguments,
-                            &ollama_client,
+                            &backend,
                            &image_base64,
                            &normalized,
                            req.user_id,
                            &active_persona,
                            &model_used,
                            &effective_backend,
                            &loop_cx,
                        )
                        .await;
@@ -534,8 +437,6 @@ impl InsightChatService {
        }
        if final_content.is_empty() {
            // The model never produced a final answer; ask once more without
            // tools to force a textual reply.
            log::info!(
                "Chat loop exhausted after {} iterations, requesting final answer",
                iterations_used
@@ -543,7 +444,8 @@ impl InsightChatService {
            messages.push(ChatMessage::user(
                "Please write your final answer now without calling any more tools.",
            ));
-            let (final_response, prompt_tokens, eval_tokens) = chat_backend
+            let (final_response, prompt_tokens, eval_tokens) = backend
                .chat()
                .chat_with_tools(messages.clone(), vec![])
                .await?;
            last_prompt_eval_count = prompt_tokens;
@@ -579,7 +481,8 @@ impl InsightChatService {
                 Capture the key moment or theme. Return ONLY the title, nothing else.",
                final_content
            );
-            let title_raw = chat_backend
+            let title_raw = backend
                .chat()
                .generate(
                    &title_prompt,
                    Some(
@@ -604,7 +507,7 @@ impl InsightChatService {
                model_version: model_used.clone(),
                is_current: true,
                training_messages: Some(json),
-                backend: effective_backend.clone(),
+                backend: kind.as_str().to_string(),
                fewshot_source_ids: None,
                content_hash: None,
            };
@@ -629,7 +532,7 @@ impl InsightChatService {
            prompt_eval_count: last_prompt_eval_count,
            eval_count: last_eval_count,
            amended_insight_id,
-            backend_used: effective_backend,
+            backend_used: kind.as_str().to_string(),
            model_used,
        })
    }
@@ -818,9 +721,8 @@ impl InsightChatService {
            .map(|s| s.trim().to_lowercase())
            .filter(|s| !s.is_empty())
            .unwrap_or_else(|| stored_backend.clone());
-        validate_cross_replay(&stored_backend, &effective_backend)?;
+        let kind = BackendKind::parse(&effective_backend)?;
-        let is_hybrid = effective_backend == "hybrid";
+        validate_cross_replay(&stored_backend, kind.as_str())?;
        let describes_then_inlines = is_hybrid;
        let max_iterations = req
            .max_iterations
@@ -828,18 +730,20 @@ impl InsightChatService {
            .clamp(1, env_max_iterations());
        let stored_model = insight.model_version.clone();
-        let custom_model = req
+        let overrides = SamplingOverrides {
-            .model
+            model: req.model.clone()
-            .clone()
+                .or_else(|| Some(stored_model.clone()))
-            .or_else(|| Some(stored_model.clone()))
+                .filter(|m| !m.is_empty()),
-            .filter(|m| !m.is_empty());
+            num_ctx: req.num_ctx,
            temperature: req.temperature,
            top_p: req.top_p,
            top_k: req.top_k,
            min_p: req.min_p,
        };
        let backend = self.generator.resolve_backend(kind, &overrides).await?;
        let model_used = backend.model().to_string();
-        let (chat_backend_holder, ollama_client) =
+        // Tool set — images_inline mode + first user turn carries an image →
            self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?;
        let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
        let model_used = chat_backend.primary_model().to_string();
        // Tool set — local/llamacpp mode + first user turn carries an image →
        // offer describe_photo. Describe-then-inline mode (hybrid only):
        // visual description was inlined at bootstrap, no describe tool needed.
        let local_first_user_has_image = messages
@@ -848,7 +752,7 @@ impl InsightChatService {
            .and_then(|m| m.images.as_ref())
            .map(|imgs| !imgs.is_empty())
            .unwrap_or(false);
-        let offer_describe_tool = !describes_then_inlines && local_first_user_has_image;
+        let offer_describe_tool = backend.images_inline && local_first_user_has_image;
        let gate_opts = self.generator.current_gate_opts_for_persona(
            offer_describe_tool,
            Some((req.user_id, &active_persona)),
@@ -879,16 +783,13 @@ impl InsightChatService {
        let outcome = self
            .run_streaming_agentic_loop(
-                chat_backend,
+                &backend,
                &ollama_client,
                &mut messages,
                tools,
                &image_base64,
                &normalized,
                req.user_id,
                &active_persona,
                &model_used,
                &effective_backend,
                max_iterations,
                &tx,
            )
@@ -916,7 +817,7 @@ impl InsightChatService {
        let mut amended_insight_id: Option<i32> = None;
        if req.amend {
-            let title = self.generate_title(chat_backend, &final_content).await?;
+            let title = self.generate_title(&backend, &final_content).await?;
            // Amended rows intentionally do not inherit the parent's
            // `fewshot_source_ids`. The parent's few-shot influence is still
@@ -932,7 +833,7 @@ impl InsightChatService {
                model_version: model_used.clone(),
                is_current: true,
                training_messages: Some(json),
-                backend: effective_backend.clone(),
+                backend: kind.as_str().to_string(),
                fewshot_source_ids: None,
                content_hash: None,
            };
@@ -958,7 +859,7 @@ impl InsightChatService {
                eval_tokens: last_eval_count,
                num_ctx: req.num_ctx,
                amended_insight_id,
-                backend_used: effective_backend,
+                backend_used: kind.as_str().to_string(),
                model_used,
            })
            .await;
@@ -984,21 +885,23 @@ impl InsightChatService {
            .filter(|s| !s.trim().is_empty())
            .unwrap_or_else(|| "default".to_string());
        let effective_backend = resolve_bootstrap_backend(req.backend.as_deref())?;
-        let is_hybrid = effective_backend == "hybrid";
+        let kind = BackendKind::parse(&effective_backend)?;
        let local_via_llamacpp =
            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
        let describes_then_inlines = is_hybrid;
        let max_iterations = req
            .max_iterations
            .unwrap_or(DEFAULT_MAX_ITERATIONS)
            .clamp(1, env_max_iterations());
-        let custom_model = req.model.clone().filter(|m| !m.is_empty());
+        let overrides = SamplingOverrides {
-        let (chat_backend_holder, ollama_client) =
+            model: req.model.clone().filter(|m| !m.is_empty()),
-            self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?;
+            num_ctx: req.num_ctx,
-        let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
+            temperature: req.temperature,
-        let model_used = chat_backend.primary_model().to_string();
+            top_p: req.top_p,
            top_k: req.top_k,
            min_p: req.min_p,
        };
        let backend = self.generator.resolve_backend(kind, &overrides).await?;
        let model_used = backend.model().to_string();
        // Load image bytes once. RAW preview fallback is handled inside
        // load_image_as_base64. Errors degrade silently — a chat that
@@ -1020,26 +923,17 @@ impl InsightChatService {
            });
        // Describe-then-inline (hybrid only): pre-describe the image so a
-        // text-only chat model gets the visual description inline. llamacpp
+        // text-only chat model gets the visual description inline.
-        // sends images directly to the chat model.
+        // images_inline backends send images directly to the chat model.
-        let visual_block = if describes_then_inlines {
+        let visual_block = if !backend.images_inline {
            match image_base64.as_deref() {
                Some(b64) => {
-                    let described = if local_via_llamacpp {
+                    match backend.local().describe_image(b64).await {
                        self.llamacpp
                            .as_ref()
                            .expect("local_via_llamacpp guarantees Some")
                            .describe_image(b64)
                            .await
                    } else {
                        self.ollama.describe_image(b64).await
                    };
                    match described {
                        Ok(desc) => {
                            format!("Visual description (from local vision model):\n{}\n", desc)
                        }
                        Err(e) => {
-                            log::warn!("{} bootstrap: describe_image failed: {}", effective_backend, e);
+                            log::warn!("{} bootstrap: describe_image failed: {}", kind.as_str(), e);
                            String::new()
                        }
                    }
@@ -1050,10 +944,10 @@ impl InsightChatService {
            String::new()
        };
-        // Tool gates. Local + image present → expose describe_photo so
+        // Tool gates. images_inline + image present → expose describe_photo so
-        // the chat model can re-look at the photo on demand. Hybrid:
+        // the chat model can re-look at the photo on demand. Non-inline:
        // already inlined, no tool needed.
-        let offer_describe_tool = !describes_then_inlines && image_base64.is_some();
+        let offer_describe_tool = backend.images_inline && image_base64.is_some();
        let gate_opts = self.generator.current_gate_opts_for_persona(
            offer_describe_tool,
            Some((req.user_id, &active_persona)),
@@ -1079,23 +973,22 @@ impl InsightChatService {
        );
        let system_msg = ChatMessage::system(system_content);
        let mut user_msg = ChatMessage::user(req.user_message.clone());
-        if !describes_then_inlines && let Some(ref img) = image_base64 {
+        if backend.images_inline {
-            user_msg.images = Some(vec![img.clone()]);
+            if let Some(ref img) = image_base64 {
                user_msg.images = Some(vec![img.clone()]);
            }
        }
        let mut messages = vec![system_msg, user_msg];
        let outcome = self
            .run_streaming_agentic_loop(
-                chat_backend,
+                &backend,
                &ollama_client,
                &mut messages,
                tools,
                &image_base64,
                &normalized,
                req.user_id,
                &active_persona,
                &model_used,
                &effective_backend,
                max_iterations,
                &tx,
            )
@@ -1108,7 +1001,7 @@ impl InsightChatService {
            final_content,
        } = outcome;
-        let title = self.generate_title(chat_backend, &final_content).await?;
+        let title = self.generate_title(&backend, &final_content).await?;
        let json = serde_json::to_string(&messages)
            .map_err(|e| anyhow!("failed to serialize chat history: {}", e))?;
@@ -1121,7 +1014,7 @@ impl InsightChatService {
            model_version: model_used.clone(),
            is_current: true,
            training_messages: Some(json),
-            backend: effective_backend.clone(),
+            backend: kind.as_str().to_string(),
            fewshot_source_ids: None,
            content_hash: None,
        };
@@ -1144,7 +1037,7 @@ impl InsightChatService {
                eval_tokens: last_eval_count,
                num_ctx: req.num_ctx,
                amended_insight_id: Some(stored.id),
-                backend_used: effective_backend,
+                backend_used: kind.as_str().to_string(),
                model_used,
            })
            .await;
@@ -1152,95 +1045,12 @@ impl InsightChatService {
        Ok(())
    }
    /// Set up chat clients (Ollama + optional OpenRouter / LlamaCpp) shared
    /// by bootstrap and continuation. Returns the chat-side backend client
    /// (boxed because each backend has a different concrete type) and the
    /// Ollama client used for describe-image / local tool calls.
    ///
    /// `effective_backend` must be one of `"local"` or `"hybrid"` (validated
    /// upstream). Hybrid → OpenRouter; local with `LLM_BACKEND=llamacpp` →
    /// llama-swap; pure local → Ollama. Returns the dispatched chat client
    /// plus the (possibly per-request) Ollama client that the caller uses
    /// for non-chat helpers (image describe in non-llamacpp mode, tool ops).
    fn build_chat_clients(
        &self,
        effective_backend: &str,
        custom_model: Option<&str>,
        req: &ChatTurnRequest,
    ) -> Result<(Box<dyn LlmClient>, OllamaClient)> {
        let mut ollama_client = self.ollama.clone();
        if effective_backend == "hybrid" {
            let arc = self.openrouter.as_ref().ok_or_else(|| {
                anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
            })?;
            let mut c: OpenRouterClient = (**arc).clone();
            if let Some(m) = custom_model {
                c.primary_model = m.to_string();
            }
            if req.temperature.is_some()
                || req.top_p.is_some()
                || req.top_k.is_some()
                || req.min_p.is_some()
            {
                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
            }
            if let Some(ctx) = req.num_ctx {
                c.set_num_ctx(Some(ctx));
            }
            return Ok((Box::new(c), ollama_client));
        }
        // Local mode — env switch decides between Ollama and llama-swap.
        if crate::ai::local_backend_is_llamacpp()
            && let Some(arc) = self.llamacpp.as_ref()
        {
            let mut c: LlamaCppClient = (**arc).clone();
            if let Some(m) = custom_model {
                c.primary_model = m.to_string();
            }
            if req.temperature.is_some()
                || req.top_p.is_some()
                || req.top_k.is_some()
                || req.min_p.is_some()
            {
                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
            }
            if let Some(ctx) = req.num_ctx {
                c.set_num_ctx(Some(ctx));
            }
            return Ok((Box::new(c), ollama_client));
        }
        if let Some(m) = custom_model
            && m != self.ollama.primary_model
        {
            ollama_client = OllamaClient::new(
                self.ollama.primary_url.clone(),
                self.ollama.fallback_url.clone(),
                m.to_string(),
                Some(m.to_string()),
            );
        }
        if req.temperature.is_some()
            || req.top_p.is_some()
            || req.top_k.is_some()
            || req.min_p.is_some()
        {
            ollama_client.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
        }
        if let Some(ctx) = req.num_ctx {
            ollama_client.set_num_ctx(Some(ctx));
        }
        Ok((Box::new(ollama_client.clone()), ollama_client))
    }
    /// Generate a short title via the same chat backend so voice stays
    /// consistent with the body. Mirrors generate_agentic_insight_for_photo's
    /// titling pass.
    async fn generate_title(
        &self,
-        chat_backend: &dyn LlmClient,
+        backend: &ResolvedBackend,
        final_content: &str,
    ) -> Result<String> {
        let title_prompt = format!(
@@ -1248,7 +1058,8 @@ impl InsightChatService {
             Capture the key moment or theme. Return ONLY the title, nothing else.",
            final_content
        );
-        let title_raw = chat_backend
+        let title_raw = backend
            .chat()
            .generate(
                &title_prompt,
                Some(
@@ -1266,18 +1077,13 @@ impl InsightChatService {
    /// final assistant content.
    async fn run_streaming_agentic_loop(
        &self,
-        chat_backend: &dyn LlmClient,
+        backend: &ResolvedBackend,
        ollama_client: &OllamaClient,
        messages: &mut Vec<ChatMessage>,
        tools: Vec<Tool>,
        image_base64: &Option<String>,
        normalized: &str,
        user_id: i32,
        active_persona: &str,
        // Provenance — stamped onto any store_fact tool call made
        // during this loop. Mirrors the non-streaming chat path.
        model_used: &str,
        effective_backend: &str,
        max_iterations: usize,
        tx: &tokio::sync::mpsc::Sender<ChatStreamEvent>,
    ) -> Result<AgenticLoopOutcome> {
@@ -1296,7 +1102,8 @@ impl InsightChatService {
                })
                .await;
-            let mut stream = chat_backend
+            let mut stream = backend
                .chat()
                .chat_with_tools_stream(messages.clone(), tools.clone())
                .await?;
@@ -1353,13 +1160,11 @@ impl InsightChatService {
                        .execute_tool(
                            &tool_call.function.name,
                            &tool_call.function.arguments,
-                            ollama_client,
+                            backend,
                            image_base64,
                            normalized,
                            user_id,
                            active_persona,
                            model_used,
                            effective_backend,
                            &cx,
                        )
                        .await;
@@ -1394,7 +1199,8 @@ impl InsightChatService {
            messages.push(ChatMessage::user(
                "Please write your final answer now without calling any more tools.",
            ));
-            let mut stream = chat_backend
+            let mut stream = backend
                .chat()
                .chat_with_tools_stream(messages.clone(), vec![])
                .await?;
            let mut final_message: Option<ChatMessage> = None;
--- a/src/ai/insight_generator.rs
+++ b/src/ai/insight_generator.rs
@@ -1594,29 +1594,24 @@ Return ONLY the summary, nothing else."#,
        &self,
        tool_name: &str,
        arguments: &serde_json::Value,
-        ollama: &OllamaClient,
+        backend: &ResolvedBackend,
        image_base64: &Option<String>,
        file_path: &str,
        user_id: i32,
        persona_id: &str,
        // Provenance — written into entity_facts.created_by_* when
        // the loop calls store_fact. The caller knows the actual
        // chat-runtime model and backend (which may differ from
        // ollama.primary_model in hybrid mode where chat lives on
        // OpenRouter while Ollama still handles vision).
        model: &str,
        backend: &str,
        cx: &opentelemetry::Context,
    ) -> String {
        let model = backend.model();
        let backend_label = backend.kind.as_str();
        let result = match tool_name {
-            "search_rag" => self.tool_search_rag(arguments, ollama, cx).await,
+            "search_rag" => self.tool_search_rag(arguments, backend.local(), cx).await,
            "search_messages" => self.tool_search_messages(arguments, cx).await,
            "get_sms_messages" => self.tool_get_sms_messages(arguments, cx).await,
            "get_calendar_events" => self.tool_get_calendar_events(arguments, cx).await,
            "get_location_history" => self.tool_get_location_history(arguments, cx).await,
            "get_file_tags" => self.tool_get_file_tags(arguments, cx).await,
            "get_faces_in_photo" => self.tool_get_faces_in_photo(arguments, cx).await,
-            "describe_photo" => self.tool_describe_photo(ollama, image_base64).await,
+            "describe_photo" => self.tool_describe_photo(backend.local(), image_base64).await,
            "reverse_geocode" => self.tool_reverse_geocode(arguments).await,
            "get_personal_place_at" => self.tool_get_personal_place_at(arguments).await,
            "recall_entities" => self.tool_recall_entities(arguments, cx).await,
@@ -1624,19 +1619,19 @@ Return ONLY the summary, nothing else."#,
                self.tool_recall_facts_for_photo(arguments, user_id, persona_id, cx)
                    .await
            }
-            "store_entity" => self.tool_store_entity(arguments, ollama, cx).await,
+            "store_entity" => self.tool_store_entity(arguments, cx).await,
            "store_fact" => {
                self.tool_store_fact(
-                    arguments, file_path, user_id, persona_id, model, backend, cx,
+                    arguments, file_path, user_id, persona_id, model, backend_label, cx,
                )
                .await
            }
            "update_fact" => {
-                self.tool_update_fact(arguments, user_id, persona_id, model, backend, cx)
+                self.tool_update_fact(arguments, user_id, persona_id, model, backend_label, cx)
                    .await
            }
            "supersede_fact" => {
-                self.tool_supersede_fact(arguments, user_id, persona_id, model, backend, cx)
+                self.tool_supersede_fact(arguments, user_id, persona_id, model, backend_label, cx)
                    .await
            }
            "get_current_datetime" => Self::tool_get_current_datetime(),
@@ -1654,7 +1649,7 @@ Return ONLY the summary, nothing else."#,
    async fn tool_search_rag(
        &self,
        args: &serde_json::Value,
-        ollama: &OllamaClient,
+        local: &dyn LlmClient,
        _cx: &opentelemetry::Context,
    ) -> String {
        let query = match args.get("query").and_then(|v| v.as_str()) {
@@ -1718,7 +1713,7 @@ Return ONLY the summary, nothing else."#,
        };
        let final_results = if rerank_enabled && results.len() > limit {
-            match self.rerank_with_llm(&query, &results, limit, ollama).await {
+            match self.rerank_with_llm(&query, &results, limit, local).await {
                Ok(reordered) => reordered,
                Err(e) => {
                    log::warn!("rerank failed, using vector order: {}", e);
@@ -1744,7 +1739,7 @@ Return ONLY the summary, nothing else."#,
        query: &str,
        candidates: &[String],
        limit: usize,
-        ollama: &OllamaClient,
+        local: &dyn LlmClient,
    ) -> Result<Vec<String>> {
        let query_preview: String = query.chars().take(60).collect();
        log::info!(
@@ -1785,15 +1780,7 @@ Return ONLY the summary, nothing else."#,
        let system = Some(
            "You are a terse relevance ranker. You output only numbers separated by commas.",
        );
-        let response = if crate::ai::local_backend_is_llamacpp() {
+        let response = local.generate(&prompt, system, None).await?;
            if let Some(ref lc) = self.llamacpp {
                lc.generate(&prompt, system, None).await?
            } else {
                ollama.generate_no_think(&prompt, system).await?
            }
        } else {
            ollama.generate_no_think(&prompt, system).await?
        };
        log::info!(
            "rerank: finished in {} ms (prompt={} chars)",
            started.elapsed().as_millis(),
@@ -2365,31 +2352,17 @@ Return ONLY the summary, nothing else."#,
        out
    }
    /// Tool: describe_photo — generate a visual description of the photo.
    /// Routes through llama-swap when `LLM_BACKEND=llamacpp`, Ollama otherwise.
    async fn tool_describe_photo(
        &self,
-        ollama: &OllamaClient,
+        local: &dyn LlmClient,
        image_base64: &Option<String>,
    ) -> String {
        log::info!("tool_describe_photo: generating visual description");
        match image_base64 {
-            Some(img) => {
+            Some(img) => match local.describe_image(img).await {
-                let result = if crate::ai::local_backend_is_llamacpp() {
+                Ok(desc) => desc,
-                    if let Some(ref lc) = self.llamacpp {
+                Err(e) => format!("Error describing photo: {}", e),
-                        lc.describe_image(img).await
+            },
                    } else {
                        ollama.generate_photo_description(img).await
                    }
                } else {
                    ollama.generate_photo_description(img).await
                };
                match result {
                    Ok(desc) => desc,
                    Err(e) => format!("Error describing photo: {}", e),
                }
            }
            None => "No image available for description.".to_string(),
        }
    }
@@ -2635,7 +2608,6 @@ Return ONLY the summary, nothing else."#,
    async fn tool_store_entity(
        &self,
        args: &serde_json::Value,
        _ollama: &OllamaClient,
        cx: &opentelemetry::Context,
    ) -> String {
        use crate::database::models::InsertEntity;
@@ -3775,243 +3747,25 @@ Return ONLY the summary, nothing else."#,
        span.set_attribute(KeyValue::new("file_path", file_path.clone()));
        span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64));
-        // 1a. Resolve backend label (defaults to "local").
+        // 1. Resolve backend + build clients.
-        let backend_label = backend
+        let kind = BackendKind::parse(
-            .as_deref()
+            backend.as_deref().unwrap_or("local"),
-            .map(|s| s.trim().to_lowercase())
+        )?;
-            .filter(|s| !s.is_empty())
+        span.set_attribute(KeyValue::new("backend", kind.as_str()));
-            .unwrap_or_else(|| "local".to_string());
+        let overrides = SamplingOverrides {
-        if !matches!(backend_label.as_str(), "local" | "hybrid") {
+            model: custom_model,
-            return Err(anyhow::anyhow!(
+            num_ctx,
-                "unknown backend '{}'; expected 'local' or 'hybrid'",
+            temperature,
-                backend_label
+            top_p,
-            ));
+            top_k,
-        }
+            min_p,
        span.set_attribute(KeyValue::new("backend", backend_label.clone()));
        let is_hybrid = backend_label == "hybrid";
        // `LLM_BACKEND=llamacpp` swaps Ollama out for llama-swap as the
        // "local" stack — chat + embeddings route through llama-swap.
        // llamacpp models receive images directly (vision-capable); only
        // hybrid mode (OpenRouter chat) uses describe-then-inline.
        let local_via_llamacpp =
            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
        let describes_then_inlines = is_hybrid;
        let ollama_is_chat = !is_hybrid && !local_via_llamacpp;
        // 1b. Always build an Ollama client. In local mode it owns the chat
        //     loop; in hybrid/llamacpp mode it still handles tool-local calls
        //     (e.g. future embedding-backed tools). The chat backend is
        //     selected separately below.
        //     Sampling overrides only apply when Ollama is the chat backend.
        let apply_sampling_to_ollama = ollama_is_chat;
        let mut ollama_client = if let Some(ref model) = custom_model
            && ollama_is_chat
        {
            log::info!("Using custom model for agentic: {}", model);
            span.set_attribute(KeyValue::new("custom_model", model.clone()));
            OllamaClient::new(
                self.ollama.primary_url.clone(),
                self.ollama.fallback_url.clone(),
                model.clone(),
                Some(model.clone()),
            )
        } else {
            if ollama_is_chat {
                span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone()));
            }
            self.ollama.clone()
        };
        if apply_sampling_to_ollama {
            if let Some(ctx) = num_ctx {
                log::info!("Using custom context size: {}", ctx);
                span.set_attribute(KeyValue::new("num_ctx", ctx as i64));
                ollama_client.set_num_ctx(Some(ctx));
            }
            if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() {
                log::info!(
                    "Using sampling params — temperature: {:?}, top_p: {:?}, top_k: {:?}, min_p: {:?}",
                    temperature,
                    top_p,
                    top_k,
                    min_p
                );
                if let Some(t) = temperature {
                    span.set_attribute(KeyValue::new("temperature", t as f64));
                }
                if let Some(p) = top_p {
                    span.set_attribute(KeyValue::new("top_p", p as f64));
                }
                if let Some(k) = top_k {
                    span.set_attribute(KeyValue::new("top_k", k as i64));
                }
                if let Some(m) = min_p {
                    span.set_attribute(KeyValue::new("min_p", m as f64));
                }
                ollama_client.set_sampling_params(temperature, top_p, top_k, min_p);
            }
        }
        // 1c. In hybrid mode, clone the configured OpenRouter client and
        //     apply per-request overrides.
        let openrouter_client: Option<OpenRouterClient> = if is_hybrid {
            let arc = self.openrouter.as_ref().ok_or_else(|| {
                anyhow::anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
            })?;
            let mut c: OpenRouterClient = (**arc).clone();
            if let Some(ref m) = custom_model {
                c.primary_model = m.clone();
                span.set_attribute(KeyValue::new("custom_model", m.clone()));
            }
            span.set_attribute(KeyValue::new("openrouter_model", c.primary_model.clone()));
            if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() {
                if let Some(t) = temperature {
                    span.set_attribute(KeyValue::new("temperature", t as f64));
                }
                if let Some(p) = top_p {
                    span.set_attribute(KeyValue::new("top_p", p as f64));
                }
                if let Some(k) = top_k {
                    span.set_attribute(KeyValue::new("top_k", k as i64));
                }
                if let Some(m) = min_p {
                    span.set_attribute(KeyValue::new("min_p", m as f64));
                }
                c.set_sampling_params(temperature, top_p, top_k, min_p);
            }
            if let Some(ctx) = num_ctx {
                span.set_attribute(KeyValue::new("num_ctx", ctx as i64));
                c.set_num_ctx(Some(ctx));
            }
            Some(c)
        } else {
            None
        };
        // 1d. When `LLM_BACKEND=llamacpp` and we're in local mode (not
        //     hybrid — hybrid keeps chat on OpenRouter), clone the llamacpp
        //     client and apply per-request overrides. Same shape as the
        //     openrouter branch above; describe_image will route through
        //     the vision slot configured on the client.
        let llamacpp_client: Option<LlamaCppClient> = if local_via_llamacpp && !is_hybrid {
            let arc = self.llamacpp.as_ref().ok_or_else(|| {
                anyhow::anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured")
            })?;
            let mut c: LlamaCppClient = (**arc).clone();
            if let Some(ref m) = custom_model {
                c.primary_model = m.clone();
                span.set_attribute(KeyValue::new("custom_model", m.clone()));
            }
            span.set_attribute(KeyValue::new("llamacpp_model", c.primary_model.clone()));
            if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() {
                if let Some(t) = temperature {
                    span.set_attribute(KeyValue::new("temperature", t as f64));
                }
                if let Some(p) = top_p {
                    span.set_attribute(KeyValue::new("top_p", p as f64));
                }
                if let Some(k) = top_k {
                    span.set_attribute(KeyValue::new("top_k", k as i64));
                }
                if let Some(m) = min_p {
                    span.set_attribute(KeyValue::new("min_p", m as f64));
                }
                c.set_sampling_params(temperature, top_p, top_k, min_p);
            }
            if let Some(ctx) = num_ctx {
                span.set_attribute(KeyValue::new("num_ctx", ctx as i64));
                c.set_num_ctx(Some(ctx));
            }
            Some(c)
        } else {
            None
        };
        let backend = self.resolve_backend(kind, &overrides).await?;
        span.set_attribute(KeyValue::new("model", backend.model().to_string()));
        span.set_attribute(KeyValue::new("images_inline", backend.images_inline));
        let insight_cx = current_cx.with_span(span);
        // 2. Verify chat model supports tool calling.
        //    - local: existing Ollama model availability + capability check.
        //    - hybrid: trust the operator's curated allowlist
        //      (OPENROUTER_ALLOWED_MODELS) — no live precheck. A bad model id
        //      surfaces as a chat-call error on the next step.
        let has_vision = if describes_then_inlines {
            // Hybrid: chat model never sees images — describe-then-inject.
            true
        } else if local_via_llamacpp {
            // llama-swap models receive images directly via OpenAI content
            // parts. Capability probing isn't available (no `/api/show`),
            // so assume vision support; a misconfigured model surfaces as
            // a chat-call error.
            true
        } else {
            if let Some(ref model_name) = custom_model {
                let available_on_primary =
                    OllamaClient::is_model_available(&ollama_client.primary_url, model_name)
                        .await
                        .unwrap_or(false);
                let available_on_fallback =
                    if let Some(ref fallback_url) = ollama_client.fallback_url {
                        OllamaClient::is_model_available(fallback_url, model_name)
                            .await
                            .unwrap_or(false)
                    } else {
                        false
                    };
                if !available_on_primary && !available_on_fallback {
                    anyhow::bail!(
                        "model not available: '{}' not found on any configured server",
                        model_name
                    );
                }
            }
            let model_name_for_caps = &ollama_client.primary_model;
            let capabilities = match OllamaClient::check_model_capabilities(
                &ollama_client.primary_url,
                model_name_for_caps,
            )
            .await
            {
                Ok(caps) => caps,
                Err(_) => {
                    let fallback_url = ollama_client.fallback_url.as_deref().ok_or_else(|| {
                        anyhow::anyhow!(
                            "Failed to check model capabilities for '{}': model not found on primary server and no fallback configured",
                            model_name_for_caps
                        )
                    })?;
                    OllamaClient::check_model_capabilities(fallback_url, model_name_for_caps)
                        .await
                        .map_err(|e| {
                            anyhow::anyhow!(
                                "Failed to check model capabilities for '{}': {}",
                                model_name_for_caps,
                                e
                            )
                        })?
                }
            };
            if !capabilities.has_tool_calling {
                return Err(anyhow::anyhow!(
                    "tool calling not supported by model '{}'",
                    ollama_client.primary_model
                ));
            }
            insight_cx
                .span()
                .set_attribute(KeyValue::new("model_has_vision", capabilities.has_vision));
            insight_cx
                .span()
                .set_attribute(KeyValue::new("model_has_tool_calling", true));
            capabilities.has_vision
        };
        // 3. Fetch EXIF
        let exif = {
            let mut exif_dao = self.exif_dao.lock().expect("Unable to lock ExifDao");
@@ -4103,60 +3857,33 @@ Return ONLY the summary, nothing else."#,
            }
        };
-        // 7. Load image if vision capable.
+        // 7. Load image. Always attempted — vision-capable models get the
-        //    In hybrid mode we ALSO describe it locally now so the
+        //    base64 inline; hybrid mode describes it locally and injects text.
-        //    description can be inlined as text — the OpenRouter chat model
+        let image_base64 = match self.load_image_as_base64(&file_path) {
-        //    never receives the base64 image directly.
+            Ok(b64) => {
-        let image_base64 = if has_vision {
+                log::info!("Loaded image for agentic model");
-            match self.load_image_as_base64(&file_path) {
+                Some(b64)
-                Ok(b64) => {
+            }
-                    log::info!("Loaded image for vision-capable agentic model");
+            Err(e) => {
-                    Some(b64)
+                log::warn!("Failed to load image for agentic: {}", e);
-                }
+                None
                Err(e) => {
                    log::warn!("Failed to load image for agentic vision: {}", e);
                    None
                }
            }
        } else {
            None
        };
-        // describe-then-inline path (hybrid only). Vision describe routes
+        // Describe-then-inline (hybrid only). Vision describe routes through
-        // through whichever local backend is configured — llama-swap when
+        // the local backend so non-text work stays off OpenRouter.
-        // `local_via_llamacpp`, otherwise Ollama.
+        let inlined_visual_description: Option<String> = if !backend.images_inline {
        let inlined_visual_description: Option<String> = if describes_then_inlines {
            match image_base64.as_deref() {
-                Some(b64) => {
+                Some(b64) => match backend.local().describe_image(b64).await {
-                    let described = if local_via_llamacpp {
+                    Ok(desc) => {
-                        self.llamacpp
+                        log::info!("{}: vision describe succeeded ({} chars)", kind, desc.len());
-                            .as_ref()
+                        Some(desc)
                            .expect("local_via_llamacpp guarantees Some")
                            .describe_image(b64)
                            .await
                    } else {
                        self.ollama.describe_image(b64).await
                    };
                    match described {
                        Ok(desc) => {
                            log::info!(
                                "{}: vision describe succeeded ({} chars)",
                                backend_label,
                                desc.len()
                            );
                            Some(desc)
                        }
                        Err(e) => {
                            log::warn!(
                                "{}: vision describe failed, continuing without: {}",
                                backend_label,
                                e
                            );
                            None
                        }
                    }
-                }
+                    Err(e) => {
                        log::warn!("{}: vision describe failed, continuing without: {}", kind, e);
                        None
                    }
                },
                None => None,
            }
        } else {
@@ -4228,34 +3955,24 @@ Return ONLY the summary, nothing else."#,
            date = date_taken.format("%B %d, %Y"),
        );
-        // 10. Define tools. Gate flags computed from current data presence;
+        // 10. Define tools. describe_photo offered only when the chat model
-        //     hybrid mode omits describe_photo since the chat model receives
+        //     sees images directly (images_inline); in hybrid mode the visual
-        //     the visual description inline (so we pass `false` for
+        //     description is already inlined as text.
-        //     has_vision in that mode regardless of the model's actual
+        let gate_opts = self.current_gate_opts(backend.images_inline);
        //     capability).
        let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines);
        let tools = Self::build_tool_definitions(gate_opts);
-        // 11. Build initial messages. In describe-then-inline modes images
+        // 11. Build initial messages. images_inline → attach base64 to the
-        //     are never attached to the wire message — the description is part
+        //     user message; describe-then-inline → text was already injected.
        //     of `user_content`.
        let system_msg = ChatMessage::system(system_content);
        let mut user_msg = ChatMessage::user(user_content);
-        if !describes_then_inlines && let Some(ref img) = image_base64 {
+        if backend.images_inline {
-            user_msg.images = Some(vec![img.clone()]);
+            if let Some(ref img) = image_base64 {
                user_msg.images = Some(vec![img.clone()]);
            }
        }
        let mut messages = vec![system_msg, user_msg];
        // 12. Agentic loop — dispatch through the selected backend.
        let chat_backend: &dyn LlmClient = if let Some(ref lc_c) = llamacpp_client {
            lc_c
        } else if let Some(ref or_c) = openrouter_client {
            or_c
        } else {
            &ollama_client
        };
        let loop_span = tracer.start_with_context("ai.agentic.loop", &insight_cx);
        let loop_cx = insight_cx.with_span(loop_span);
@@ -4268,7 +3985,8 @@ Return ONLY the summary, nothing else."#,
            iterations_used = iteration + 1;
            log::info!("Agentic iteration {}/{}", iteration + 1, max_iterations);
-            let (response, prompt_tokens, eval_tokens) = chat_backend
+            let (response, prompt_tokens, eval_tokens) = backend
                .chat()
                .chat_with_tools(messages.clone(), tools.clone())
                .await?;
@@ -4308,13 +4026,11 @@ Return ONLY the summary, nothing else."#,
                        .execute_tool(
                            &tool_call.function.name,
                            &tool_call.function.arguments,
-                            &ollama_client,
+                            &backend,
                            &image_base64,
                            &file_path,
                            user_id,
                            &persona_id,
                            chat_backend.primary_model(),
                            &backend_label,
                            &loop_cx,
                        )
                        .await;
@@ -4338,7 +4054,8 @@ Return ONLY the summary, nothing else."#,
                "Based on the context gathered, please write the final photo insight: a title and a detailed personal summary. Write in first person as {}.",
                user_display_name()
            )));
-            let (final_response, prompt_tokens, eval_tokens) = chat_backend
+            let (final_response, prompt_tokens, eval_tokens) = backend
                .chat()
                .chat_with_tools(messages.clone(), vec![])
                .await?;
            last_prompt_eval_count = prompt_tokens;
@@ -4360,7 +4077,8 @@ Return ONLY the summary, nothing else."#,
        let title_system = custom_system_prompt.as_deref().unwrap_or(
            "You are my long term memory assistant. Use only the information provided. Do not invent details.",
        );
-        let title_raw = chat_backend
+        let title_raw = backend
            .chat()
            .generate(&title_prompt, Some(title_system), None)
            .await?;
        let title = title_raw.trim().trim_matches('"').to_string();
@@ -4383,7 +4101,7 @@ Return ONLY the summary, nothing else."#,
        };
        // 15. Store insight (returns the persisted row including its new id)
-        let model_version = chat_backend.primary_model().to_string();
+        let model_version = backend.model().to_string();
        let fewshot_source_ids_json = if fewshot_source_ids.is_empty() {
            None
        } else {
@@ -4398,7 +4116,7 @@ Return ONLY the summary, nothing else."#,
            model_version,
            is_current: true,
            training_messages,
-            backend: backend_label.clone(),
+            backend: kind.as_str().to_string(),
            fewshot_source_ids: fewshot_source_ids_json,
            content_hash: None,
        };
--- a/src/state.rs
+++ b/src/state.rs
@@ -290,9 +290,6 @@ impl Default for AppState {
            Arc::new(tokio::sync::Mutex::new(std::collections::HashMap::new()));
        let insight_chat = Arc::new(InsightChatService::new(
            Arc::new(insight_generator.clone()),
            ollama.clone(),
            openrouter.clone(),
            llamacpp.clone(),
            insight_dao.clone(),
            chat_locks,
        ));
@@ -470,9 +467,6 @@ impl AppState {
            Arc::new(tokio::sync::Mutex::new(std::collections::HashMap::new()));
        let insight_chat = Arc::new(InsightChatService::new(
            Arc::new(insight_generator.clone()),
            ollama.clone(),
            None,
            None,
            insight_dao.clone(),
            chat_locks,
        ));