From a8a661f70a8022f304851985575bc58193b9354d Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Sun, 24 May 2026 15:00:50 -0400 Subject: [PATCH] ai: extract ResolvedBackend, remove ~480 lines of duplicated dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace 5 copies of the ~80-line backend resolution pattern with a single InsightGenerator::resolve_backend() builder that returns a ResolvedBackend (chat + local clients, BackendKind enum, images_inline flag). Tool dispatch now takes &ResolvedBackend instead of &OllamaClient + model + backend strings. Remove duplicated ollama/openrouter/llamacpp fields from InsightChatService — InsightGenerator owns them and resolve_backend uses them. Delete build_chat_clients (replaced by resolve_backend). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ai/insight_chat.rs | 362 +++++++----------------------- src/ai/insight_generator.rs | 430 +++++++----------------------------- src/state.rs | 6 - 3 files changed, 158 insertions(+), 640 deletions(-) diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index 54350df..6d52f8b 100644 --- a/src/ai/insight_chat.rs +++ b/src/ai/insight_chat.rs @@ -6,11 +6,9 @@ use std::collections::HashMap; use std::sync::{Arc, Mutex}; use tokio::sync::Mutex as TokioMutex; +use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides}; use crate::ai::insight_generator::InsightGenerator; -use crate::ai::llm_client::{ChatMessage, LlmClient, LlmStreamEvent, Tool}; -use crate::ai::ollama::OllamaClient; -use crate::ai::llamacpp::LlamaCppClient; -use crate::ai::openrouter::OpenRouterClient; +use crate::ai::llm_client::{ChatMessage, LlmStreamEvent, Tool}; use crate::database::InsightDao; use crate::database::models::InsertPhotoInsight; use crate::otel::global_tracer; @@ -92,9 +90,6 @@ pub struct ChatTurnResult { #[derive(Clone)] pub struct InsightChatService { generator: Arc, - ollama: OllamaClient, - openrouter: Option>, - llamacpp: Option>, insight_dao: Arc>>, chat_locks: ChatLockMap, } @@ -102,17 +97,11 @@ pub struct InsightChatService { impl InsightChatService { pub fn new( generator: Arc, - ollama: OllamaClient, - openrouter: Option>, - llamacpp: Option>, insight_dao: Arc>>, chat_locks: ChatLockMap, ) -> Self { Self { generator, - ollama, - openrouter, - llamacpp, insight_dao, chat_locks, } @@ -308,16 +297,9 @@ impl InsightChatService { .filter(|s| !s.is_empty()) .unwrap_or_else(|| stored_backend.clone()); validate_cross_replay(&stored_backend, &effective_backend)?; - let is_hybrid = effective_backend == "hybrid"; - let local_via_llamacpp = - crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); - let describes_then_inlines = is_hybrid; - span.set_attribute(KeyValue::new("backend", effective_backend.clone())); + let kind = BackendKind::parse(&effective_backend)?; + span.set_attribute(KeyValue::new("backend", kind.as_str())); - // 4. Build the chat backend client. Hybrid → OpenRouter; local with - // `LLM_BACKEND=llamacpp` → llama-swap; otherwise Ollama. Clones - // so per-request sampling/model overrides don't leak into shared - // state. let max_iterations = req .max_iterations .unwrap_or(DEFAULT_MAX_ITERATIONS) @@ -325,113 +307,36 @@ impl InsightChatService { span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64)); let stored_model = insight.model_version.clone(); - let custom_model = req - .model - .clone() - .or_else(|| Some(stored_model.clone())) - .filter(|m| !m.is_empty()); - - let mut ollama_client = self.ollama.clone(); - let mut openrouter_client: Option = None; - let mut llamacpp_client: Option = None; - - if is_hybrid { - let arc = self.openrouter.as_ref().ok_or_else(|| { - anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured") - })?; - let mut c: OpenRouterClient = (**arc).clone(); - if let Some(ref m) = custom_model { - c.primary_model = m.clone(); - } - if req.temperature.is_some() - || req.top_p.is_some() - || req.top_k.is_some() - || req.min_p.is_some() - { - c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); - } - if let Some(ctx) = req.num_ctx { - c.set_num_ctx(Some(ctx)); - } - openrouter_client = Some(c); - } else if local_via_llamacpp { - let arc = self.llamacpp.as_ref().ok_or_else(|| { - anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured") - })?; - let mut c: LlamaCppClient = (**arc).clone(); - if let Some(ref m) = custom_model { - c.primary_model = m.clone(); - } - if req.temperature.is_some() - || req.top_p.is_some() - || req.top_k.is_some() - || req.min_p.is_some() - { - c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); - } - if let Some(ctx) = req.num_ctx { - c.set_num_ctx(Some(ctx)); - } - llamacpp_client = Some(c); - } else { - // Pure local (Ollama): model swap. Build a new client when the - // chat model differs from the configured one. - if let Some(ref m) = custom_model - && m != &self.ollama.primary_model - { - ollama_client = OllamaClient::new( - self.ollama.primary_url.clone(), - self.ollama.fallback_url.clone(), - m.clone(), - Some(m.clone()), - ); - } - if req.temperature.is_some() - || req.top_p.is_some() - || req.top_k.is_some() - || req.min_p.is_some() - { - ollama_client.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); - } - if let Some(ctx) = req.num_ctx { - ollama_client.set_num_ctx(Some(ctx)); - } - } - - let chat_backend: &dyn LlmClient = if let Some(ref c) = llamacpp_client { - c - } else if let Some(ref c) = openrouter_client { - c - } else { - &ollama_client + let overrides = SamplingOverrides { + model: req.model.clone() + .or_else(|| Some(stored_model.clone())) + .filter(|m| !m.is_empty()), + num_ctx: req.num_ctx, + temperature: req.temperature, + top_p: req.top_p, + top_k: req.top_k, + min_p: req.min_p, }; - let model_used = chat_backend.primary_model().to_string(); + let backend = self.generator.resolve_backend(kind, &overrides).await?; + let model_used = backend.model().to_string(); span.set_attribute(KeyValue::new("model", model_used.clone())); - // 5. Decide vision + tool set. In describe-then-inline mode - // (hybrid only) we omit `describe_photo`. In local and llamacpp - // we trust the stored history's first-user shape: if it carries - // `images`, the original model was vision-capable, and we keep - // `describe_photo` available. + // 5. Decide vision + tool set. In hybrid (describe-then-inline) mode + // we omit `describe_photo`. Otherwise trust the stored history: + // if the first user message carries images, describe_photo stays. let local_first_user_has_image = messages .iter() .find(|m| m.role == "user") .and_then(|m| m.images.as_ref()) .map(|imgs| !imgs.is_empty()) .unwrap_or(false); - let offer_describe_tool = !describes_then_inlines && local_first_user_has_image; - // current_gate_opts(has_vision) sets gate_opts.has_vision = has_vision - // and probes the per-table presence flags. Pass `offer_describe_tool` - // directly — the `!is_hybrid && local_first_user_has_image` decision - // is the chat-path's vision predicate. + let offer_describe_tool = backend.images_inline && local_first_user_has_image; let gate_opts = self.generator.current_gate_opts_for_persona( offer_describe_tool, Some((req.user_id, &active_persona)), ); let tools = InsightGenerator::build_tool_definitions(gate_opts); - // Image base64 only needed when describe_photo is on the menu. Load - // lazily to avoid disk IO when the loop never invokes it. let image_base64: Option = if offer_describe_tool { self.generator.load_image_as_base64(&normalized).ok() } else { @@ -480,13 +385,13 @@ impl InsightChatService { iterations_used = iteration + 1; log::info!("Chat iteration {}/{}", iterations_used, max_iterations); - let (response, prompt_tokens, eval_tokens) = chat_backend + let (response, prompt_tokens, eval_tokens) = backend + .chat() .chat_with_tools(messages.clone(), tools.clone()) .await?; last_prompt_eval_count = prompt_tokens; last_eval_count = eval_tokens; - // Ollama rejects non-object tool-call arguments on replay. let mut response = response; if let Some(ref mut tcs) = response.tool_calls { for tc in tcs.iter_mut() { @@ -514,13 +419,11 @@ impl InsightChatService { .execute_tool( &tool_call.function.name, &tool_call.function.arguments, - &ollama_client, + &backend, &image_base64, &normalized, req.user_id, &active_persona, - &model_used, - &effective_backend, &loop_cx, ) .await; @@ -534,8 +437,6 @@ impl InsightChatService { } if final_content.is_empty() { - // The model never produced a final answer; ask once more without - // tools to force a textual reply. log::info!( "Chat loop exhausted after {} iterations, requesting final answer", iterations_used @@ -543,7 +444,8 @@ impl InsightChatService { messages.push(ChatMessage::user( "Please write your final answer now without calling any more tools.", )); - let (final_response, prompt_tokens, eval_tokens) = chat_backend + let (final_response, prompt_tokens, eval_tokens) = backend + .chat() .chat_with_tools(messages.clone(), vec![]) .await?; last_prompt_eval_count = prompt_tokens; @@ -579,7 +481,8 @@ impl InsightChatService { Capture the key moment or theme. Return ONLY the title, nothing else.", final_content ); - let title_raw = chat_backend + let title_raw = backend + .chat() .generate( &title_prompt, Some( @@ -604,7 +507,7 @@ impl InsightChatService { model_version: model_used.clone(), is_current: true, training_messages: Some(json), - backend: effective_backend.clone(), + backend: kind.as_str().to_string(), fewshot_source_ids: None, content_hash: None, }; @@ -629,7 +532,7 @@ impl InsightChatService { prompt_eval_count: last_prompt_eval_count, eval_count: last_eval_count, amended_insight_id, - backend_used: effective_backend, + backend_used: kind.as_str().to_string(), model_used, }) } @@ -818,9 +721,8 @@ impl InsightChatService { .map(|s| s.trim().to_lowercase()) .filter(|s| !s.is_empty()) .unwrap_or_else(|| stored_backend.clone()); - validate_cross_replay(&stored_backend, &effective_backend)?; - let is_hybrid = effective_backend == "hybrid"; - let describes_then_inlines = is_hybrid; + let kind = BackendKind::parse(&effective_backend)?; + validate_cross_replay(&stored_backend, kind.as_str())?; let max_iterations = req .max_iterations @@ -828,18 +730,20 @@ impl InsightChatService { .clamp(1, env_max_iterations()); let stored_model = insight.model_version.clone(); - let custom_model = req - .model - .clone() - .or_else(|| Some(stored_model.clone())) - .filter(|m| !m.is_empty()); + let overrides = SamplingOverrides { + model: req.model.clone() + .or_else(|| Some(stored_model.clone())) + .filter(|m| !m.is_empty()), + num_ctx: req.num_ctx, + temperature: req.temperature, + top_p: req.top_p, + top_k: req.top_k, + min_p: req.min_p, + }; + let backend = self.generator.resolve_backend(kind, &overrides).await?; + let model_used = backend.model().to_string(); - let (chat_backend_holder, ollama_client) = - self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?; - let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref(); - let model_used = chat_backend.primary_model().to_string(); - - // Tool set — local/llamacpp mode + first user turn carries an image → + // Tool set — images_inline mode + first user turn carries an image → // offer describe_photo. Describe-then-inline mode (hybrid only): // visual description was inlined at bootstrap, no describe tool needed. let local_first_user_has_image = messages @@ -848,7 +752,7 @@ impl InsightChatService { .and_then(|m| m.images.as_ref()) .map(|imgs| !imgs.is_empty()) .unwrap_or(false); - let offer_describe_tool = !describes_then_inlines && local_first_user_has_image; + let offer_describe_tool = backend.images_inline && local_first_user_has_image; let gate_opts = self.generator.current_gate_opts_for_persona( offer_describe_tool, Some((req.user_id, &active_persona)), @@ -879,16 +783,13 @@ impl InsightChatService { let outcome = self .run_streaming_agentic_loop( - chat_backend, - &ollama_client, + &backend, &mut messages, tools, &image_base64, &normalized, req.user_id, &active_persona, - &model_used, - &effective_backend, max_iterations, &tx, ) @@ -916,7 +817,7 @@ impl InsightChatService { let mut amended_insight_id: Option = None; if req.amend { - let title = self.generate_title(chat_backend, &final_content).await?; + let title = self.generate_title(&backend, &final_content).await?; // Amended rows intentionally do not inherit the parent's // `fewshot_source_ids`. The parent's few-shot influence is still @@ -932,7 +833,7 @@ impl InsightChatService { model_version: model_used.clone(), is_current: true, training_messages: Some(json), - backend: effective_backend.clone(), + backend: kind.as_str().to_string(), fewshot_source_ids: None, content_hash: None, }; @@ -958,7 +859,7 @@ impl InsightChatService { eval_tokens: last_eval_count, num_ctx: req.num_ctx, amended_insight_id, - backend_used: effective_backend, + backend_used: kind.as_str().to_string(), model_used, }) .await; @@ -984,21 +885,23 @@ impl InsightChatService { .filter(|s| !s.trim().is_empty()) .unwrap_or_else(|| "default".to_string()); let effective_backend = resolve_bootstrap_backend(req.backend.as_deref())?; - let is_hybrid = effective_backend == "hybrid"; - let local_via_llamacpp = - crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); - let describes_then_inlines = is_hybrid; + let kind = BackendKind::parse(&effective_backend)?; let max_iterations = req .max_iterations .unwrap_or(DEFAULT_MAX_ITERATIONS) .clamp(1, env_max_iterations()); - let custom_model = req.model.clone().filter(|m| !m.is_empty()); - let (chat_backend_holder, ollama_client) = - self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?; - let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref(); - let model_used = chat_backend.primary_model().to_string(); + let overrides = SamplingOverrides { + model: req.model.clone().filter(|m| !m.is_empty()), + num_ctx: req.num_ctx, + temperature: req.temperature, + top_p: req.top_p, + top_k: req.top_k, + min_p: req.min_p, + }; + let backend = self.generator.resolve_backend(kind, &overrides).await?; + let model_used = backend.model().to_string(); // Load image bytes once. RAW preview fallback is handled inside // load_image_as_base64. Errors degrade silently — a chat that @@ -1020,26 +923,17 @@ impl InsightChatService { }); // Describe-then-inline (hybrid only): pre-describe the image so a - // text-only chat model gets the visual description inline. llamacpp - // sends images directly to the chat model. - let visual_block = if describes_then_inlines { + // text-only chat model gets the visual description inline. + // images_inline backends send images directly to the chat model. + let visual_block = if !backend.images_inline { match image_base64.as_deref() { Some(b64) => { - let described = if local_via_llamacpp { - self.llamacpp - .as_ref() - .expect("local_via_llamacpp guarantees Some") - .describe_image(b64) - .await - } else { - self.ollama.describe_image(b64).await - }; - match described { + match backend.local().describe_image(b64).await { Ok(desc) => { format!("Visual description (from local vision model):\n{}\n", desc) } Err(e) => { - log::warn!("{} bootstrap: describe_image failed: {}", effective_backend, e); + log::warn!("{} bootstrap: describe_image failed: {}", kind.as_str(), e); String::new() } } @@ -1050,10 +944,10 @@ impl InsightChatService { String::new() }; - // Tool gates. Local + image present → expose describe_photo so - // the chat model can re-look at the photo on demand. Hybrid: + // Tool gates. images_inline + image present → expose describe_photo so + // the chat model can re-look at the photo on demand. Non-inline: // already inlined, no tool needed. - let offer_describe_tool = !describes_then_inlines && image_base64.is_some(); + let offer_describe_tool = backend.images_inline && image_base64.is_some(); let gate_opts = self.generator.current_gate_opts_for_persona( offer_describe_tool, Some((req.user_id, &active_persona)), @@ -1079,23 +973,22 @@ impl InsightChatService { ); let system_msg = ChatMessage::system(system_content); let mut user_msg = ChatMessage::user(req.user_message.clone()); - if !describes_then_inlines && let Some(ref img) = image_base64 { - user_msg.images = Some(vec![img.clone()]); + if backend.images_inline { + if let Some(ref img) = image_base64 { + user_msg.images = Some(vec![img.clone()]); + } } let mut messages = vec![system_msg, user_msg]; let outcome = self .run_streaming_agentic_loop( - chat_backend, - &ollama_client, + &backend, &mut messages, tools, &image_base64, &normalized, req.user_id, &active_persona, - &model_used, - &effective_backend, max_iterations, &tx, ) @@ -1108,7 +1001,7 @@ impl InsightChatService { final_content, } = outcome; - let title = self.generate_title(chat_backend, &final_content).await?; + let title = self.generate_title(&backend, &final_content).await?; let json = serde_json::to_string(&messages) .map_err(|e| anyhow!("failed to serialize chat history: {}", e))?; @@ -1121,7 +1014,7 @@ impl InsightChatService { model_version: model_used.clone(), is_current: true, training_messages: Some(json), - backend: effective_backend.clone(), + backend: kind.as_str().to_string(), fewshot_source_ids: None, content_hash: None, }; @@ -1144,7 +1037,7 @@ impl InsightChatService { eval_tokens: last_eval_count, num_ctx: req.num_ctx, amended_insight_id: Some(stored.id), - backend_used: effective_backend, + backend_used: kind.as_str().to_string(), model_used, }) .await; @@ -1152,95 +1045,12 @@ impl InsightChatService { Ok(()) } - /// Set up chat clients (Ollama + optional OpenRouter / LlamaCpp) shared - /// by bootstrap and continuation. Returns the chat-side backend client - /// (boxed because each backend has a different concrete type) and the - /// Ollama client used for describe-image / local tool calls. - /// - /// `effective_backend` must be one of `"local"` or `"hybrid"` (validated - /// upstream). Hybrid → OpenRouter; local with `LLM_BACKEND=llamacpp` → - /// llama-swap; pure local → Ollama. Returns the dispatched chat client - /// plus the (possibly per-request) Ollama client that the caller uses - /// for non-chat helpers (image describe in non-llamacpp mode, tool ops). - fn build_chat_clients( - &self, - effective_backend: &str, - custom_model: Option<&str>, - req: &ChatTurnRequest, - ) -> Result<(Box, OllamaClient)> { - let mut ollama_client = self.ollama.clone(); - - if effective_backend == "hybrid" { - let arc = self.openrouter.as_ref().ok_or_else(|| { - anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured") - })?; - let mut c: OpenRouterClient = (**arc).clone(); - if let Some(m) = custom_model { - c.primary_model = m.to_string(); - } - if req.temperature.is_some() - || req.top_p.is_some() - || req.top_k.is_some() - || req.min_p.is_some() - { - c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); - } - if let Some(ctx) = req.num_ctx { - c.set_num_ctx(Some(ctx)); - } - return Ok((Box::new(c), ollama_client)); - } - - // Local mode — env switch decides between Ollama and llama-swap. - if crate::ai::local_backend_is_llamacpp() - && let Some(arc) = self.llamacpp.as_ref() - { - let mut c: LlamaCppClient = (**arc).clone(); - if let Some(m) = custom_model { - c.primary_model = m.to_string(); - } - if req.temperature.is_some() - || req.top_p.is_some() - || req.top_k.is_some() - || req.min_p.is_some() - { - c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); - } - if let Some(ctx) = req.num_ctx { - c.set_num_ctx(Some(ctx)); - } - return Ok((Box::new(c), ollama_client)); - } - - if let Some(m) = custom_model - && m != self.ollama.primary_model - { - ollama_client = OllamaClient::new( - self.ollama.primary_url.clone(), - self.ollama.fallback_url.clone(), - m.to_string(), - Some(m.to_string()), - ); - } - if req.temperature.is_some() - || req.top_p.is_some() - || req.top_k.is_some() - || req.min_p.is_some() - { - ollama_client.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); - } - if let Some(ctx) = req.num_ctx { - ollama_client.set_num_ctx(Some(ctx)); - } - Ok((Box::new(ollama_client.clone()), ollama_client)) - } - /// Generate a short title via the same chat backend so voice stays /// consistent with the body. Mirrors generate_agentic_insight_for_photo's /// titling pass. async fn generate_title( &self, - chat_backend: &dyn LlmClient, + backend: &ResolvedBackend, final_content: &str, ) -> Result { let title_prompt = format!( @@ -1248,7 +1058,8 @@ impl InsightChatService { Capture the key moment or theme. Return ONLY the title, nothing else.", final_content ); - let title_raw = chat_backend + let title_raw = backend + .chat() .generate( &title_prompt, Some( @@ -1266,18 +1077,13 @@ impl InsightChatService { /// final assistant content. async fn run_streaming_agentic_loop( &self, - chat_backend: &dyn LlmClient, - ollama_client: &OllamaClient, + backend: &ResolvedBackend, messages: &mut Vec, tools: Vec, image_base64: &Option, normalized: &str, user_id: i32, active_persona: &str, - // Provenance — stamped onto any store_fact tool call made - // during this loop. Mirrors the non-streaming chat path. - model_used: &str, - effective_backend: &str, max_iterations: usize, tx: &tokio::sync::mpsc::Sender, ) -> Result { @@ -1296,7 +1102,8 @@ impl InsightChatService { }) .await; - let mut stream = chat_backend + let mut stream = backend + .chat() .chat_with_tools_stream(messages.clone(), tools.clone()) .await?; @@ -1353,13 +1160,11 @@ impl InsightChatService { .execute_tool( &tool_call.function.name, &tool_call.function.arguments, - ollama_client, + backend, image_base64, normalized, user_id, active_persona, - model_used, - effective_backend, &cx, ) .await; @@ -1394,7 +1199,8 @@ impl InsightChatService { messages.push(ChatMessage::user( "Please write your final answer now without calling any more tools.", )); - let mut stream = chat_backend + let mut stream = backend + .chat() .chat_with_tools_stream(messages.clone(), vec![]) .await?; let mut final_message: Option = None; diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index 075001a..f95c6dc 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -1594,29 +1594,24 @@ Return ONLY the summary, nothing else."#, &self, tool_name: &str, arguments: &serde_json::Value, - ollama: &OllamaClient, + backend: &ResolvedBackend, image_base64: &Option, file_path: &str, user_id: i32, persona_id: &str, - // Provenance — written into entity_facts.created_by_* when - // the loop calls store_fact. The caller knows the actual - // chat-runtime model and backend (which may differ from - // ollama.primary_model in hybrid mode where chat lives on - // OpenRouter while Ollama still handles vision). - model: &str, - backend: &str, cx: &opentelemetry::Context, ) -> String { + let model = backend.model(); + let backend_label = backend.kind.as_str(); let result = match tool_name { - "search_rag" => self.tool_search_rag(arguments, ollama, cx).await, + "search_rag" => self.tool_search_rag(arguments, backend.local(), cx).await, "search_messages" => self.tool_search_messages(arguments, cx).await, "get_sms_messages" => self.tool_get_sms_messages(arguments, cx).await, "get_calendar_events" => self.tool_get_calendar_events(arguments, cx).await, "get_location_history" => self.tool_get_location_history(arguments, cx).await, "get_file_tags" => self.tool_get_file_tags(arguments, cx).await, "get_faces_in_photo" => self.tool_get_faces_in_photo(arguments, cx).await, - "describe_photo" => self.tool_describe_photo(ollama, image_base64).await, + "describe_photo" => self.tool_describe_photo(backend.local(), image_base64).await, "reverse_geocode" => self.tool_reverse_geocode(arguments).await, "get_personal_place_at" => self.tool_get_personal_place_at(arguments).await, "recall_entities" => self.tool_recall_entities(arguments, cx).await, @@ -1624,19 +1619,19 @@ Return ONLY the summary, nothing else."#, self.tool_recall_facts_for_photo(arguments, user_id, persona_id, cx) .await } - "store_entity" => self.tool_store_entity(arguments, ollama, cx).await, + "store_entity" => self.tool_store_entity(arguments, cx).await, "store_fact" => { self.tool_store_fact( - arguments, file_path, user_id, persona_id, model, backend, cx, + arguments, file_path, user_id, persona_id, model, backend_label, cx, ) .await } "update_fact" => { - self.tool_update_fact(arguments, user_id, persona_id, model, backend, cx) + self.tool_update_fact(arguments, user_id, persona_id, model, backend_label, cx) .await } "supersede_fact" => { - self.tool_supersede_fact(arguments, user_id, persona_id, model, backend, cx) + self.tool_supersede_fact(arguments, user_id, persona_id, model, backend_label, cx) .await } "get_current_datetime" => Self::tool_get_current_datetime(), @@ -1654,7 +1649,7 @@ Return ONLY the summary, nothing else."#, async fn tool_search_rag( &self, args: &serde_json::Value, - ollama: &OllamaClient, + local: &dyn LlmClient, _cx: &opentelemetry::Context, ) -> String { let query = match args.get("query").and_then(|v| v.as_str()) { @@ -1718,7 +1713,7 @@ Return ONLY the summary, nothing else."#, }; let final_results = if rerank_enabled && results.len() > limit { - match self.rerank_with_llm(&query, &results, limit, ollama).await { + match self.rerank_with_llm(&query, &results, limit, local).await { Ok(reordered) => reordered, Err(e) => { log::warn!("rerank failed, using vector order: {}", e); @@ -1744,7 +1739,7 @@ Return ONLY the summary, nothing else."#, query: &str, candidates: &[String], limit: usize, - ollama: &OllamaClient, + local: &dyn LlmClient, ) -> Result> { let query_preview: String = query.chars().take(60).collect(); log::info!( @@ -1785,15 +1780,7 @@ Return ONLY the summary, nothing else."#, let system = Some( "You are a terse relevance ranker. You output only numbers separated by commas.", ); - let response = if crate::ai::local_backend_is_llamacpp() { - if let Some(ref lc) = self.llamacpp { - lc.generate(&prompt, system, None).await? - } else { - ollama.generate_no_think(&prompt, system).await? - } - } else { - ollama.generate_no_think(&prompt, system).await? - }; + let response = local.generate(&prompt, system, None).await?; log::info!( "rerank: finished in {} ms (prompt={} chars)", started.elapsed().as_millis(), @@ -2365,31 +2352,17 @@ Return ONLY the summary, nothing else."#, out } - /// Tool: describe_photo — generate a visual description of the photo. - /// Routes through llama-swap when `LLM_BACKEND=llamacpp`, Ollama otherwise. async fn tool_describe_photo( &self, - ollama: &OllamaClient, + local: &dyn LlmClient, image_base64: &Option, ) -> String { log::info!("tool_describe_photo: generating visual description"); - match image_base64 { - Some(img) => { - let result = if crate::ai::local_backend_is_llamacpp() { - if let Some(ref lc) = self.llamacpp { - lc.describe_image(img).await - } else { - ollama.generate_photo_description(img).await - } - } else { - ollama.generate_photo_description(img).await - }; - match result { - Ok(desc) => desc, - Err(e) => format!("Error describing photo: {}", e), - } - } + Some(img) => match local.describe_image(img).await { + Ok(desc) => desc, + Err(e) => format!("Error describing photo: {}", e), + }, None => "No image available for description.".to_string(), } } @@ -2635,7 +2608,6 @@ Return ONLY the summary, nothing else."#, async fn tool_store_entity( &self, args: &serde_json::Value, - _ollama: &OllamaClient, cx: &opentelemetry::Context, ) -> String { use crate::database::models::InsertEntity; @@ -3775,243 +3747,25 @@ Return ONLY the summary, nothing else."#, span.set_attribute(KeyValue::new("file_path", file_path.clone())); span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64)); - // 1a. Resolve backend label (defaults to "local"). - let backend_label = backend - .as_deref() - .map(|s| s.trim().to_lowercase()) - .filter(|s| !s.is_empty()) - .unwrap_or_else(|| "local".to_string()); - if !matches!(backend_label.as_str(), "local" | "hybrid") { - return Err(anyhow::anyhow!( - "unknown backend '{}'; expected 'local' or 'hybrid'", - backend_label - )); - } - span.set_attribute(KeyValue::new("backend", backend_label.clone())); - let is_hybrid = backend_label == "hybrid"; - // `LLM_BACKEND=llamacpp` swaps Ollama out for llama-swap as the - // "local" stack — chat + embeddings route through llama-swap. - // llamacpp models receive images directly (vision-capable); only - // hybrid mode (OpenRouter chat) uses describe-then-inline. - let local_via_llamacpp = - crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); - let describes_then_inlines = is_hybrid; - let ollama_is_chat = !is_hybrid && !local_via_llamacpp; - - // 1b. Always build an Ollama client. In local mode it owns the chat - // loop; in hybrid/llamacpp mode it still handles tool-local calls - // (e.g. future embedding-backed tools). The chat backend is - // selected separately below. - // Sampling overrides only apply when Ollama is the chat backend. - let apply_sampling_to_ollama = ollama_is_chat; - let mut ollama_client = if let Some(ref model) = custom_model - && ollama_is_chat - { - log::info!("Using custom model for agentic: {}", model); - span.set_attribute(KeyValue::new("custom_model", model.clone())); - OllamaClient::new( - self.ollama.primary_url.clone(), - self.ollama.fallback_url.clone(), - model.clone(), - Some(model.clone()), - ) - } else { - if ollama_is_chat { - span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone())); - } - self.ollama.clone() - }; - - if apply_sampling_to_ollama { - if let Some(ctx) = num_ctx { - log::info!("Using custom context size: {}", ctx); - span.set_attribute(KeyValue::new("num_ctx", ctx as i64)); - ollama_client.set_num_ctx(Some(ctx)); - } - - if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() { - log::info!( - "Using sampling params — temperature: {:?}, top_p: {:?}, top_k: {:?}, min_p: {:?}", - temperature, - top_p, - top_k, - min_p - ); - if let Some(t) = temperature { - span.set_attribute(KeyValue::new("temperature", t as f64)); - } - if let Some(p) = top_p { - span.set_attribute(KeyValue::new("top_p", p as f64)); - } - if let Some(k) = top_k { - span.set_attribute(KeyValue::new("top_k", k as i64)); - } - if let Some(m) = min_p { - span.set_attribute(KeyValue::new("min_p", m as f64)); - } - ollama_client.set_sampling_params(temperature, top_p, top_k, min_p); - } - } - - // 1c. In hybrid mode, clone the configured OpenRouter client and - // apply per-request overrides. - let openrouter_client: Option = if is_hybrid { - let arc = self.openrouter.as_ref().ok_or_else(|| { - anyhow::anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured") - })?; - let mut c: OpenRouterClient = (**arc).clone(); - if let Some(ref m) = custom_model { - c.primary_model = m.clone(); - span.set_attribute(KeyValue::new("custom_model", m.clone())); - } - span.set_attribute(KeyValue::new("openrouter_model", c.primary_model.clone())); - if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() { - if let Some(t) = temperature { - span.set_attribute(KeyValue::new("temperature", t as f64)); - } - if let Some(p) = top_p { - span.set_attribute(KeyValue::new("top_p", p as f64)); - } - if let Some(k) = top_k { - span.set_attribute(KeyValue::new("top_k", k as i64)); - } - if let Some(m) = min_p { - span.set_attribute(KeyValue::new("min_p", m as f64)); - } - c.set_sampling_params(temperature, top_p, top_k, min_p); - } - if let Some(ctx) = num_ctx { - span.set_attribute(KeyValue::new("num_ctx", ctx as i64)); - c.set_num_ctx(Some(ctx)); - } - Some(c) - } else { - None - }; - - // 1d. When `LLM_BACKEND=llamacpp` and we're in local mode (not - // hybrid — hybrid keeps chat on OpenRouter), clone the llamacpp - // client and apply per-request overrides. Same shape as the - // openrouter branch above; describe_image will route through - // the vision slot configured on the client. - let llamacpp_client: Option = if local_via_llamacpp && !is_hybrid { - let arc = self.llamacpp.as_ref().ok_or_else(|| { - anyhow::anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured") - })?; - let mut c: LlamaCppClient = (**arc).clone(); - if let Some(ref m) = custom_model { - c.primary_model = m.clone(); - span.set_attribute(KeyValue::new("custom_model", m.clone())); - } - span.set_attribute(KeyValue::new("llamacpp_model", c.primary_model.clone())); - if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() { - if let Some(t) = temperature { - span.set_attribute(KeyValue::new("temperature", t as f64)); - } - if let Some(p) = top_p { - span.set_attribute(KeyValue::new("top_p", p as f64)); - } - if let Some(k) = top_k { - span.set_attribute(KeyValue::new("top_k", k as i64)); - } - if let Some(m) = min_p { - span.set_attribute(KeyValue::new("min_p", m as f64)); - } - c.set_sampling_params(temperature, top_p, top_k, min_p); - } - if let Some(ctx) = num_ctx { - span.set_attribute(KeyValue::new("num_ctx", ctx as i64)); - c.set_num_ctx(Some(ctx)); - } - Some(c) - } else { - None + // 1. Resolve backend + build clients. + let kind = BackendKind::parse( + backend.as_deref().unwrap_or("local"), + )?; + span.set_attribute(KeyValue::new("backend", kind.as_str())); + let overrides = SamplingOverrides { + model: custom_model, + num_ctx, + temperature, + top_p, + top_k, + min_p, }; + let backend = self.resolve_backend(kind, &overrides).await?; + span.set_attribute(KeyValue::new("model", backend.model().to_string())); + span.set_attribute(KeyValue::new("images_inline", backend.images_inline)); let insight_cx = current_cx.with_span(span); - // 2. Verify chat model supports tool calling. - // - local: existing Ollama model availability + capability check. - // - hybrid: trust the operator's curated allowlist - // (OPENROUTER_ALLOWED_MODELS) — no live precheck. A bad model id - // surfaces as a chat-call error on the next step. - let has_vision = if describes_then_inlines { - // Hybrid: chat model never sees images — describe-then-inject. - true - } else if local_via_llamacpp { - // llama-swap models receive images directly via OpenAI content - // parts. Capability probing isn't available (no `/api/show`), - // so assume vision support; a misconfigured model surfaces as - // a chat-call error. - true - } else { - if let Some(ref model_name) = custom_model { - let available_on_primary = - OllamaClient::is_model_available(&ollama_client.primary_url, model_name) - .await - .unwrap_or(false); - - let available_on_fallback = - if let Some(ref fallback_url) = ollama_client.fallback_url { - OllamaClient::is_model_available(fallback_url, model_name) - .await - .unwrap_or(false) - } else { - false - }; - - if !available_on_primary && !available_on_fallback { - anyhow::bail!( - "model not available: '{}' not found on any configured server", - model_name - ); - } - } - - let model_name_for_caps = &ollama_client.primary_model; - let capabilities = match OllamaClient::check_model_capabilities( - &ollama_client.primary_url, - model_name_for_caps, - ) - .await - { - Ok(caps) => caps, - Err(_) => { - let fallback_url = ollama_client.fallback_url.as_deref().ok_or_else(|| { - anyhow::anyhow!( - "Failed to check model capabilities for '{}': model not found on primary server and no fallback configured", - model_name_for_caps - ) - })?; - OllamaClient::check_model_capabilities(fallback_url, model_name_for_caps) - .await - .map_err(|e| { - anyhow::anyhow!( - "Failed to check model capabilities for '{}': {}", - model_name_for_caps, - e - ) - })? - } - }; - - if !capabilities.has_tool_calling { - return Err(anyhow::anyhow!( - "tool calling not supported by model '{}'", - ollama_client.primary_model - )); - } - - insight_cx - .span() - .set_attribute(KeyValue::new("model_has_vision", capabilities.has_vision)); - insight_cx - .span() - .set_attribute(KeyValue::new("model_has_tool_calling", true)); - - capabilities.has_vision - }; - // 3. Fetch EXIF let exif = { let mut exif_dao = self.exif_dao.lock().expect("Unable to lock ExifDao"); @@ -4103,60 +3857,33 @@ Return ONLY the summary, nothing else."#, } }; - // 7. Load image if vision capable. - // In hybrid mode we ALSO describe it locally now so the - // description can be inlined as text — the OpenRouter chat model - // never receives the base64 image directly. - let image_base64 = if has_vision { - match self.load_image_as_base64(&file_path) { - Ok(b64) => { - log::info!("Loaded image for vision-capable agentic model"); - Some(b64) - } - Err(e) => { - log::warn!("Failed to load image for agentic vision: {}", e); - None - } + // 7. Load image. Always attempted — vision-capable models get the + // base64 inline; hybrid mode describes it locally and injects text. + let image_base64 = match self.load_image_as_base64(&file_path) { + Ok(b64) => { + log::info!("Loaded image for agentic model"); + Some(b64) + } + Err(e) => { + log::warn!("Failed to load image for agentic: {}", e); + None } - } else { - None }; - // describe-then-inline path (hybrid only). Vision describe routes - // through whichever local backend is configured — llama-swap when - // `local_via_llamacpp`, otherwise Ollama. - let inlined_visual_description: Option = if describes_then_inlines { + // Describe-then-inline (hybrid only). Vision describe routes through + // the local backend so non-text work stays off OpenRouter. + let inlined_visual_description: Option = if !backend.images_inline { match image_base64.as_deref() { - Some(b64) => { - let described = if local_via_llamacpp { - self.llamacpp - .as_ref() - .expect("local_via_llamacpp guarantees Some") - .describe_image(b64) - .await - } else { - self.ollama.describe_image(b64).await - }; - - match described { - Ok(desc) => { - log::info!( - "{}: vision describe succeeded ({} chars)", - backend_label, - desc.len() - ); - Some(desc) - } - Err(e) => { - log::warn!( - "{}: vision describe failed, continuing without: {}", - backend_label, - e - ); - None - } + Some(b64) => match backend.local().describe_image(b64).await { + Ok(desc) => { + log::info!("{}: vision describe succeeded ({} chars)", kind, desc.len()); + Some(desc) } - } + Err(e) => { + log::warn!("{}: vision describe failed, continuing without: {}", kind, e); + None + } + }, None => None, } } else { @@ -4228,34 +3955,24 @@ Return ONLY the summary, nothing else."#, date = date_taken.format("%B %d, %Y"), ); - // 10. Define tools. Gate flags computed from current data presence; - // hybrid mode omits describe_photo since the chat model receives - // the visual description inline (so we pass `false` for - // has_vision in that mode regardless of the model's actual - // capability). - let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines); + // 10. Define tools. describe_photo offered only when the chat model + // sees images directly (images_inline); in hybrid mode the visual + // description is already inlined as text. + let gate_opts = self.current_gate_opts(backend.images_inline); let tools = Self::build_tool_definitions(gate_opts); - // 11. Build initial messages. In describe-then-inline modes images - // are never attached to the wire message — the description is part - // of `user_content`. + // 11. Build initial messages. images_inline → attach base64 to the + // user message; describe-then-inline → text was already injected. let system_msg = ChatMessage::system(system_content); let mut user_msg = ChatMessage::user(user_content); - if !describes_then_inlines && let Some(ref img) = image_base64 { - user_msg.images = Some(vec![img.clone()]); + if backend.images_inline { + if let Some(ref img) = image_base64 { + user_msg.images = Some(vec![img.clone()]); + } } let mut messages = vec![system_msg, user_msg]; - // 12. Agentic loop — dispatch through the selected backend. - let chat_backend: &dyn LlmClient = if let Some(ref lc_c) = llamacpp_client { - lc_c - } else if let Some(ref or_c) = openrouter_client { - or_c - } else { - &ollama_client - }; - let loop_span = tracer.start_with_context("ai.agentic.loop", &insight_cx); let loop_cx = insight_cx.with_span(loop_span); @@ -4268,7 +3985,8 @@ Return ONLY the summary, nothing else."#, iterations_used = iteration + 1; log::info!("Agentic iteration {}/{}", iteration + 1, max_iterations); - let (response, prompt_tokens, eval_tokens) = chat_backend + let (response, prompt_tokens, eval_tokens) = backend + .chat() .chat_with_tools(messages.clone(), tools.clone()) .await?; @@ -4308,13 +4026,11 @@ Return ONLY the summary, nothing else."#, .execute_tool( &tool_call.function.name, &tool_call.function.arguments, - &ollama_client, + &backend, &image_base64, &file_path, user_id, &persona_id, - chat_backend.primary_model(), - &backend_label, &loop_cx, ) .await; @@ -4338,7 +4054,8 @@ Return ONLY the summary, nothing else."#, "Based on the context gathered, please write the final photo insight: a title and a detailed personal summary. Write in first person as {}.", user_display_name() ))); - let (final_response, prompt_tokens, eval_tokens) = chat_backend + let (final_response, prompt_tokens, eval_tokens) = backend + .chat() .chat_with_tools(messages.clone(), vec![]) .await?; last_prompt_eval_count = prompt_tokens; @@ -4360,7 +4077,8 @@ Return ONLY the summary, nothing else."#, let title_system = custom_system_prompt.as_deref().unwrap_or( "You are my long term memory assistant. Use only the information provided. Do not invent details.", ); - let title_raw = chat_backend + let title_raw = backend + .chat() .generate(&title_prompt, Some(title_system), None) .await?; let title = title_raw.trim().trim_matches('"').to_string(); @@ -4383,7 +4101,7 @@ Return ONLY the summary, nothing else."#, }; // 15. Store insight (returns the persisted row including its new id) - let model_version = chat_backend.primary_model().to_string(); + let model_version = backend.model().to_string(); let fewshot_source_ids_json = if fewshot_source_ids.is_empty() { None } else { @@ -4398,7 +4116,7 @@ Return ONLY the summary, nothing else."#, model_version, is_current: true, training_messages, - backend: backend_label.clone(), + backend: kind.as_str().to_string(), fewshot_source_ids: fewshot_source_ids_json, content_hash: None, }; diff --git a/src/state.rs b/src/state.rs index c4f810a..8cfccbb 100644 --- a/src/state.rs +++ b/src/state.rs @@ -290,9 +290,6 @@ impl Default for AppState { Arc::new(tokio::sync::Mutex::new(std::collections::HashMap::new())); let insight_chat = Arc::new(InsightChatService::new( Arc::new(insight_generator.clone()), - ollama.clone(), - openrouter.clone(), - llamacpp.clone(), insight_dao.clone(), chat_locks, )); @@ -470,9 +467,6 @@ impl AppState { Arc::new(tokio::sync::Mutex::new(std::collections::HashMap::new())); let insight_chat = Arc::new(InsightChatService::new( Arc::new(insight_generator.clone()), - ollama.clone(), - None, - None, insight_dao.clone(), chat_locks, ));