From b03ee6034237e99248c8f4f4d7f5cb91b972ef2e Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Tue, 26 May 2026 09:55:16 -0400 Subject: [PATCH] fix: prevent hybrid mode from leaking OpenRouter model to local llamacpp client When backend=hybrid with LLM_BACKEND=llamacpp, the user-selected model (an OpenRouter id like "google/gemini-3-flash-preview") was being applied to the local LlamaCppClient's primary_model and vision_model. This caused describe_image to send the OpenRouter model name to llama-swap, which returned 400 because it has no such slot. Guard the local-client model override with !is_hybrid so it only applies in local-only mode (where the user is selecting a different local model). Bump to v1.2.0. Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 2 +- Cargo.toml | 2 +- src/ai/backend.rs | 38 +++++++--- src/ai/insight_chat.rs | 37 +++++----- src/ai/insight_generator.rs | 138 ++++++++++++++++++++++++------------ src/ai/llamacpp.rs | 32 +++++++-- src/ai/mod.rs | 2 +- 7 files changed, 172 insertions(+), 79 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 3f6d0c2..5d3e4ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2051,7 +2051,7 @@ dependencies = [ [[package]] name = "image-api" -version = "1.1.0" +version = "1.2.0" dependencies = [ "actix", "actix-cors", diff --git a/Cargo.toml b/Cargo.toml index 7713970..7324001 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "image-api" -version = "1.1.0" +version = "1.2.0" authors = ["Cameron Cordes "] edition = "2024" diff --git a/src/ai/backend.rs b/src/ai/backend.rs index 0fe7747..0515f1c 100644 --- a/src/ai/backend.rs +++ b/src/ai/backend.rs @@ -13,7 +13,10 @@ impl BackendKind { match s.trim().to_lowercase().as_str() { "local" | "" => Ok(Self::Local), "hybrid" => Ok(Self::Hybrid), - other => Err(anyhow!("unknown backend '{}'; expected 'local' or 'hybrid'", other)), + other => Err(anyhow!( + "unknown backend '{}'; expected 'local' or 'hybrid'", + other + )), } } @@ -65,7 +68,12 @@ impl ResolvedBackend { kind: BackendKind, images_inline: bool, ) -> Self { - Self { chat, local, kind, images_inline } + Self { + chat, + local, + kind, + images_inline, + } } pub fn chat(&self) -> &dyn LlmClient { @@ -97,21 +105,35 @@ mod tests { #[test] fn backend_kind_as_str_roundtrips() { - assert_eq!(BackendKind::parse(BackendKind::Local.as_str()).unwrap(), BackendKind::Local); - assert_eq!(BackendKind::parse(BackendKind::Hybrid.as_str()).unwrap(), BackendKind::Hybrid); + assert_eq!( + BackendKind::parse(BackendKind::Local.as_str()).unwrap(), + BackendKind::Local + ); + assert_eq!( + BackendKind::parse(BackendKind::Hybrid.as_str()).unwrap(), + BackendKind::Hybrid + ); } #[test] fn sampling_overrides_has_sampling() { let empty = SamplingOverrides { - model: None, num_ctx: None, temperature: None, - top_p: None, top_k: None, min_p: None, + model: None, + num_ctx: None, + temperature: None, + top_p: None, + top_k: None, + min_p: None, }; assert!(!empty.has_sampling()); let with_temp = SamplingOverrides { - model: None, num_ctx: Some(4096), temperature: Some(0.7), - top_p: None, top_k: None, min_p: None, + model: None, + num_ctx: Some(4096), + temperature: Some(0.7), + top_p: None, + top_k: None, + min_p: None, }; assert!(with_temp.has_sampling()); } diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index 86671af..9837733 100644 --- a/src/ai/insight_chat.rs +++ b/src/ai/insight_chat.rs @@ -308,7 +308,9 @@ impl InsightChatService { let stored_model = insight.model_version.clone(); let overrides = SamplingOverrides { - model: req.model.clone() + model: req + .model + .clone() .or_else(|| Some(stored_model.clone())) .filter(|m| !m.is_empty()), num_ctx: req.num_ctx, @@ -731,7 +733,9 @@ impl InsightChatService { let stored_model = insight.model_version.clone(); let overrides = SamplingOverrides { - model: req.model.clone() + model: req + .model + .clone() .or_else(|| Some(stored_model.clone())) .filter(|m| !m.is_empty()), num_ctx: req.num_ctx, @@ -928,17 +932,15 @@ impl InsightChatService { // images_inline backends send images directly to the chat model. let visual_block = if !backend.images_inline { match image_base64.as_deref() { - Some(b64) => { - match backend.local().describe_image(b64).await { - Ok(desc) => { - format!("Visual description (from local vision model):\n{}\n", desc) - } - Err(e) => { - log::warn!("{} bootstrap: describe_image failed: {}", kind.as_str(), e); - String::new() - } + Some(b64) => match backend.local().describe_image(b64).await { + Ok(desc) => { + format!("Visual description (from local vision model):\n{}\n", desc) } - } + Err(e) => { + log::warn!("{} bootstrap: describe_image failed: {}", kind.as_str(), e); + String::new() + } + }, None => String::new(), } } else { @@ -1328,10 +1330,7 @@ fn resolve_bootstrap_backend(supplied: Option<&str>) -> Result { .filter(|s| !s.is_empty()) .unwrap_or_else(|| "local".to_string()); if !matches!(lower.as_str(), "local" | "hybrid") { - bail!( - "unknown backend '{}'; expected 'local' or 'hybrid'", - lower - ); + bail!("unknown backend '{}'; expected 'local' or 'hybrid'", lower); } Ok(lower) } @@ -1971,7 +1970,11 @@ mod tests { // Both "openrouter" and the former "llamacpp" value are unknown now. for label in &["openrouter", "llamacpp"] { let err = validate_cross_replay("local", label).unwrap_err(); - assert!(format!("{}", err).contains("unknown backend"), "label={}", label); + assert!( + format!("{}", err).contains("unknown backend"), + "label={}", + label + ); } } diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index b0cd54c..8e39c59 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -11,9 +11,9 @@ use std::sync::{Arc, Mutex}; use crate::ai::apollo_client::{ApolloClient, ApolloPlace}; use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides}; +use crate::ai::llamacpp::LlamaCppClient; use crate::ai::llm_client::LlmClient; use crate::ai::ollama::{ChatMessage, OllamaClient, Tool}; -use crate::ai::llamacpp::LlamaCppClient; use crate::ai::openrouter::OpenRouterClient; use crate::ai::sms_client::{SmsApiClient, SmsSearchHit, SmsSearchParams}; use crate::ai::user_display_name; @@ -35,7 +35,10 @@ pub(crate) fn parse_title_body(raw: &str) -> (String, String) { let trimmed = raw.trim(); // Try "Title: \n\n<body>" or "Title: <title>\n<body>" - if let Some(rest) = trimmed.strip_prefix("Title:").or_else(|| trimmed.strip_prefix("title:")) { + if let Some(rest) = trimmed + .strip_prefix("Title:") + .or_else(|| trimmed.strip_prefix("title:")) + { let rest = rest.trim_start(); if let Some(split_pos) = rest.find("\n\n").or_else(|| rest.find('\n')) { let title = rest[..split_pos].trim(); @@ -1644,7 +1647,10 @@ Return ONLY the summary, nothing else."#, "get_location_history" => self.tool_get_location_history(arguments, cx).await, "get_file_tags" => self.tool_get_file_tags(arguments, cx).await, "get_faces_in_photo" => self.tool_get_faces_in_photo(arguments, cx).await, - "describe_photo" => self.tool_describe_photo(backend.local(), image_base64).await, + "describe_photo" => { + self.tool_describe_photo(backend.local(), image_base64) + .await + } "reverse_geocode" => self.tool_reverse_geocode(arguments).await, "get_personal_place_at" => self.tool_get_personal_place_at(arguments).await, "recall_entities" => self.tool_recall_entities(arguments, cx).await, @@ -1655,7 +1661,13 @@ Return ONLY the summary, nothing else."#, "store_entity" => self.tool_store_entity(arguments, cx).await, "store_fact" => { self.tool_store_fact( - arguments, file_path, user_id, persona_id, model, backend_label, cx, + arguments, + file_path, + user_id, + persona_id, + model, + backend_label, + cx, ) .await } @@ -1810,9 +1822,8 @@ Return ONLY the summary, nothing else."#, ); let started = std::time::Instant::now(); - let system = Some( - "You are a terse relevance ranker. You output only numbers separated by commas.", - ); + let system = + Some("You are a terse relevance ranker. You output only numbers separated by commas."); let response = local.generate(&prompt, system, None).await?; log::info!( "rerank: finished in {} ms (prompt={} chars)", @@ -1960,7 +1971,8 @@ Return ONLY the summary, nothing else."#, .unwrap_or(20) .clamp(1, 50) as usize; let contact_id = args.get("contact_id").and_then(|v| v.as_i64()); - let contact = args.get("contact") + let contact = args + .get("contact") .and_then(|v| v.as_str()) .map(|s| s.trim().to_string()) .filter(|s| !s.is_empty()) @@ -2710,22 +2722,17 @@ Return ONLY the summary, nothing else."#, // Generate embedding for name + description (best-effort) via the // configured local backend. let embed_text = format!("{} {}", name, description); - let embedding: Option<Vec<u8>> = match crate::ai::embed_one( - &self.ollama, - self.llamacpp.as_deref(), - &embed_text, - ) - .await - { - Ok(vec) => { - let bytes: Vec<u8> = vec.iter().flat_map(|f| f.to_le_bytes()).collect(); - Some(bytes) - } - Err(e) => { - log::warn!("Embedding generation failed for entity '{}': {}", name, e); - None - } - }; + let embedding: Option<Vec<u8>> = + match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &embed_text).await { + Ok(vec) => { + let bytes: Vec<u8> = vec.iter().flat_map(|f| f.to_le_bytes()).collect(); + Some(bytes) + } + Err(e) => { + log::warn!("Embedding generation failed for entity '{}': {}", name, e); + None + } + }; let now = chrono::Utc::now().timestamp(); let insert = InsertEntity { @@ -3606,8 +3613,7 @@ Return ONLY the summary, nothing else."#, kind: BackendKind, overrides: &SamplingOverrides, ) -> Result<ResolvedBackend> { - let local_via_llamacpp = - crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); + let local_via_llamacpp = crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); let is_hybrid = kind == BackendKind::Hybrid; // ── chat client ──────────────────────────────────────────────── @@ -3680,12 +3686,15 @@ Return ONLY the summary, nothing else."#, }; // ── local client (utility calls: rerank, describe_image, etc.) ─ - // For llamacpp: mirror the chat model selection so rerank / - // describe_image hit the same model that's already loaded — - // avoids a mid-turn model swap in llama-swap exclusive mode. + // For llamacpp in local mode: mirror the chat model selection so + // rerank / describe_image hit the same model that's already + // loaded — avoids a mid-turn model swap in llama-swap exclusive + // mode. In hybrid mode the override is an OpenRouter model id + // (e.g. "google/gemini-3-flash-preview") which llama-swap can't + // serve — keep the configured local slots. let local: Box<dyn LlmClient> = if local_via_llamacpp { let mut lc = self.llamacpp.as_ref().unwrap().as_ref().clone(); - if let Some(ref m) = overrides.model { + if !is_hybrid && let Some(ref m) = overrides.model { lc.primary_model = m.clone(); lc.set_vision_model(m.clone()); } @@ -3713,14 +3722,14 @@ Return ONLY the summary, nothing else."#, .await .unwrap_or(false); - let available_on_fallback = - if let Some(ref fallback_url) = self.ollama.fallback_url { - OllamaClient::is_model_available(fallback_url, model) - .await - .unwrap_or(false) - } else { - false - }; + let available_on_fallback = if let Some(ref fallback_url) = self.ollama.fallback_url + { + OllamaClient::is_model_available(fallback_url, model) + .await + .unwrap_or(false) + } else { + false + }; if !available_on_primary && !available_on_fallback { anyhow::bail!( @@ -3761,10 +3770,7 @@ Return ONLY the summary, nothing else."#, }; if !capabilities.has_tool_calling { - anyhow::bail!( - "tool calling not supported by model '{}'", - ollama_for_caps - ); + anyhow::bail!("tool calling not supported by model '{}'", ollama_for_caps); } capabilities.has_vision @@ -3801,9 +3807,7 @@ Return ONLY the summary, nothing else."#, span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64)); // 1. Resolve backend + build clients. - let kind = BackendKind::parse( - backend.as_deref().unwrap_or("local"), - )?; + let kind = BackendKind::parse(backend.as_deref().unwrap_or("local"))?; span.set_attribute(KeyValue::new("backend", kind.as_str())); let overrides = SamplingOverrides { model: custom_model, @@ -3933,7 +3937,11 @@ Return ONLY the summary, nothing else."#, Some(desc) } Err(e) => { - log::warn!("{}: vision describe failed, continuing without: {}", kind, e); + log::warn!( + "{}: vision describe failed, continuing without: {}", + kind, + e + ); None } }, @@ -4813,4 +4821,42 @@ mod tests { assert!(t.len() <= 60); assert_eq!(b, input); } + + /// Regression: hybrid mode was leaking the OpenRouter model override + /// into the local llamacpp client, causing describe_image to send + /// e.g. "google/gemini-3-flash-preview" to llama-swap (which 400s). + #[test] + fn resolve_backend_hybrid_does_not_leak_model_to_local_llamacpp() { + use crate::ai::llamacpp::LlamaCppClient; + + let mut base = + LlamaCppClient::new(Some("http://localhost:9292/v1".into()), Some("chat".into())); + base.set_vision_model("vision".into()); + base.set_embedding_model("embed".into()); + + let openrouter_model = "google/gemini-3-flash-preview"; + let overrides_model: Option<String> = Some(openrouter_model.into()); + let is_hybrid = true; + + // Replicate the resolve_backend local-client construction + // (lines ~3686-3695 of this file). + let mut lc = base.clone(); + if let Some(ref m) = overrides_model { + if !is_hybrid { + lc.primary_model = m.clone(); + lc.set_vision_model(m.clone()); + } + } + + // In hybrid mode the local client must keep its configured slots. + assert_eq!( + lc.vision_model, "vision", + "hybrid mode must not override vision_model with OpenRouter model" + ); + assert_eq!( + lc.primary_model, "chat", + "hybrid mode must not override primary_model with OpenRouter model" + ); + assert_eq!(lc.embedding_model, "embed"); + } } diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs index 8ea1063..e2ba00d 100644 --- a/src/ai/llamacpp.rs +++ b/src/ai/llamacpp.rs @@ -409,10 +409,7 @@ impl LlmClient for LlamaCppClient { tools.len() ); let mut body = serde_json::Map::new(); - body.insert( - "model".into(), - Value::String(self.primary_model.clone()), - ); + body.insert("model".into(), Value::String(self.primary_model.clone())); body.insert( "messages".into(), Value::Array(Self::messages_to_openai(&messages)), @@ -1071,6 +1068,28 @@ mod tests { assert_eq!(local.embedding_model, "embed"); } + #[test] + fn hybrid_mode_local_client_preserves_vision_model() { + // In hybrid mode, overrides.model is an OpenRouter model id + // (e.g. "google/gemini-3-flash-preview"). The local llamacpp + // client must NOT adopt that as its vision_model — it should + // keep the configured LLAMA_SWAP_VISION_MODEL so describe_image + // hits the correct local slot instead of sending an unknown + // model name to llama-swap. + let mut base = LlamaCppClient::new(None, Some("chat".into())); + base.set_vision_model("vision".into()); + base.set_embedding_model("embed".into()); + + // Simulate what resolve_backend SHOULD do for hybrid mode: + // clone but do NOT override primary_model / vision_model. + let local = base.clone(); + + // The local client keeps its configured slots. + assert_eq!(local.primary_model, "chat"); + assert_eq!(local.vision_model, "vision"); + assert_eq!(local.embedding_model, "embed"); + } + #[test] fn assistant_tool_calls_emit_null_content() { let msg = ChatMessage { @@ -1086,7 +1105,10 @@ mod tests { images: None, }; let wire = LlamaCppClient::messages_to_openai(&[msg]); - assert!(wire[0]["content"].is_null(), "empty content + tool_calls should emit null"); + assert!( + wire[0]["content"].is_null(), + "empty content + tool_calls should emit null" + ); } #[test] diff --git a/src/ai/mod.rs b/src/ai/mod.rs index 09cbfda..c991c71 100644 --- a/src/ai/mod.rs +++ b/src/ai/mod.rs @@ -25,11 +25,11 @@ pub use handlers::{ get_insight_handler, get_openrouter_models_handler, rate_insight_handler, }; pub use insight_generator::InsightGenerator; +pub use llamacpp::LlamaCppClient; #[allow(unused_imports)] pub use llm_client::{ ChatMessage, LlmClient, ModelCapabilities, Tool, ToolCall, ToolCallFunction, ToolFunction, }; -pub use llamacpp::LlamaCppClient; pub use ollama::{EMBEDDING_MODEL, OllamaClient}; pub use sms_client::{SmsApiClient, SmsMessage};