From b03ee6034237e99248c8f4f4d7f5cb91b972ef2e Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Tue, 26 May 2026 09:55:16 -0400
Subject: [PATCH] fix: prevent hybrid mode from leaking OpenRouter model to
 local llamacpp client

When backend=hybrid with LLM_BACKEND=llamacpp, the user-selected model
(an OpenRouter id like "google/gemini-3-flash-preview") was being applied
to the local LlamaCppClient's primary_model and vision_model. This caused
describe_image to send the OpenRouter model name to llama-swap, which
returned 400 because it has no such slot.

Guard the local-client model override with !is_hybrid so it only applies
in local-only mode (where the user is selecting a different local model).
Bump to v1.2.0.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 Cargo.lock                  |   2 +-
 Cargo.toml                  |   2 +-
 src/ai/backend.rs           |  38 +++++++---
 src/ai/insight_chat.rs      |  37 +++++-----
 src/ai/insight_generator.rs | 138 ++++++++++++++++++++++++------------
 src/ai/llamacpp.rs          |  32 +++++++--
 src/ai/mod.rs               |   2 +-
 7 files changed, 172 insertions(+), 79 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 3f6d0c2..5d3e4ce 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2051,7 +2051,7 @@ dependencies = [
 
 [[package]]
 name = "image-api"
-version = "1.1.0"
+version = "1.2.0"
 dependencies = [
  "actix",
  "actix-cors",
diff --git a/Cargo.toml b/Cargo.toml
index 7713970..7324001 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "image-api"
-version = "1.1.0"
+version = "1.2.0"
 authors = ["Cameron Cordes <cameronc.dev@gmail.com>"]
 edition = "2024"
 
diff --git a/src/ai/backend.rs b/src/ai/backend.rs
index 0fe7747..0515f1c 100644
--- a/src/ai/backend.rs
+++ b/src/ai/backend.rs
@@ -13,7 +13,10 @@ impl BackendKind {
         match s.trim().to_lowercase().as_str() {
             "local" | "" => Ok(Self::Local),
             "hybrid" => Ok(Self::Hybrid),
-            other => Err(anyhow!("unknown backend '{}'; expected 'local' or 'hybrid'", other)),
+            other => Err(anyhow!(
+                "unknown backend '{}'; expected 'local' or 'hybrid'",
+                other
+            )),
         }
     }
 
@@ -65,7 +68,12 @@ impl ResolvedBackend {
         kind: BackendKind,
         images_inline: bool,
     ) -> Self {
-        Self { chat, local, kind, images_inline }
+        Self {
+            chat,
+            local,
+            kind,
+            images_inline,
+        }
     }
 
     pub fn chat(&self) -> &dyn LlmClient {
@@ -97,21 +105,35 @@ mod tests {
 
     #[test]
     fn backend_kind_as_str_roundtrips() {
-        assert_eq!(BackendKind::parse(BackendKind::Local.as_str()).unwrap(), BackendKind::Local);
-        assert_eq!(BackendKind::parse(BackendKind::Hybrid.as_str()).unwrap(), BackendKind::Hybrid);
+        assert_eq!(
+            BackendKind::parse(BackendKind::Local.as_str()).unwrap(),
+            BackendKind::Local
+        );
+        assert_eq!(
+            BackendKind::parse(BackendKind::Hybrid.as_str()).unwrap(),
+            BackendKind::Hybrid
+        );
     }
 
     #[test]
     fn sampling_overrides_has_sampling() {
         let empty = SamplingOverrides {
-            model: None, num_ctx: None, temperature: None,
-            top_p: None, top_k: None, min_p: None,
+            model: None,
+            num_ctx: None,
+            temperature: None,
+            top_p: None,
+            top_k: None,
+            min_p: None,
         };
         assert!(!empty.has_sampling());
 
         let with_temp = SamplingOverrides {
-            model: None, num_ctx: Some(4096), temperature: Some(0.7),
-            top_p: None, top_k: None, min_p: None,
+            model: None,
+            num_ctx: Some(4096),
+            temperature: Some(0.7),
+            top_p: None,
+            top_k: None,
+            min_p: None,
         };
         assert!(with_temp.has_sampling());
     }
diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs
index 86671af..9837733 100644
--- a/src/ai/insight_chat.rs
+++ b/src/ai/insight_chat.rs
@@ -308,7 +308,9 @@ impl InsightChatService {
 
         let stored_model = insight.model_version.clone();
         let overrides = SamplingOverrides {
-            model: req.model.clone()
+            model: req
+                .model
+                .clone()
                 .or_else(|| Some(stored_model.clone()))
                 .filter(|m| !m.is_empty()),
             num_ctx: req.num_ctx,
@@ -731,7 +733,9 @@ impl InsightChatService {
 
         let stored_model = insight.model_version.clone();
         let overrides = SamplingOverrides {
-            model: req.model.clone()
+            model: req
+                .model
+                .clone()
                 .or_else(|| Some(stored_model.clone()))
                 .filter(|m| !m.is_empty()),
             num_ctx: req.num_ctx,
@@ -928,17 +932,15 @@ impl InsightChatService {
         // images_inline backends send images directly to the chat model.
         let visual_block = if !backend.images_inline {
             match image_base64.as_deref() {
-                Some(b64) => {
-                    match backend.local().describe_image(b64).await {
-                        Ok(desc) => {
-                            format!("Visual description (from local vision model):\n{}\n", desc)
-                        }
-                        Err(e) => {
-                            log::warn!("{} bootstrap: describe_image failed: {}", kind.as_str(), e);
-                            String::new()
-                        }
+                Some(b64) => match backend.local().describe_image(b64).await {
+                    Ok(desc) => {
+                        format!("Visual description (from local vision model):\n{}\n", desc)
                     }
-                }
+                    Err(e) => {
+                        log::warn!("{} bootstrap: describe_image failed: {}", kind.as_str(), e);
+                        String::new()
+                    }
+                },
                 None => String::new(),
             }
         } else {
@@ -1328,10 +1330,7 @@ fn resolve_bootstrap_backend(supplied: Option<&str>) -> Result<String> {
         .filter(|s| !s.is_empty())
         .unwrap_or_else(|| "local".to_string());
     if !matches!(lower.as_str(), "local" | "hybrid") {
-        bail!(
-            "unknown backend '{}'; expected 'local' or 'hybrid'",
-            lower
-        );
+        bail!("unknown backend '{}'; expected 'local' or 'hybrid'", lower);
     }
     Ok(lower)
 }
@@ -1971,7 +1970,11 @@ mod tests {
         // Both "openrouter" and the former "llamacpp" value are unknown now.
         for label in &["openrouter", "llamacpp"] {
             let err = validate_cross_replay("local", label).unwrap_err();
-            assert!(format!("{}", err).contains("unknown backend"), "label={}", label);
+            assert!(
+                format!("{}", err).contains("unknown backend"),
+                "label={}",
+                label
+            );
         }
     }
 
diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs
index b0cd54c..8e39c59 100644
--- a/src/ai/insight_generator.rs
+++ b/src/ai/insight_generator.rs
@@ -11,9 +11,9 @@ use std::sync::{Arc, Mutex};
 
 use crate::ai::apollo_client::{ApolloClient, ApolloPlace};
 use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides};
+use crate::ai::llamacpp::LlamaCppClient;
 use crate::ai::llm_client::LlmClient;
 use crate::ai::ollama::{ChatMessage, OllamaClient, Tool};
-use crate::ai::llamacpp::LlamaCppClient;
 use crate::ai::openrouter::OpenRouterClient;
 use crate::ai::sms_client::{SmsApiClient, SmsSearchHit, SmsSearchParams};
 use crate::ai::user_display_name;
@@ -35,7 +35,10 @@ pub(crate) fn parse_title_body(raw: &str) -> (String, String) {
     let trimmed = raw.trim();
 
     // Try "Title: <title>\n\n<body>" or "Title: <title>\n<body>"
-    if let Some(rest) = trimmed.strip_prefix("Title:").or_else(|| trimmed.strip_prefix("title:")) {
+    if let Some(rest) = trimmed
+        .strip_prefix("Title:")
+        .or_else(|| trimmed.strip_prefix("title:"))
+    {
         let rest = rest.trim_start();
         if let Some(split_pos) = rest.find("\n\n").or_else(|| rest.find('\n')) {
             let title = rest[..split_pos].trim();
@@ -1644,7 +1647,10 @@ Return ONLY the summary, nothing else."#,
             "get_location_history" => self.tool_get_location_history(arguments, cx).await,
             "get_file_tags" => self.tool_get_file_tags(arguments, cx).await,
             "get_faces_in_photo" => self.tool_get_faces_in_photo(arguments, cx).await,
-            "describe_photo" => self.tool_describe_photo(backend.local(), image_base64).await,
+            "describe_photo" => {
+                self.tool_describe_photo(backend.local(), image_base64)
+                    .await
+            }
             "reverse_geocode" => self.tool_reverse_geocode(arguments).await,
             "get_personal_place_at" => self.tool_get_personal_place_at(arguments).await,
             "recall_entities" => self.tool_recall_entities(arguments, cx).await,
@@ -1655,7 +1661,13 @@ Return ONLY the summary, nothing else."#,
             "store_entity" => self.tool_store_entity(arguments, cx).await,
             "store_fact" => {
                 self.tool_store_fact(
-                    arguments, file_path, user_id, persona_id, model, backend_label, cx,
+                    arguments,
+                    file_path,
+                    user_id,
+                    persona_id,
+                    model,
+                    backend_label,
+                    cx,
                 )
                 .await
             }
@@ -1810,9 +1822,8 @@ Return ONLY the summary, nothing else."#,
         );
 
         let started = std::time::Instant::now();
-        let system = Some(
-            "You are a terse relevance ranker. You output only numbers separated by commas.",
-        );
+        let system =
+            Some("You are a terse relevance ranker. You output only numbers separated by commas.");
         let response = local.generate(&prompt, system, None).await?;
         log::info!(
             "rerank: finished in {} ms (prompt={} chars)",
@@ -1960,7 +1971,8 @@ Return ONLY the summary, nothing else."#,
             .unwrap_or(20)
             .clamp(1, 50) as usize;
         let contact_id = args.get("contact_id").and_then(|v| v.as_i64());
-        let contact = args.get("contact")
+        let contact = args
+            .get("contact")
             .and_then(|v| v.as_str())
             .map(|s| s.trim().to_string())
             .filter(|s| !s.is_empty())
@@ -2710,22 +2722,17 @@ Return ONLY the summary, nothing else."#,
         // Generate embedding for name + description (best-effort) via the
         // configured local backend.
         let embed_text = format!("{} {}", name, description);
-        let embedding: Option<Vec<u8>> = match crate::ai::embed_one(
-            &self.ollama,
-            self.llamacpp.as_deref(),
-            &embed_text,
-        )
-        .await
-        {
-            Ok(vec) => {
-                let bytes: Vec<u8> = vec.iter().flat_map(|f| f.to_le_bytes()).collect();
-                Some(bytes)
-            }
-            Err(e) => {
-                log::warn!("Embedding generation failed for entity '{}': {}", name, e);
-                None
-            }
-        };
+        let embedding: Option<Vec<u8>> =
+            match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &embed_text).await {
+                Ok(vec) => {
+                    let bytes: Vec<u8> = vec.iter().flat_map(|f| f.to_le_bytes()).collect();
+                    Some(bytes)
+                }
+                Err(e) => {
+                    log::warn!("Embedding generation failed for entity '{}': {}", name, e);
+                    None
+                }
+            };
 
         let now = chrono::Utc::now().timestamp();
         let insert = InsertEntity {
@@ -3606,8 +3613,7 @@ Return ONLY the summary, nothing else."#,
         kind: BackendKind,
         overrides: &SamplingOverrides,
     ) -> Result<ResolvedBackend> {
-        let local_via_llamacpp =
-            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
+        let local_via_llamacpp = crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
         let is_hybrid = kind == BackendKind::Hybrid;
 
         // ── chat client ────────────────────────────────────────────────
@@ -3680,12 +3686,15 @@ Return ONLY the summary, nothing else."#,
         };
 
         // ── local client (utility calls: rerank, describe_image, etc.) ─
-        // For llamacpp: mirror the chat model selection so rerank /
-        // describe_image hit the same model that's already loaded —
-        // avoids a mid-turn model swap in llama-swap exclusive mode.
+        // For llamacpp in local mode: mirror the chat model selection so
+        // rerank / describe_image hit the same model that's already
+        // loaded — avoids a mid-turn model swap in llama-swap exclusive
+        // mode. In hybrid mode the override is an OpenRouter model id
+        // (e.g. "google/gemini-3-flash-preview") which llama-swap can't
+        // serve — keep the configured local slots.
         let local: Box<dyn LlmClient> = if local_via_llamacpp {
             let mut lc = self.llamacpp.as_ref().unwrap().as_ref().clone();
-            if let Some(ref m) = overrides.model {
+            if !is_hybrid && let Some(ref m) = overrides.model {
                 lc.primary_model = m.clone();
                 lc.set_vision_model(m.clone());
             }
@@ -3713,14 +3722,14 @@ Return ONLY the summary, nothing else."#,
                         .await
                         .unwrap_or(false);
 
-                let available_on_fallback =
-                    if let Some(ref fallback_url) = self.ollama.fallback_url {
-                        OllamaClient::is_model_available(fallback_url, model)
-                            .await
-                            .unwrap_or(false)
-                    } else {
-                        false
-                    };
+                let available_on_fallback = if let Some(ref fallback_url) = self.ollama.fallback_url
+                {
+                    OllamaClient::is_model_available(fallback_url, model)
+                        .await
+                        .unwrap_or(false)
+                } else {
+                    false
+                };
 
                 if !available_on_primary && !available_on_fallback {
                     anyhow::bail!(
@@ -3761,10 +3770,7 @@ Return ONLY the summary, nothing else."#,
             };
 
             if !capabilities.has_tool_calling {
-                anyhow::bail!(
-                    "tool calling not supported by model '{}'",
-                    ollama_for_caps
-                );
+                anyhow::bail!("tool calling not supported by model '{}'", ollama_for_caps);
             }
 
             capabilities.has_vision
@@ -3801,9 +3807,7 @@ Return ONLY the summary, nothing else."#,
         span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64));
 
         // 1. Resolve backend + build clients.
-        let kind = BackendKind::parse(
-            backend.as_deref().unwrap_or("local"),
-        )?;
+        let kind = BackendKind::parse(backend.as_deref().unwrap_or("local"))?;
         span.set_attribute(KeyValue::new("backend", kind.as_str()));
         let overrides = SamplingOverrides {
             model: custom_model,
@@ -3933,7 +3937,11 @@ Return ONLY the summary, nothing else."#,
                         Some(desc)
                     }
                     Err(e) => {
-                        log::warn!("{}: vision describe failed, continuing without: {}", kind, e);
+                        log::warn!(
+                            "{}: vision describe failed, continuing without: {}",
+                            kind,
+                            e
+                        );
                         None
                     }
                 },
@@ -4813,4 +4821,42 @@ mod tests {
         assert!(t.len() <= 60);
         assert_eq!(b, input);
     }
+
+    /// Regression: hybrid mode was leaking the OpenRouter model override
+    /// into the local llamacpp client, causing describe_image to send
+    /// e.g. "google/gemini-3-flash-preview" to llama-swap (which 400s).
+    #[test]
+    fn resolve_backend_hybrid_does_not_leak_model_to_local_llamacpp() {
+        use crate::ai::llamacpp::LlamaCppClient;
+
+        let mut base =
+            LlamaCppClient::new(Some("http://localhost:9292/v1".into()), Some("chat".into()));
+        base.set_vision_model("vision".into());
+        base.set_embedding_model("embed".into());
+
+        let openrouter_model = "google/gemini-3-flash-preview";
+        let overrides_model: Option<String> = Some(openrouter_model.into());
+        let is_hybrid = true;
+
+        // Replicate the resolve_backend local-client construction
+        // (lines ~3686-3695 of this file).
+        let mut lc = base.clone();
+        if let Some(ref m) = overrides_model {
+            if !is_hybrid {
+                lc.primary_model = m.clone();
+                lc.set_vision_model(m.clone());
+            }
+        }
+
+        // In hybrid mode the local client must keep its configured slots.
+        assert_eq!(
+            lc.vision_model, "vision",
+            "hybrid mode must not override vision_model with OpenRouter model"
+        );
+        assert_eq!(
+            lc.primary_model, "chat",
+            "hybrid mode must not override primary_model with OpenRouter model"
+        );
+        assert_eq!(lc.embedding_model, "embed");
+    }
 }
diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs
index 8ea1063..e2ba00d 100644
--- a/src/ai/llamacpp.rs
+++ b/src/ai/llamacpp.rs
@@ -409,10 +409,7 @@ impl LlmClient for LlamaCppClient {
             tools.len()
         );
         let mut body = serde_json::Map::new();
-        body.insert(
-            "model".into(),
-            Value::String(self.primary_model.clone()),
-        );
+        body.insert("model".into(), Value::String(self.primary_model.clone()));
         body.insert(
             "messages".into(),
             Value::Array(Self::messages_to_openai(&messages)),
@@ -1071,6 +1068,28 @@ mod tests {
         assert_eq!(local.embedding_model, "embed");
     }
 
+    #[test]
+    fn hybrid_mode_local_client_preserves_vision_model() {
+        // In hybrid mode, overrides.model is an OpenRouter model id
+        // (e.g. "google/gemini-3-flash-preview"). The local llamacpp
+        // client must NOT adopt that as its vision_model — it should
+        // keep the configured LLAMA_SWAP_VISION_MODEL so describe_image
+        // hits the correct local slot instead of sending an unknown
+        // model name to llama-swap.
+        let mut base = LlamaCppClient::new(None, Some("chat".into()));
+        base.set_vision_model("vision".into());
+        base.set_embedding_model("embed".into());
+
+        // Simulate what resolve_backend SHOULD do for hybrid mode:
+        // clone but do NOT override primary_model / vision_model.
+        let local = base.clone();
+
+        // The local client keeps its configured slots.
+        assert_eq!(local.primary_model, "chat");
+        assert_eq!(local.vision_model, "vision");
+        assert_eq!(local.embedding_model, "embed");
+    }
+
     #[test]
     fn assistant_tool_calls_emit_null_content() {
         let msg = ChatMessage {
@@ -1086,7 +1105,10 @@ mod tests {
             images: None,
         };
         let wire = LlamaCppClient::messages_to_openai(&[msg]);
-        assert!(wire[0]["content"].is_null(), "empty content + tool_calls should emit null");
+        assert!(
+            wire[0]["content"].is_null(),
+            "empty content + tool_calls should emit null"
+        );
     }
 
     #[test]
diff --git a/src/ai/mod.rs b/src/ai/mod.rs
index 09cbfda..c991c71 100644
--- a/src/ai/mod.rs
+++ b/src/ai/mod.rs
@@ -25,11 +25,11 @@ pub use handlers::{
     get_insight_handler, get_openrouter_models_handler, rate_insight_handler,
 };
 pub use insight_generator::InsightGenerator;
+pub use llamacpp::LlamaCppClient;
 #[allow(unused_imports)]
 pub use llm_client::{
     ChatMessage, LlmClient, ModelCapabilities, Tool, ToolCall, ToolCallFunction, ToolFunction,
 };
-pub use llamacpp::LlamaCppClient;
 pub use ollama::{EMBEDDING_MODEL, OllamaClient};
 pub use sms_client::{SmsApiClient, SmsMessage};