fix: prevent hybrid mode from leaking OpenRouter model to local llamacpp client

When backend=hybrid with LLM_BACKEND=llamacpp, the user-selected model
(an OpenRouter id like "google/gemini-3-flash-preview") was being applied
to the local LlamaCppClient's primary_model and vision_model. This caused
describe_image to send the OpenRouter model name to llama-swap, which
returned 400 because it has no such slot.

Guard the local-client model override with !is_hybrid so it only applies
in local-only mode (where the user is selecting a different local model).
Bump to v1.2.0.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Cameron Cordes
2026-05-26 09:55:16 -04:00
parent 0a627f4880
commit b03ee60342
7 changed files with 172 additions and 79 deletions
+92 -46
View File
@@ -11,9 +11,9 @@ use std::sync::{Arc, Mutex};
use crate::ai::apollo_client::{ApolloClient, ApolloPlace};
use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides};
use crate::ai::llamacpp::LlamaCppClient;
use crate::ai::llm_client::LlmClient;
use crate::ai::ollama::{ChatMessage, OllamaClient, Tool};
use crate::ai::llamacpp::LlamaCppClient;
use crate::ai::openrouter::OpenRouterClient;
use crate::ai::sms_client::{SmsApiClient, SmsSearchHit, SmsSearchParams};
use crate::ai::user_display_name;
@@ -35,7 +35,10 @@ pub(crate) fn parse_title_body(raw: &str) -> (String, String) {
let trimmed = raw.trim();
// Try "Title: <title>\n\n<body>" or "Title: <title>\n<body>"
if let Some(rest) = trimmed.strip_prefix("Title:").or_else(|| trimmed.strip_prefix("title:")) {
if let Some(rest) = trimmed
.strip_prefix("Title:")
.or_else(|| trimmed.strip_prefix("title:"))
{
let rest = rest.trim_start();
if let Some(split_pos) = rest.find("\n\n").or_else(|| rest.find('\n')) {
let title = rest[..split_pos].trim();
@@ -1644,7 +1647,10 @@ Return ONLY the summary, nothing else."#,
"get_location_history" => self.tool_get_location_history(arguments, cx).await,
"get_file_tags" => self.tool_get_file_tags(arguments, cx).await,
"get_faces_in_photo" => self.tool_get_faces_in_photo(arguments, cx).await,
"describe_photo" => self.tool_describe_photo(backend.local(), image_base64).await,
"describe_photo" => {
self.tool_describe_photo(backend.local(), image_base64)
.await
}
"reverse_geocode" => self.tool_reverse_geocode(arguments).await,
"get_personal_place_at" => self.tool_get_personal_place_at(arguments).await,
"recall_entities" => self.tool_recall_entities(arguments, cx).await,
@@ -1655,7 +1661,13 @@ Return ONLY the summary, nothing else."#,
"store_entity" => self.tool_store_entity(arguments, cx).await,
"store_fact" => {
self.tool_store_fact(
arguments, file_path, user_id, persona_id, model, backend_label, cx,
arguments,
file_path,
user_id,
persona_id,
model,
backend_label,
cx,
)
.await
}
@@ -1810,9 +1822,8 @@ Return ONLY the summary, nothing else."#,
);
let started = std::time::Instant::now();
let system = Some(
"You are a terse relevance ranker. You output only numbers separated by commas.",
);
let system =
Some("You are a terse relevance ranker. You output only numbers separated by commas.");
let response = local.generate(&prompt, system, None).await?;
log::info!(
"rerank: finished in {} ms (prompt={} chars)",
@@ -1960,7 +1971,8 @@ Return ONLY the summary, nothing else."#,
.unwrap_or(20)
.clamp(1, 50) as usize;
let contact_id = args.get("contact_id").and_then(|v| v.as_i64());
let contact = args.get("contact")
let contact = args
.get("contact")
.and_then(|v| v.as_str())
.map(|s| s.trim().to_string())
.filter(|s| !s.is_empty())
@@ -2710,22 +2722,17 @@ Return ONLY the summary, nothing else."#,
// Generate embedding for name + description (best-effort) via the
// configured local backend.
let embed_text = format!("{} {}", name, description);
let embedding: Option<Vec<u8>> = match crate::ai::embed_one(
&self.ollama,
self.llamacpp.as_deref(),
&embed_text,
)
.await
{
Ok(vec) => {
let bytes: Vec<u8> = vec.iter().flat_map(|f| f.to_le_bytes()).collect();
Some(bytes)
}
Err(e) => {
log::warn!("Embedding generation failed for entity '{}': {}", name, e);
None
}
};
let embedding: Option<Vec<u8>> =
match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &embed_text).await {
Ok(vec) => {
let bytes: Vec<u8> = vec.iter().flat_map(|f| f.to_le_bytes()).collect();
Some(bytes)
}
Err(e) => {
log::warn!("Embedding generation failed for entity '{}': {}", name, e);
None
}
};
let now = chrono::Utc::now().timestamp();
let insert = InsertEntity {
@@ -3606,8 +3613,7 @@ Return ONLY the summary, nothing else."#,
kind: BackendKind,
overrides: &SamplingOverrides,
) -> Result<ResolvedBackend> {
let local_via_llamacpp =
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
let local_via_llamacpp = crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
let is_hybrid = kind == BackendKind::Hybrid;
// ── chat client ────────────────────────────────────────────────
@@ -3680,12 +3686,15 @@ Return ONLY the summary, nothing else."#,
};
// ── local client (utility calls: rerank, describe_image, etc.) ─
// For llamacpp: mirror the chat model selection so rerank /
// describe_image hit the same model that's already loaded —
// avoids a mid-turn model swap in llama-swap exclusive mode.
// For llamacpp in local mode: mirror the chat model selection so
// rerank / describe_image hit the same model that's already
// loaded — avoids a mid-turn model swap in llama-swap exclusive
// mode. In hybrid mode the override is an OpenRouter model id
// (e.g. "google/gemini-3-flash-preview") which llama-swap can't
// serve — keep the configured local slots.
let local: Box<dyn LlmClient> = if local_via_llamacpp {
let mut lc = self.llamacpp.as_ref().unwrap().as_ref().clone();
if let Some(ref m) = overrides.model {
if !is_hybrid && let Some(ref m) = overrides.model {
lc.primary_model = m.clone();
lc.set_vision_model(m.clone());
}
@@ -3713,14 +3722,14 @@ Return ONLY the summary, nothing else."#,
.await
.unwrap_or(false);
let available_on_fallback =
if let Some(ref fallback_url) = self.ollama.fallback_url {
OllamaClient::is_model_available(fallback_url, model)
.await
.unwrap_or(false)
} else {
false
};
let available_on_fallback = if let Some(ref fallback_url) = self.ollama.fallback_url
{
OllamaClient::is_model_available(fallback_url, model)
.await
.unwrap_or(false)
} else {
false
};
if !available_on_primary && !available_on_fallback {
anyhow::bail!(
@@ -3761,10 +3770,7 @@ Return ONLY the summary, nothing else."#,
};
if !capabilities.has_tool_calling {
anyhow::bail!(
"tool calling not supported by model '{}'",
ollama_for_caps
);
anyhow::bail!("tool calling not supported by model '{}'", ollama_for_caps);
}
capabilities.has_vision
@@ -3801,9 +3807,7 @@ Return ONLY the summary, nothing else."#,
span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64));
// 1. Resolve backend + build clients.
let kind = BackendKind::parse(
backend.as_deref().unwrap_or("local"),
)?;
let kind = BackendKind::parse(backend.as_deref().unwrap_or("local"))?;
span.set_attribute(KeyValue::new("backend", kind.as_str()));
let overrides = SamplingOverrides {
model: custom_model,
@@ -3933,7 +3937,11 @@ Return ONLY the summary, nothing else."#,
Some(desc)
}
Err(e) => {
log::warn!("{}: vision describe failed, continuing without: {}", kind, e);
log::warn!(
"{}: vision describe failed, continuing without: {}",
kind,
e
);
None
}
},
@@ -4813,4 +4821,42 @@ mod tests {
assert!(t.len() <= 60);
assert_eq!(b, input);
}
/// Regression guard for hybrid mode: the user-selected OpenRouter model
/// override must never be copied onto the local llamacpp client.
/// Previously describe_image ended up sending an id such as
/// "google/gemini-3-flash-preview" to llama-swap, which answered 400.
#[test]
fn resolve_backend_hybrid_does_not_leak_model_to_local_llamacpp() {
    use crate::ai::llamacpp::LlamaCppClient;

    // Baseline client with a distinct, recognisable name in every model slot.
    let mut configured =
        LlamaCppClient::new(Some("http://localhost:9292/v1".into()), Some("chat".into()));
    configured.set_vision_model("vision".into());
    configured.set_embedding_model("embed".into());

    // Inputs as seen for a hybrid request carrying an OpenRouter model id.
    let is_hybrid = true;
    let overrides_model: Option<String> = Some("google/gemini-3-flash-preview".into());

    // Hand-written mirror of the guarded local-client override performed in
    // `resolve_backend`: the override is applied only outside hybrid mode.
    // NOTE(review): this copies the guard rather than calling
    // `resolve_backend`, so it must be kept in sync with that function.
    let mut local = configured.clone();
    if !is_hybrid {
        if let Some(ref requested) = overrides_model {
            local.primary_model = requested.clone();
            local.set_vision_model(requested.clone());
        }
    }

    // Every slot must still hold the value configured on the baseline client.
    assert_eq!(
        local.vision_model, "vision",
        "hybrid mode must not override vision_model with OpenRouter model"
    );
    assert_eq!(
        local.primary_model, "chat",
        "hybrid mode must not override primary_model with OpenRouter model"
    );
    assert_eq!(local.embedding_model, "embed");
}
}