feature/llamacpp-backend #101

Merged
cameron merged 11 commits from feature/llamacpp-backend into master 2026-05-26 18:58:48 +00:00
7 changed files with 172 additions and 79 deletions
Showing only changes of commit b03ee60342 - Show all commits

2
Cargo.lock generated
View File

@@ -2051,7 +2051,7 @@ dependencies = [
[[package]] [[package]]
name = "image-api" name = "image-api"
version = "1.1.0" version = "1.2.0"
dependencies = [ dependencies = [
"actix", "actix",
"actix-cors", "actix-cors",

View File

@@ -1,6 +1,6 @@
[package] [package]
name = "image-api" name = "image-api"
version = "1.1.0" version = "1.2.0"
authors = ["Cameron Cordes <cameronc.dev@gmail.com>"] authors = ["Cameron Cordes <cameronc.dev@gmail.com>"]
edition = "2024" edition = "2024"

View File

@@ -13,7 +13,10 @@ impl BackendKind {
match s.trim().to_lowercase().as_str() { match s.trim().to_lowercase().as_str() {
"local" | "" => Ok(Self::Local), "local" | "" => Ok(Self::Local),
"hybrid" => Ok(Self::Hybrid), "hybrid" => Ok(Self::Hybrid),
other => Err(anyhow!("unknown backend '{}'; expected 'local' or 'hybrid'", other)), other => Err(anyhow!(
"unknown backend '{}'; expected 'local' or 'hybrid'",
other
)),
} }
} }
@@ -65,7 +68,12 @@ impl ResolvedBackend {
kind: BackendKind, kind: BackendKind,
images_inline: bool, images_inline: bool,
) -> Self { ) -> Self {
Self { chat, local, kind, images_inline } Self {
chat,
local,
kind,
images_inline,
}
} }
pub fn chat(&self) -> &dyn LlmClient { pub fn chat(&self) -> &dyn LlmClient {
@@ -97,21 +105,35 @@ mod tests {
#[test] #[test]
fn backend_kind_as_str_roundtrips() { fn backend_kind_as_str_roundtrips() {
assert_eq!(BackendKind::parse(BackendKind::Local.as_str()).unwrap(), BackendKind::Local); assert_eq!(
assert_eq!(BackendKind::parse(BackendKind::Hybrid.as_str()).unwrap(), BackendKind::Hybrid); BackendKind::parse(BackendKind::Local.as_str()).unwrap(),
BackendKind::Local
);
assert_eq!(
BackendKind::parse(BackendKind::Hybrid.as_str()).unwrap(),
BackendKind::Hybrid
);
} }
#[test] #[test]
fn sampling_overrides_has_sampling() { fn sampling_overrides_has_sampling() {
let empty = SamplingOverrides { let empty = SamplingOverrides {
model: None, num_ctx: None, temperature: None, model: None,
top_p: None, top_k: None, min_p: None, num_ctx: None,
temperature: None,
top_p: None,
top_k: None,
min_p: None,
}; };
assert!(!empty.has_sampling()); assert!(!empty.has_sampling());
let with_temp = SamplingOverrides { let with_temp = SamplingOverrides {
model: None, num_ctx: Some(4096), temperature: Some(0.7), model: None,
top_p: None, top_k: None, min_p: None, num_ctx: Some(4096),
temperature: Some(0.7),
top_p: None,
top_k: None,
min_p: None,
}; };
assert!(with_temp.has_sampling()); assert!(with_temp.has_sampling());
} }

View File

@@ -308,7 +308,9 @@ impl InsightChatService {
let stored_model = insight.model_version.clone(); let stored_model = insight.model_version.clone();
let overrides = SamplingOverrides { let overrides = SamplingOverrides {
model: req.model.clone() model: req
.model
.clone()
.or_else(|| Some(stored_model.clone())) .or_else(|| Some(stored_model.clone()))
.filter(|m| !m.is_empty()), .filter(|m| !m.is_empty()),
num_ctx: req.num_ctx, num_ctx: req.num_ctx,
@@ -731,7 +733,9 @@ impl InsightChatService {
let stored_model = insight.model_version.clone(); let stored_model = insight.model_version.clone();
let overrides = SamplingOverrides { let overrides = SamplingOverrides {
model: req.model.clone() model: req
.model
.clone()
.or_else(|| Some(stored_model.clone())) .or_else(|| Some(stored_model.clone()))
.filter(|m| !m.is_empty()), .filter(|m| !m.is_empty()),
num_ctx: req.num_ctx, num_ctx: req.num_ctx,
@@ -928,8 +932,7 @@ impl InsightChatService {
// images_inline backends send images directly to the chat model. // images_inline backends send images directly to the chat model.
let visual_block = if !backend.images_inline { let visual_block = if !backend.images_inline {
match image_base64.as_deref() { match image_base64.as_deref() {
Some(b64) => { Some(b64) => match backend.local().describe_image(b64).await {
match backend.local().describe_image(b64).await {
Ok(desc) => { Ok(desc) => {
format!("Visual description (from local vision model):\n{}\n", desc) format!("Visual description (from local vision model):\n{}\n", desc)
} }
@@ -937,8 +940,7 @@ impl InsightChatService {
log::warn!("{} bootstrap: describe_image failed: {}", kind.as_str(), e); log::warn!("{} bootstrap: describe_image failed: {}", kind.as_str(), e);
String::new() String::new()
} }
} },
}
None => String::new(), None => String::new(),
} }
} else { } else {
@@ -1328,10 +1330,7 @@ fn resolve_bootstrap_backend(supplied: Option<&str>) -> Result<String> {
.filter(|s| !s.is_empty()) .filter(|s| !s.is_empty())
.unwrap_or_else(|| "local".to_string()); .unwrap_or_else(|| "local".to_string());
if !matches!(lower.as_str(), "local" | "hybrid") { if !matches!(lower.as_str(), "local" | "hybrid") {
bail!( bail!("unknown backend '{}'; expected 'local' or 'hybrid'", lower);
"unknown backend '{}'; expected 'local' or 'hybrid'",
lower
);
} }
Ok(lower) Ok(lower)
} }
@@ -1971,7 +1970,11 @@ mod tests {
// Both "openrouter" and the former "llamacpp" value are unknown now. // Both "openrouter" and the former "llamacpp" value are unknown now.
for label in &["openrouter", "llamacpp"] { for label in &["openrouter", "llamacpp"] {
let err = validate_cross_replay("local", label).unwrap_err(); let err = validate_cross_replay("local", label).unwrap_err();
assert!(format!("{}", err).contains("unknown backend"), "label={}", label); assert!(
format!("{}", err).contains("unknown backend"),
"label={}",
label
);
} }
} }

View File

@@ -11,9 +11,9 @@ use std::sync::{Arc, Mutex};
use crate::ai::apollo_client::{ApolloClient, ApolloPlace}; use crate::ai::apollo_client::{ApolloClient, ApolloPlace};
use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides}; use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides};
use crate::ai::llamacpp::LlamaCppClient;
use crate::ai::llm_client::LlmClient; use crate::ai::llm_client::LlmClient;
use crate::ai::ollama::{ChatMessage, OllamaClient, Tool}; use crate::ai::ollama::{ChatMessage, OllamaClient, Tool};
use crate::ai::llamacpp::LlamaCppClient;
use crate::ai::openrouter::OpenRouterClient; use crate::ai::openrouter::OpenRouterClient;
use crate::ai::sms_client::{SmsApiClient, SmsSearchHit, SmsSearchParams}; use crate::ai::sms_client::{SmsApiClient, SmsSearchHit, SmsSearchParams};
use crate::ai::user_display_name; use crate::ai::user_display_name;
@@ -35,7 +35,10 @@ pub(crate) fn parse_title_body(raw: &str) -> (String, String) {
let trimmed = raw.trim(); let trimmed = raw.trim();
// Try "Title: <title>\n\n<body>" or "Title: <title>\n<body>" // Try "Title: <title>\n\n<body>" or "Title: <title>\n<body>"
if let Some(rest) = trimmed.strip_prefix("Title:").or_else(|| trimmed.strip_prefix("title:")) { if let Some(rest) = trimmed
.strip_prefix("Title:")
.or_else(|| trimmed.strip_prefix("title:"))
{
let rest = rest.trim_start(); let rest = rest.trim_start();
if let Some(split_pos) = rest.find("\n\n").or_else(|| rest.find('\n')) { if let Some(split_pos) = rest.find("\n\n").or_else(|| rest.find('\n')) {
let title = rest[..split_pos].trim(); let title = rest[..split_pos].trim();
@@ -1644,7 +1647,10 @@ Return ONLY the summary, nothing else."#,
"get_location_history" => self.tool_get_location_history(arguments, cx).await, "get_location_history" => self.tool_get_location_history(arguments, cx).await,
"get_file_tags" => self.tool_get_file_tags(arguments, cx).await, "get_file_tags" => self.tool_get_file_tags(arguments, cx).await,
"get_faces_in_photo" => self.tool_get_faces_in_photo(arguments, cx).await, "get_faces_in_photo" => self.tool_get_faces_in_photo(arguments, cx).await,
"describe_photo" => self.tool_describe_photo(backend.local(), image_base64).await, "describe_photo" => {
self.tool_describe_photo(backend.local(), image_base64)
.await
}
"reverse_geocode" => self.tool_reverse_geocode(arguments).await, "reverse_geocode" => self.tool_reverse_geocode(arguments).await,
"get_personal_place_at" => self.tool_get_personal_place_at(arguments).await, "get_personal_place_at" => self.tool_get_personal_place_at(arguments).await,
"recall_entities" => self.tool_recall_entities(arguments, cx).await, "recall_entities" => self.tool_recall_entities(arguments, cx).await,
@@ -1655,7 +1661,13 @@ Return ONLY the summary, nothing else."#,
"store_entity" => self.tool_store_entity(arguments, cx).await, "store_entity" => self.tool_store_entity(arguments, cx).await,
"store_fact" => { "store_fact" => {
self.tool_store_fact( self.tool_store_fact(
arguments, file_path, user_id, persona_id, model, backend_label, cx, arguments,
file_path,
user_id,
persona_id,
model,
backend_label,
cx,
) )
.await .await
} }
@@ -1810,9 +1822,8 @@ Return ONLY the summary, nothing else."#,
); );
let started = std::time::Instant::now(); let started = std::time::Instant::now();
let system = Some( let system =
"You are a terse relevance ranker. You output only numbers separated by commas.", Some("You are a terse relevance ranker. You output only numbers separated by commas.");
);
let response = local.generate(&prompt, system, None).await?; let response = local.generate(&prompt, system, None).await?;
log::info!( log::info!(
"rerank: finished in {} ms (prompt={} chars)", "rerank: finished in {} ms (prompt={} chars)",
@@ -1960,7 +1971,8 @@ Return ONLY the summary, nothing else."#,
.unwrap_or(20) .unwrap_or(20)
.clamp(1, 50) as usize; .clamp(1, 50) as usize;
let contact_id = args.get("contact_id").and_then(|v| v.as_i64()); let contact_id = args.get("contact_id").and_then(|v| v.as_i64());
let contact = args.get("contact") let contact = args
.get("contact")
.and_then(|v| v.as_str()) .and_then(|v| v.as_str())
.map(|s| s.trim().to_string()) .map(|s| s.trim().to_string())
.filter(|s| !s.is_empty()) .filter(|s| !s.is_empty())
@@ -2710,13 +2722,8 @@ Return ONLY the summary, nothing else."#,
// Generate embedding for name + description (best-effort) via the // Generate embedding for name + description (best-effort) via the
// configured local backend. // configured local backend.
let embed_text = format!("{} {}", name, description); let embed_text = format!("{} {}", name, description);
let embedding: Option<Vec<u8>> = match crate::ai::embed_one( let embedding: Option<Vec<u8>> =
&self.ollama, match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &embed_text).await {
self.llamacpp.as_deref(),
&embed_text,
)
.await
{
Ok(vec) => { Ok(vec) => {
let bytes: Vec<u8> = vec.iter().flat_map(|f| f.to_le_bytes()).collect(); let bytes: Vec<u8> = vec.iter().flat_map(|f| f.to_le_bytes()).collect();
Some(bytes) Some(bytes)
@@ -3606,8 +3613,7 @@ Return ONLY the summary, nothing else."#,
kind: BackendKind, kind: BackendKind,
overrides: &SamplingOverrides, overrides: &SamplingOverrides,
) -> Result<ResolvedBackend> { ) -> Result<ResolvedBackend> {
let local_via_llamacpp = let local_via_llamacpp = crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
let is_hybrid = kind == BackendKind::Hybrid; let is_hybrid = kind == BackendKind::Hybrid;
// ── chat client ──────────────────────────────────────────────── // ── chat client ────────────────────────────────────────────────
@@ -3680,12 +3686,15 @@ Return ONLY the summary, nothing else."#,
}; };
// ── local client (utility calls: rerank, describe_image, etc.) ─ // ── local client (utility calls: rerank, describe_image, etc.) ─
// For llamacpp: mirror the chat model selection so rerank / // For llamacpp in local mode: mirror the chat model selection so
// describe_image hit the same model that's already loaded — // rerank / describe_image hit the same model that's already
// avoids a mid-turn model swap in llama-swap exclusive mode. // loaded — avoids a mid-turn model swap in llama-swap exclusive
// mode. In hybrid mode the override is an OpenRouter model id
// (e.g. "google/gemini-3-flash-preview") which llama-swap can't
// serve — keep the configured local slots.
let local: Box<dyn LlmClient> = if local_via_llamacpp { let local: Box<dyn LlmClient> = if local_via_llamacpp {
let mut lc = self.llamacpp.as_ref().unwrap().as_ref().clone(); let mut lc = self.llamacpp.as_ref().unwrap().as_ref().clone();
if let Some(ref m) = overrides.model { if !is_hybrid && let Some(ref m) = overrides.model {
lc.primary_model = m.clone(); lc.primary_model = m.clone();
lc.set_vision_model(m.clone()); lc.set_vision_model(m.clone());
} }
@@ -3713,8 +3722,8 @@ Return ONLY the summary, nothing else."#,
.await .await
.unwrap_or(false); .unwrap_or(false);
let available_on_fallback = let available_on_fallback = if let Some(ref fallback_url) = self.ollama.fallback_url
if let Some(ref fallback_url) = self.ollama.fallback_url { {
OllamaClient::is_model_available(fallback_url, model) OllamaClient::is_model_available(fallback_url, model)
.await .await
.unwrap_or(false) .unwrap_or(false)
@@ -3761,10 +3770,7 @@ Return ONLY the summary, nothing else."#,
}; };
if !capabilities.has_tool_calling { if !capabilities.has_tool_calling {
anyhow::bail!( anyhow::bail!("tool calling not supported by model '{}'", ollama_for_caps);
"tool calling not supported by model '{}'",
ollama_for_caps
);
} }
capabilities.has_vision capabilities.has_vision
@@ -3801,9 +3807,7 @@ Return ONLY the summary, nothing else."#,
span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64)); span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64));
// 1. Resolve backend + build clients. // 1. Resolve backend + build clients.
let kind = BackendKind::parse( let kind = BackendKind::parse(backend.as_deref().unwrap_or("local"))?;
backend.as_deref().unwrap_or("local"),
)?;
span.set_attribute(KeyValue::new("backend", kind.as_str())); span.set_attribute(KeyValue::new("backend", kind.as_str()));
let overrides = SamplingOverrides { let overrides = SamplingOverrides {
model: custom_model, model: custom_model,
@@ -3933,7 +3937,11 @@ Return ONLY the summary, nothing else."#,
Some(desc) Some(desc)
} }
Err(e) => { Err(e) => {
log::warn!("{}: vision describe failed, continuing without: {}", kind, e); log::warn!(
"{}: vision describe failed, continuing without: {}",
kind,
e
);
None None
} }
}, },
@@ -4813,4 +4821,42 @@ mod tests {
assert!(t.len() <= 60); assert!(t.len() <= 60);
assert_eq!(b, input); assert_eq!(b, input);
} }
/// Regression: hybrid mode was leaking the OpenRouter model override
/// into the local llamacpp client, causing describe_image to send
/// e.g. "google/gemini-3-flash-preview" to llama-swap (which 400s).
#[test]
fn resolve_backend_hybrid_does_not_leak_model_to_local_llamacpp() {
use crate::ai::llamacpp::LlamaCppClient;
let mut base =
LlamaCppClient::new(Some("http://localhost:9292/v1".into()), Some("chat".into()));
base.set_vision_model("vision".into());
base.set_embedding_model("embed".into());
let openrouter_model = "google/gemini-3-flash-preview";
let overrides_model: Option<String> = Some(openrouter_model.into());
let is_hybrid = true;
// Replicate the resolve_backend local-client construction
// (lines ~3686-3695 of this file).
let mut lc = base.clone();
if let Some(ref m) = overrides_model {
if !is_hybrid {
lc.primary_model = m.clone();
lc.set_vision_model(m.clone());
}
}
// In hybrid mode the local client must keep its configured slots.
assert_eq!(
lc.vision_model, "vision",
"hybrid mode must not override vision_model with OpenRouter model"
);
assert_eq!(
lc.primary_model, "chat",
"hybrid mode must not override primary_model with OpenRouter model"
);
assert_eq!(lc.embedding_model, "embed");
}
} }

View File

@@ -409,10 +409,7 @@ impl LlmClient for LlamaCppClient {
tools.len() tools.len()
); );
let mut body = serde_json::Map::new(); let mut body = serde_json::Map::new();
body.insert( body.insert("model".into(), Value::String(self.primary_model.clone()));
"model".into(),
Value::String(self.primary_model.clone()),
);
body.insert( body.insert(
"messages".into(), "messages".into(),
Value::Array(Self::messages_to_openai(&messages)), Value::Array(Self::messages_to_openai(&messages)),
@@ -1071,6 +1068,28 @@ mod tests {
assert_eq!(local.embedding_model, "embed"); assert_eq!(local.embedding_model, "embed");
} }
#[test]
fn hybrid_mode_local_client_preserves_vision_model() {
// In hybrid mode, overrides.model is an OpenRouter model id
// (e.g. "google/gemini-3-flash-preview"). The local llamacpp
// client must NOT adopt that as its vision_model — it should
// keep the configured LLAMA_SWAP_VISION_MODEL so describe_image
// hits the correct local slot instead of sending an unknown
// model name to llama-swap.
let mut base = LlamaCppClient::new(None, Some("chat".into()));
base.set_vision_model("vision".into());
base.set_embedding_model("embed".into());
// Simulate what resolve_backend SHOULD do for hybrid mode:
// clone but do NOT override primary_model / vision_model.
let local = base.clone();
// The local client keeps its configured slots.
assert_eq!(local.primary_model, "chat");
assert_eq!(local.vision_model, "vision");
assert_eq!(local.embedding_model, "embed");
}
#[test] #[test]
fn assistant_tool_calls_emit_null_content() { fn assistant_tool_calls_emit_null_content() {
let msg = ChatMessage { let msg = ChatMessage {
@@ -1086,7 +1105,10 @@ mod tests {
images: None, images: None,
}; };
let wire = LlamaCppClient::messages_to_openai(&[msg]); let wire = LlamaCppClient::messages_to_openai(&[msg]);
assert!(wire[0]["content"].is_null(), "empty content + tool_calls should emit null"); assert!(
wire[0]["content"].is_null(),
"empty content + tool_calls should emit null"
);
} }
#[test] #[test]

View File

@@ -25,11 +25,11 @@ pub use handlers::{
get_insight_handler, get_openrouter_models_handler, rate_insight_handler, get_insight_handler, get_openrouter_models_handler, rate_insight_handler,
}; };
pub use insight_generator::InsightGenerator; pub use insight_generator::InsightGenerator;
pub use llamacpp::LlamaCppClient;
#[allow(unused_imports)] #[allow(unused_imports)]
pub use llm_client::{ pub use llm_client::{
ChatMessage, LlmClient, ModelCapabilities, Tool, ToolCall, ToolCallFunction, ToolFunction, ChatMessage, LlmClient, ModelCapabilities, Tool, ToolCall, ToolCallFunction, ToolFunction,
}; };
pub use llamacpp::LlamaCppClient;
pub use ollama::{EMBEDDING_MODEL, OllamaClient}; pub use ollama::{EMBEDDING_MODEL, OllamaClient};
pub use sms_client::{SmsApiClient, SmsMessage}; pub use sms_client::{SmsApiClient, SmsMessage};