ai: add llamacpp backend (llama-swap) as third LLM client

Wires a new LlamaCppClient (OpenAI-compatible /v1 wire format) alongside
OllamaClient and OpenRouterClient. Per-slot routing for chat/vision/embed
via env (LLAMA_SWAP_URL + *_MODEL vars); capability inference uses an
env allowlist since /v1/models doesn't report modality.

InsightGenerator + InsightChatService gain three-way dispatch on
chat_backend = "local" | "hybrid" | "llamacpp". Hybrid and llamacpp
share the describe-then-inline path (a separate vision describe pass,
then text-only chat). HYBRID_VISION_BACKEND=llamacpp lets hybrid route its
describe pass through llama-swap's vision slot while chat still goes
to OpenRouter.

Cross-replay matrix added (validate_cross_replay): local<->llamacpp
and hybrid<->llamacpp allowed; local->hybrid and llamacpp->hybrid
rejected. New /insights/llamacpp/models handler mirrors the shape of its
OpenRouter counterpart.
This commit is contained in:
Cameron Cordes
2026-05-20 17:52:33 -04:00
parent d04b86e32c
commit f0927f5355
9 changed files with 1468 additions and 102 deletions

View File

@@ -12,6 +12,7 @@ use std::sync::{Arc, Mutex};
use crate::ai::apollo_client::{ApolloClient, ApolloPlace};
use crate::ai::llm_client::LlmClient;
use crate::ai::ollama::{ChatMessage, OllamaClient, Tool};
use crate::ai::llamacpp::LlamaCppClient;
use crate::ai::openrouter::OpenRouterClient;
use crate::ai::sms_client::{SmsApiClient, SmsSearchHit, SmsSearchParams};
use crate::ai::user_display_name;
@@ -68,6 +69,9 @@ pub struct InsightGenerator {
/// Optional OpenRouter client, used when `backend=hybrid` is requested.
/// `None` when `OPENROUTER_API_KEY` is not configured.
openrouter: Option<Arc<OpenRouterClient>>,
/// Optional llama-swap client, used when `backend=llamacpp` is requested.
/// `None` when `LLAMA_SWAP_URL` is not configured.
llamacpp: Option<Arc<LlamaCppClient>>,
sms_client: SmsApiClient,
/// Optional integration with Apollo's user-defined Places. When the
/// integration is disabled (`APOLLO_API_BASE_URL` unset), every
@@ -120,6 +124,7 @@ impl InsightGenerator {
pub fn new(
ollama: OllamaClient,
openrouter: Option<Arc<OpenRouterClient>>,
llamacpp: Option<Arc<LlamaCppClient>>,
sms_client: SmsApiClient,
apollo_client: ApolloClient,
insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
@@ -137,6 +142,7 @@ impl InsightGenerator {
Self {
ollama,
openrouter,
llamacpp,
sms_client,
apollo_client,
insight_dao,
@@ -3574,23 +3580,31 @@ Return ONLY the summary, nothing else."#,
.map(|s| s.trim().to_lowercase())
.filter(|s| !s.is_empty())
.unwrap_or_else(|| "local".to_string());
if !matches!(backend_label.as_str(), "local" | "hybrid") {
if !matches!(backend_label.as_str(), "local" | "hybrid" | "llamacpp") {
return Err(anyhow::anyhow!(
"unknown backend '{}'; expected 'local' or 'hybrid'",
"unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
backend_label
));
}
span.set_attribute(KeyValue::new("backend", backend_label.clone()));
let is_hybrid = backend_label == "hybrid";
let is_llamacpp = backend_label == "llamacpp";
// In hybrid + llamacpp modes the chat model never sees the image
// directly; we describe-then-inline locally before the agentic loop
// starts. Tracked as a single flag so vision/tool-gate logic doesn't
// have to branch twice.
let describes_then_inlines = is_hybrid || is_llamacpp;
// 1b. Always build an Ollama client. In local mode it owns the chat
// loop; in hybrid mode it still handles describe_image + any
// tool-local calls (e.g. if a future tool needs embeddings).
// Sampling overrides only apply in local mode — in hybrid the
// user's params belong to the OpenRouter chat client.
let apply_sampling_to_ollama = !is_hybrid;
// loop; in hybrid/llamacpp mode it still handles tool-local calls
// (e.g. future embedding-backed tools). The chat backend is
// selected separately below.
// Sampling overrides only apply in local mode — in
// hybrid/llamacpp the user's params belong to the alternate chat
// client.
let apply_sampling_to_ollama = !describes_then_inlines;
let mut ollama_client = if let Some(ref model) = custom_model
&& !is_hybrid
&& !describes_then_inlines
{
log::info!("Using custom model for agentic: {}", model);
span.set_attribute(KeyValue::new("custom_model", model.clone()));
@@ -3601,7 +3615,7 @@ Return ONLY the summary, nothing else."#,
Some(model.clone()),
)
} else {
if !is_hybrid {
if !describes_then_inlines {
span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone()));
}
self.ollama.clone()
@@ -3674,6 +3688,44 @@ Return ONLY the summary, nothing else."#,
None
};
// 1d. In llamacpp mode, clone the configured LlamaCpp client and
// apply per-request overrides. Same shape as the openrouter
// branch above; describe_image will route through the vision
// slot configured on the client.
let llamacpp_client: Option<LlamaCppClient> = if is_llamacpp {
let arc = self.llamacpp.as_ref().ok_or_else(|| {
anyhow::anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
})?;
let mut c: LlamaCppClient = (**arc).clone();
if let Some(ref m) = custom_model {
c.primary_model = m.clone();
span.set_attribute(KeyValue::new("custom_model", m.clone()));
}
span.set_attribute(KeyValue::new("llamacpp_model", c.primary_model.clone()));
if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() {
if let Some(t) = temperature {
span.set_attribute(KeyValue::new("temperature", t as f64));
}
if let Some(p) = top_p {
span.set_attribute(KeyValue::new("top_p", p as f64));
}
if let Some(k) = top_k {
span.set_attribute(KeyValue::new("top_k", k as i64));
}
if let Some(m) = min_p {
span.set_attribute(KeyValue::new("min_p", m as f64));
}
c.set_sampling_params(temperature, top_p, top_k, min_p);
}
if let Some(ctx) = num_ctx {
span.set_attribute(KeyValue::new("num_ctx", ctx as i64));
c.set_num_ctx(Some(ctx));
}
Some(c)
} else {
None
};
let insight_cx = current_cx.with_span(span);
// 2. Verify chat model supports tool calling.
@@ -3681,10 +3733,11 @@ Return ONLY the summary, nothing else."#,
// - hybrid: trust the operator's curated allowlist
// (OPENROUTER_ALLOWED_MODELS) — no live precheck. A bad model id
// surfaces as a chat-call error on the next step.
let has_vision = if is_hybrid {
// In hybrid mode the chat model never sees images directly — we
// describe-then-inject, so `has_vision` drives only whether we
// bother loading the image to describe it, which we always do.
let has_vision = if describes_then_inlines {
// In hybrid + llamacpp modes the chat model never sees images
// directly — we describe-then-inject, so `has_vision` drives only
// whether we bother loading the image to describe it, which we
// always do.
true
} else {
if let Some(ref model_name) = custom_model {
@@ -3864,24 +3917,61 @@ Return ONLY the summary, nothing else."#,
None
};
let hybrid_visual_description: Option<String> = if is_hybrid {
// describe-then-inline path. In hybrid mode the vision backend
// defaults to Ollama but can be flipped to llamacpp via
// `HYBRID_VISION_BACKEND=llamacpp` (so chat goes to OpenRouter while
// vision/audio routes through llama-swap). In llamacpp mode we always
// use the llamacpp client's configured vision slot.
let inlined_visual_description: Option<String> = if describes_then_inlines {
match image_base64.as_deref() {
Some(b64) => match self.ollama.describe_image(b64).await {
Ok(desc) => {
log::info!(
"Hybrid: local vision describe succeeded ({} chars)",
desc.len()
);
Some(desc)
Some(b64) => {
let use_llamacpp_vision = if is_llamacpp {
true
} else {
// is_hybrid branch — consult env switch
matches!(
std::env::var("HYBRID_VISION_BACKEND")
.ok()
.as_deref()
.map(|s| s.trim().to_lowercase())
.as_deref(),
Some("llamacpp")
)
};
let described = if use_llamacpp_vision {
match self.llamacpp.as_ref() {
Some(c) => c.describe_image(b64).await,
None => {
log::warn!(
"describe-then-inline: requested llamacpp vision but LLAMA_SWAP_URL is unset, falling back to Ollama"
);
self.ollama.describe_image(b64).await
}
}
} else {
self.ollama.describe_image(b64).await
};
match described {
Ok(desc) => {
log::info!(
"{}: vision describe succeeded ({} chars)",
backend_label,
desc.len()
);
Some(desc)
}
Err(e) => {
log::warn!(
"{}: vision describe failed, continuing without: {}",
backend_label,
e
);
None
}
}
Err(e) => {
log::warn!(
"Hybrid: local vision describe failed, continuing without: {}",
e
);
None
}
},
}
None => None,
}
} else {
@@ -3934,7 +4024,7 @@ Return ONLY the summary, nothing else."#,
.map(|c| format!("Contact/Person: {}", c))
.unwrap_or_else(|| "Contact/Person: unknown".to_string());
let visual_block = hybrid_visual_description
let visual_block = inlined_visual_description
.as_deref()
.map(|d| format!("Visual description (from local vision model):\n{}\n\n", d))
.unwrap_or_default();
@@ -3954,25 +4044,28 @@ Return ONLY the summary, nothing else."#,
);
// 10. Define tools. Gate flags computed from current data presence;
// hybrid mode omits describe_photo since the chat model receives
// the visual description inline (so we pass `false` for has_vision
// in hybrid mode regardless of the model's actual capability).
let gate_opts = self.current_gate_opts(has_vision && !is_hybrid);
// describe-then-inline modes (hybrid, llamacpp) omit describe_photo
// since the chat model receives the visual description inline (so
// we pass `false` for has_vision in those modes regardless of the
// model's actual capability).
let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines);
let tools = Self::build_tool_definitions(gate_opts);
// 11. Build initial messages. In hybrid mode images are never
// attached to the wire message — the description is part of
// `user_content`.
// 11. Build initial messages. In describe-then-inline modes images
// are never attached to the wire message — the description is part
// of `user_content`.
let system_msg = ChatMessage::system(system_content);
let mut user_msg = ChatMessage::user(user_content);
if !is_hybrid && let Some(ref img) = image_base64 {
if !describes_then_inlines && let Some(ref img) = image_base64 {
user_msg.images = Some(vec![img.clone()]);
}
let mut messages = vec![system_msg, user_msg];
// 12. Agentic loop — dispatch through the selected backend.
let chat_backend: &dyn LlmClient = if let Some(ref or_c) = openrouter_client {
let chat_backend: &dyn LlmClient = if let Some(ref lc_c) = llamacpp_client {
lc_c
} else if let Some(ref or_c) = openrouter_client {
or_c
} else {
&ollama_client