ai: add llamacpp backend (llama-swap) as third LLM client
Wires a new LlamaCppClient (OpenAI-compatible /v1 wire format) alongside OllamaClient and OpenRouterClient. Per-slot routing for chat/vision/embed via env (LLAMA_SWAP_URL + *_MODEL vars); capability inference uses an env allowlist since /v1/models doesn't report modality. InsightGenerator + InsightChatService gain three-way dispatch on chat_backend = "local" | "hybrid" | "llamacpp". Hybrid and llamacpp share the describe-then-inline path (text-only chat after a separate vision describe). HYBRID_VISION_BACKEND=llamacpp lets hybrid route its describe pass through llama-swap's vision slot while chat still goes to OpenRouter. Cross-replay matrix added (validate_cross_replay): local<->llamacpp and hybrid<->llamacpp allowed; local->hybrid and llamacpp->hybrid rejected. New /insights/llamacpp/models handler mirrors the OpenRouter shape.
This commit is contained in:
@@ -12,6 +12,7 @@ use std::sync::{Arc, Mutex};
|
||||
use crate::ai::apollo_client::{ApolloClient, ApolloPlace};
|
||||
use crate::ai::llm_client::LlmClient;
|
||||
use crate::ai::ollama::{ChatMessage, OllamaClient, Tool};
|
||||
use crate::ai::llamacpp::LlamaCppClient;
|
||||
use crate::ai::openrouter::OpenRouterClient;
|
||||
use crate::ai::sms_client::{SmsApiClient, SmsSearchHit, SmsSearchParams};
|
||||
use crate::ai::user_display_name;
|
||||
@@ -68,6 +69,9 @@ pub struct InsightGenerator {
|
||||
/// Optional OpenRouter client, used when `backend=hybrid` is requested.
|
||||
/// `None` when `OPENROUTER_API_KEY` is not configured.
|
||||
openrouter: Option<Arc<OpenRouterClient>>,
|
||||
/// Optional llama-swap client, used when `backend=llamacpp` is requested, and for the hybrid describe pass when `HYBRID_VISION_BACKEND=llamacpp`.
|
||||
/// `None` when `LLAMA_SWAP_URL` is not configured.
|
||||
llamacpp: Option<Arc<LlamaCppClient>>,
|
||||
sms_client: SmsApiClient,
|
||||
/// Optional integration with Apollo's user-defined Places. When the
|
||||
/// integration is disabled (`APOLLO_API_BASE_URL` unset), every
|
||||
@@ -120,6 +124,7 @@ impl InsightGenerator {
|
||||
pub fn new(
|
||||
ollama: OllamaClient,
|
||||
openrouter: Option<Arc<OpenRouterClient>>,
|
||||
llamacpp: Option<Arc<LlamaCppClient>>,
|
||||
sms_client: SmsApiClient,
|
||||
apollo_client: ApolloClient,
|
||||
insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
|
||||
@@ -137,6 +142,7 @@ impl InsightGenerator {
|
||||
Self {
|
||||
ollama,
|
||||
openrouter,
|
||||
llamacpp,
|
||||
sms_client,
|
||||
apollo_client,
|
||||
insight_dao,
|
||||
@@ -3574,23 +3580,31 @@ Return ONLY the summary, nothing else."#,
|
||||
.map(|s| s.trim().to_lowercase())
|
||||
.filter(|s| !s.is_empty())
|
||||
.unwrap_or_else(|| "local".to_string());
|
||||
if !matches!(backend_label.as_str(), "local" | "hybrid") {
|
||||
if !matches!(backend_label.as_str(), "local" | "hybrid" | "llamacpp") {
|
||||
return Err(anyhow::anyhow!(
|
||||
"unknown backend '{}'; expected 'local' or 'hybrid'",
|
||||
"unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
|
||||
backend_label
|
||||
));
|
||||
}
|
||||
span.set_attribute(KeyValue::new("backend", backend_label.clone()));
|
||||
let is_hybrid = backend_label == "hybrid";
|
||||
let is_llamacpp = backend_label == "llamacpp";
|
||||
// In hybrid + llamacpp modes the chat model never sees the image
|
||||
// directly; we describe-then-inline locally before the agentic loop
|
||||
// starts. Tracked as a single flag so vision/tool-gate logic doesn't
|
||||
// have to branch twice.
|
||||
let describes_then_inlines = is_hybrid || is_llamacpp;
|
||||
|
||||
// 1b. Always build an Ollama client. In local mode it owns the chat
|
||||
// loop; in hybrid mode it still handles describe_image + any
|
||||
// tool-local calls (e.g. if a future tool needs embeddings).
|
||||
// Sampling overrides only apply in local mode — in hybrid the
|
||||
// user's params belong to the OpenRouter chat client.
|
||||
let apply_sampling_to_ollama = !is_hybrid;
|
||||
// loop; in hybrid/llamacpp mode it still handles tool-local calls
|
||||
// (e.g. future embedding-backed tools). The chat backend is
|
||||
// selected separately below.
|
||||
// Sampling overrides only apply in local mode — in
|
||||
// hybrid/llamacpp the user's params belong to the alternate chat
|
||||
// client.
|
||||
let apply_sampling_to_ollama = !describes_then_inlines;
|
||||
let mut ollama_client = if let Some(ref model) = custom_model
|
||||
&& !is_hybrid
|
||||
&& !describes_then_inlines
|
||||
{
|
||||
log::info!("Using custom model for agentic: {}", model);
|
||||
span.set_attribute(KeyValue::new("custom_model", model.clone()));
|
||||
@@ -3601,7 +3615,7 @@ Return ONLY the summary, nothing else."#,
|
||||
Some(model.clone()),
|
||||
)
|
||||
} else {
|
||||
if !is_hybrid {
|
||||
if !describes_then_inlines {
|
||||
span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone()));
|
||||
}
|
||||
self.ollama.clone()
|
||||
@@ -3674,6 +3688,44 @@ Return ONLY the summary, nothing else."#,
|
||||
None
|
||||
};
|
||||
|
||||
// 1d. In llamacpp mode, clone the configured LlamaCpp client and
|
||||
// apply per-request overrides. Same shape as the openrouter
|
||||
// branch above; describe_image later routes through the vision slot
|
||||
// of the shared `self.llamacpp` client, not this per-request clone.
|
||||
let llamacpp_client: Option<LlamaCppClient> = if is_llamacpp {
|
||||
let arc = self.llamacpp.as_ref().ok_or_else(|| {
|
||||
anyhow::anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
|
||||
})?;
|
||||
let mut c: LlamaCppClient = (**arc).clone();
|
||||
if let Some(ref m) = custom_model {
|
||||
c.primary_model = m.clone();
|
||||
span.set_attribute(KeyValue::new("custom_model", m.clone()));
|
||||
}
|
||||
span.set_attribute(KeyValue::new("llamacpp_model", c.primary_model.clone()));
|
||||
if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() {
|
||||
if let Some(t) = temperature {
|
||||
span.set_attribute(KeyValue::new("temperature", t as f64));
|
||||
}
|
||||
if let Some(p) = top_p {
|
||||
span.set_attribute(KeyValue::new("top_p", p as f64));
|
||||
}
|
||||
if let Some(k) = top_k {
|
||||
span.set_attribute(KeyValue::new("top_k", k as i64));
|
||||
}
|
||||
if let Some(m) = min_p {
|
||||
span.set_attribute(KeyValue::new("min_p", m as f64));
|
||||
}
|
||||
c.set_sampling_params(temperature, top_p, top_k, min_p);
|
||||
}
|
||||
if let Some(ctx) = num_ctx {
|
||||
span.set_attribute(KeyValue::new("num_ctx", ctx as i64));
|
||||
c.set_num_ctx(Some(ctx));
|
||||
}
|
||||
Some(c)
|
||||
} else {
|
||||
None
|
||||
};
|
||||
|
||||
let insight_cx = current_cx.with_span(span);
|
||||
|
||||
// 2. Verify chat model supports tool calling.
|
||||
@@ -3681,10 +3733,11 @@ Return ONLY the summary, nothing else."#,
|
||||
// - hybrid: trust the operator's curated allowlist
|
||||
// (OPENROUTER_ALLOWED_MODELS) — no live precheck. A bad model id
|
||||
// surfaces as a chat-call error on the next step.
|
||||
let has_vision = if is_hybrid {
|
||||
// In hybrid mode the chat model never sees images directly — we
|
||||
// describe-then-inject, so `has_vision` drives only whether we
|
||||
// bother loading the image to describe it, which we always do.
|
||||
let has_vision = if describes_then_inlines {
|
||||
// In hybrid + llamacpp modes the chat model never sees images
|
||||
// directly — we describe-then-inline, so `has_vision` drives only
|
||||
// whether we bother loading the image to describe it, which we
|
||||
// always do.
|
||||
true
|
||||
} else {
|
||||
if let Some(ref model_name) = custom_model {
|
||||
@@ -3864,24 +3917,61 @@ Return ONLY the summary, nothing else."#,
|
||||
None
|
||||
};
|
||||
|
||||
let hybrid_visual_description: Option<String> = if is_hybrid {
|
||||
// describe-then-inline path. In hybrid mode the vision backend
|
||||
// defaults to Ollama but can be flipped to llamacpp via
|
||||
// `HYBRID_VISION_BACKEND=llamacpp` (so chat goes to OpenRouter while
|
||||
// vision/audio routes through llama-swap). In llamacpp mode we always
|
||||
// use the llamacpp client's configured vision slot.
|
||||
let inlined_visual_description: Option<String> = if describes_then_inlines {
|
||||
match image_base64.as_deref() {
|
||||
Some(b64) => match self.ollama.describe_image(b64).await {
|
||||
Ok(desc) => {
|
||||
log::info!(
|
||||
"Hybrid: local vision describe succeeded ({} chars)",
|
||||
desc.len()
|
||||
);
|
||||
Some(desc)
|
||||
Some(b64) => {
|
||||
let use_llamacpp_vision = if is_llamacpp {
|
||||
true
|
||||
} else {
|
||||
// is_hybrid branch — consult env switch
|
||||
matches!(
|
||||
std::env::var("HYBRID_VISION_BACKEND")
|
||||
.ok()
|
||||
.as_deref()
|
||||
.map(|s| s.trim().to_lowercase())
|
||||
.as_deref(),
|
||||
Some("llamacpp")
|
||||
)
|
||||
};
|
||||
|
||||
let described = if use_llamacpp_vision {
|
||||
match self.llamacpp.as_ref() {
|
||||
Some(c) => c.describe_image(b64).await,
|
||||
None => {
|
||||
log::warn!(
|
||||
"describe-then-inline: requested llamacpp vision but LLAMA_SWAP_URL is unset, falling back to Ollama"
|
||||
);
|
||||
self.ollama.describe_image(b64).await
|
||||
}
|
||||
}
|
||||
} else {
|
||||
self.ollama.describe_image(b64).await
|
||||
};
|
||||
|
||||
match described {
|
||||
Ok(desc) => {
|
||||
log::info!(
|
||||
"{}: vision describe succeeded ({} chars)",
|
||||
backend_label,
|
||||
desc.len()
|
||||
);
|
||||
Some(desc)
|
||||
}
|
||||
Err(e) => {
|
||||
log::warn!(
|
||||
"{}: vision describe failed, continuing without: {}",
|
||||
backend_label,
|
||||
e
|
||||
);
|
||||
None
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
log::warn!(
|
||||
"Hybrid: local vision describe failed, continuing without: {}",
|
||||
e
|
||||
);
|
||||
None
|
||||
}
|
||||
},
|
||||
}
|
||||
None => None,
|
||||
}
|
||||
} else {
|
||||
@@ -3934,7 +4024,7 @@ Return ONLY the summary, nothing else."#,
|
||||
.map(|c| format!("Contact/Person: {}", c))
|
||||
.unwrap_or_else(|| "Contact/Person: unknown".to_string());
|
||||
|
||||
let visual_block = hybrid_visual_description
|
||||
let visual_block = inlined_visual_description
|
||||
.as_deref()
|
||||
.map(|d| format!("Visual description (from local vision model):\n{}\n\n", d))
|
||||
.unwrap_or_default();
|
||||
@@ -3954,25 +4044,28 @@ Return ONLY the summary, nothing else."#,
|
||||
);
|
||||
|
||||
// 10. Define tools. Gate flags computed from current data presence;
|
||||
// hybrid mode omits describe_photo since the chat model receives
|
||||
// the visual description inline (so we pass `false` for has_vision
|
||||
// in hybrid mode regardless of the model's actual capability).
|
||||
let gate_opts = self.current_gate_opts(has_vision && !is_hybrid);
|
||||
// describe-then-inline modes (hybrid, llamacpp) omit describe_photo
|
||||
// since the chat model receives the visual description inline (so
|
||||
// we pass `false` for has_vision in those modes regardless of the
|
||||
// model's actual capability).
|
||||
let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines);
|
||||
let tools = Self::build_tool_definitions(gate_opts);
|
||||
|
||||
// 11. Build initial messages. In hybrid mode images are never
|
||||
// attached to the wire message — the description is part of
|
||||
// `user_content`.
|
||||
// 11. Build initial messages. In describe-then-inline modes images
|
||||
// are never attached to the wire message — the description is part
|
||||
// of `user_content`.
|
||||
let system_msg = ChatMessage::system(system_content);
|
||||
let mut user_msg = ChatMessage::user(user_content);
|
||||
if !is_hybrid && let Some(ref img) = image_base64 {
|
||||
if !describes_then_inlines && let Some(ref img) = image_base64 {
|
||||
user_msg.images = Some(vec![img.clone()]);
|
||||
}
|
||||
|
||||
let mut messages = vec![system_msg, user_msg];
|
||||
|
||||
// 12. Agentic loop — dispatch through the selected backend.
|
||||
let chat_backend: &dyn LlmClient = if let Some(ref or_c) = openrouter_client {
|
||||
let chat_backend: &dyn LlmClient = if let Some(ref lc_c) = llamacpp_client {
|
||||
lc_c
|
||||
} else if let Some(ref or_c) = openrouter_client {
|
||||
or_c
|
||||
} else {
|
||||
&ollama_client
|
||||
|
||||
Reference in New Issue
Block a user