ai: add llamacpp backend (llama-swap) as third LLM client

Wires a new LlamaCppClient (OpenAI-compatible /v1 wire format) alongside
OllamaClient and OpenRouterClient. Per-slot routing for chat/vision/embed
via env (LLAMA_SWAP_URL + *_MODEL vars); capability inference uses an
env allowlist since /v1/models doesn't report modality.

InsightGenerator + InsightChatService gain three-way dispatch on
chat_backend = "local" | "hybrid" | "llamacpp". Hybrid and llamacpp
share the describe-then-inline path (text-only chat after a separate
vision describe). HYBRID_VISION_BACKEND=llamacpp lets hybrid route its
describe pass through llama-swap's vision slot while chat still goes
to OpenRouter.

Cross-replay matrix added (validate_cross_replay): local<->llamacpp
and hybrid->llamacpp allowed; local->hybrid and llamacpp->hybrid
rejected. New /insights/llamacpp/models handler mirrors the OpenRouter
shape.
This commit is contained in:
Cameron Cordes
2026-05-20 17:52:33 -04:00
parent d04b86e32c
commit f0927f5355
9 changed files with 1468 additions and 102 deletions
+206 -60
View File
@@ -9,6 +9,7 @@ use tokio::sync::Mutex as TokioMutex;
use crate::ai::insight_generator::InsightGenerator;
use crate::ai::llm_client::{ChatMessage, LlmClient, LlmStreamEvent, Tool};
use crate::ai::ollama::OllamaClient;
use crate::ai::llamacpp::LlamaCppClient;
use crate::ai::openrouter::OpenRouterClient;
use crate::database::InsightDao;
use crate::database::models::InsertPhotoInsight;
@@ -93,6 +94,7 @@ pub struct InsightChatService {
generator: Arc<InsightGenerator>,
ollama: OllamaClient,
openrouter: Option<Arc<OpenRouterClient>>,
llamacpp: Option<Arc<LlamaCppClient>>,
insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
chat_locks: ChatLockMap,
}
@@ -102,6 +104,7 @@ impl InsightChatService {
generator: Arc<InsightGenerator>,
ollama: OllamaClient,
openrouter: Option<Arc<OpenRouterClient>>,
llamacpp: Option<Arc<LlamaCppClient>>,
insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
chat_locks: ChatLockMap,
) -> Self {
@@ -109,6 +112,7 @@ impl InsightChatService {
generator,
ollama,
openrouter,
llamacpp,
insight_dao,
chat_locks,
}
@@ -303,23 +307,15 @@ impl InsightChatService {
.map(|s| s.trim().to_lowercase())
.filter(|s| !s.is_empty())
.unwrap_or_else(|| stored_backend.clone());
if !matches!(effective_backend.as_str(), "local" | "hybrid") {
bail!(
"unknown backend '{}'; expected 'local' or 'hybrid'",
effective_backend
);
}
if stored_backend == "local" && effective_backend == "hybrid" {
bail!(
"switching from local to hybrid mid-chat isn't supported yet; \
regenerate the insight in hybrid mode if you want OpenRouter chat"
);
}
validate_cross_replay(&stored_backend, &effective_backend)?;
let is_hybrid = effective_backend == "hybrid";
let is_llamacpp = effective_backend == "llamacpp";
let describes_then_inlines = is_hybrid || is_llamacpp;
span.set_attribute(KeyValue::new("backend", effective_backend.clone()));
// 4. Build the chat backend client. Ollama in local mode, a freshly
// cloned OpenRouter client in hybrid mode (clone so per-request
// cloned OpenRouter client in hybrid mode, a freshly cloned
// LlamaCppClient in llamacpp mode (clone so per-request
// sampling/model overrides don't leak into shared state).
let max_iterations = req
.max_iterations
@@ -336,6 +332,7 @@ impl InsightChatService {
let mut ollama_client = self.ollama.clone();
let mut openrouter_client: Option<OpenRouterClient> = None;
let mut llamacpp_client: Option<LlamaCppClient> = None;
if is_hybrid {
let arc = self.openrouter.as_ref().ok_or_else(|| {
@@ -356,6 +353,25 @@ impl InsightChatService {
c.set_num_ctx(Some(ctx));
}
openrouter_client = Some(c);
} else if is_llamacpp {
let arc = self.llamacpp.as_ref().ok_or_else(|| {
anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
})?;
let mut c: LlamaCppClient = (**arc).clone();
if let Some(ref m) = custom_model {
c.primary_model = m.clone();
}
if req.temperature.is_some()
|| req.top_p.is_some()
|| req.top_k.is_some()
|| req.min_p.is_some()
{
c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
}
if let Some(ctx) = req.num_ctx {
c.set_num_ctx(Some(ctx));
}
llamacpp_client = Some(c);
} else {
// Local-mode model swap. Build a new client when the chat model
// differs from the configured one (mirrors the agentic pattern).
@@ -381,7 +397,9 @@ impl InsightChatService {
}
}
let chat_backend: &dyn LlmClient = if let Some(ref c) = openrouter_client {
let chat_backend: &dyn LlmClient = if let Some(ref c) = llamacpp_client {
c
} else if let Some(ref c) = openrouter_client {
c
} else {
&ollama_client
@@ -389,18 +407,19 @@ impl InsightChatService {
let model_used = chat_backend.primary_model().to_string();
span.set_attribute(KeyValue::new("model", model_used.clone()));
// 5. Decide vision + tool set. In hybrid we always omit
// `describe_photo` (matches the original generation flow). In
// local we trust the stored history's first-user shape: if it
// carries `images`, the original model was vision-capable, and
// we keep `describe_photo` available.
// 5. Decide vision + tool set. In describe-then-inline modes
// (hybrid, llamacpp) we always omit `describe_photo` (matches the
// original generation flow). In local we trust the stored
// history's first-user shape: if it carries `images`, the
// original model was vision-capable, and we keep `describe_photo`
// available.
let local_first_user_has_image = messages
.iter()
.find(|m| m.role == "user")
.and_then(|m| m.images.as_ref())
.map(|imgs| !imgs.is_empty())
.unwrap_or(false);
let offer_describe_tool = !is_hybrid && local_first_user_has_image;
let offer_describe_tool = !describes_then_inlines && local_first_user_has_image;
// current_gate_opts(has_vision) sets gate_opts.has_vision = has_vision
// and probes the per-table presence flags. Pass `offer_describe_tool`
// directly — the `!is_hybrid && local_first_user_has_image` decision
@@ -799,19 +818,10 @@ impl InsightChatService {
.map(|s| s.trim().to_lowercase())
.filter(|s| !s.is_empty())
.unwrap_or_else(|| stored_backend.clone());
if !matches!(effective_backend.as_str(), "local" | "hybrid") {
bail!(
"unknown backend '{}'; expected 'local' or 'hybrid'",
effective_backend
);
}
if stored_backend == "local" && effective_backend == "hybrid" {
bail!(
"switching from local to hybrid mid-chat isn't supported yet; \
regenerate the insight in hybrid mode if you want OpenRouter chat"
);
}
validate_cross_replay(&stored_backend, &effective_backend)?;
let is_hybrid = effective_backend == "hybrid";
let is_llamacpp = effective_backend == "llamacpp";
let describes_then_inlines = is_hybrid || is_llamacpp;
let max_iterations = req
.max_iterations
@@ -826,20 +836,21 @@ impl InsightChatService {
.filter(|m| !m.is_empty());
let (chat_backend_holder, ollama_client) =
self.build_chat_clients(is_hybrid, custom_model.as_deref(), &req)?;
self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?;
let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
let model_used = chat_backend.primary_model().to_string();
// Tool set — local mode + first user turn carries an image →
// offer describe_photo. Hybrid: visual description was inlined
// when the insight was bootstrapped, no describe tool needed.
// offer describe_photo. Describe-then-inline modes (hybrid /
// llamacpp): visual description was inlined when the insight was
// bootstrapped, no describe tool needed.
let local_first_user_has_image = messages
.iter()
.find(|m| m.role == "user")
.and_then(|m| m.images.as_ref())
.map(|imgs| !imgs.is_empty())
.unwrap_or(false);
let offer_describe_tool = !is_hybrid && local_first_user_has_image;
let offer_describe_tool = !describes_then_inlines && local_first_user_has_image;
let gate_opts = self.generator.current_gate_opts_for_persona(
offer_describe_tool,
Some((req.user_id, &active_persona)),
@@ -976,6 +987,8 @@ impl InsightChatService {
.unwrap_or_else(|| "default".to_string());
let effective_backend = resolve_bootstrap_backend(req.backend.as_deref())?;
let is_hybrid = effective_backend == "hybrid";
let is_llamacpp = effective_backend == "llamacpp";
let describes_then_inlines = is_hybrid || is_llamacpp;
let max_iterations = req
.max_iterations
@@ -984,7 +997,7 @@ impl InsightChatService {
let custom_model = req.model.clone().filter(|m| !m.is_empty());
let (chat_backend_holder, ollama_client) =
self.build_chat_clients(is_hybrid, custom_model.as_deref(), &req)?;
self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?;
let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
let model_used = chat_backend.primary_model().to_string();
@@ -1007,21 +1020,48 @@ impl InsightChatService {
_ => None,
});
// Hybrid backend: pre-describe the image via local Ollama vision
// so OpenRouter chat models (which can't see images directly) get
// the visual description as text. Mirrors the same pre-describe
// pass that `generate_agentic_insight_for_photo` does for hybrid.
let visual_block = if is_hybrid {
// Describe-then-inline backends (hybrid, llamacpp): pre-describe the
// image so a text-only chat model gets the visual description inline.
// Vision source: llamacpp's vision slot in llamacpp mode; in hybrid
// mode Ollama by default, llamacpp via `HYBRID_VISION_BACKEND=llamacpp`.
let visual_block = if describes_then_inlines {
match image_base64.as_deref() {
Some(b64) => match self.ollama.describe_image(b64).await {
Ok(desc) => {
format!("Visual description (from local vision model):\n{}\n", desc)
Some(b64) => {
let use_llamacpp_vision = if is_llamacpp {
true
} else {
matches!(
std::env::var("HYBRID_VISION_BACKEND")
.ok()
.as_deref()
.map(|s| s.trim().to_lowercase())
.as_deref(),
Some("llamacpp")
)
};
let described = if use_llamacpp_vision {
match self.llamacpp.as_ref() {
Some(c) => c.describe_image(b64).await,
None => {
log::warn!(
"bootstrap: requested llamacpp vision but LLAMA_SWAP_URL unset; falling back to Ollama"
);
self.ollama.describe_image(b64).await
}
}
} else {
self.ollama.describe_image(b64).await
};
match described {
Ok(desc) => {
format!("Visual description (from local vision model):\n{}\n", desc)
}
Err(e) => {
log::warn!("{} bootstrap: describe_image failed: {}", effective_backend, e);
String::new()
}
}
Err(e) => {
log::warn!("hybrid bootstrap: local describe_image failed: {}", e);
String::new()
}
},
}
None => String::new(),
}
} else {
@@ -1031,7 +1071,7 @@ impl InsightChatService {
// Tool gates. Local + image present → expose describe_photo so
// the chat model can re-look at the photo on demand. Hybrid:
// already inlined, no tool needed.
let offer_describe_tool = !is_hybrid && image_base64.is_some();
let offer_describe_tool = !describes_then_inlines && image_base64.is_some();
let gate_opts = self.generator.current_gate_opts_for_persona(
offer_describe_tool,
Some((req.user_id, &active_persona)),
@@ -1057,7 +1097,7 @@ impl InsightChatService {
);
let system_msg = ChatMessage::system(system_content);
let mut user_msg = ChatMessage::user(req.user_message.clone());
if !is_hybrid && let Some(ref img) = image_base64 {
if !describes_then_inlines && let Some(ref img) = image_base64 {
user_msg.images = Some(vec![img.clone()]);
}
let mut messages = vec![system_msg, user_msg];
@@ -1130,19 +1170,22 @@ impl InsightChatService {
Ok(())
}
/// Set up chat clients (Ollama + optional OpenRouter) shared by
/// bootstrap and continuation. Returns the chat-side backend client
/// (boxed because hybrid and local return different concrete types)
/// and the Ollama client used for describe-image / local tool calls.
/// Set up chat clients (Ollama + optional OpenRouter / LlamaCpp) shared
/// by bootstrap and continuation. Returns the chat-side backend client
/// (boxed because each backend has a different concrete type) and the
/// Ollama client used for describe-image / local tool calls.
///
/// `effective_backend` must be one of `"local"`, `"hybrid"`, `"llamacpp"`
/// (validated upstream).
fn build_chat_clients(
&self,
is_hybrid: bool,
effective_backend: &str,
custom_model: Option<&str>,
req: &ChatTurnRequest,
) -> Result<(Box<dyn LlmClient>, OllamaClient)> {
let mut ollama_client = self.ollama.clone();
if is_hybrid {
if effective_backend == "hybrid" {
let arc = self.openrouter.as_ref().ok_or_else(|| {
anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
})?;
@@ -1163,6 +1206,27 @@ impl InsightChatService {
return Ok((Box::new(c), ollama_client));
}
if effective_backend == "llamacpp" {
let arc = self.llamacpp.as_ref().ok_or_else(|| {
anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
})?;
let mut c: LlamaCppClient = (**arc).clone();
if let Some(m) = custom_model {
c.primary_model = m.to_string();
}
if req.temperature.is_some()
|| req.top_p.is_some()
|| req.top_k.is_some()
|| req.min_p.is_some()
{
c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
}
if let Some(ctx) = req.num_ctx {
c.set_num_ctx(Some(ctx));
}
return Ok((Box::new(c), ollama_client));
}
if let Some(m) = custom_model
&& m != self.ollama.primary_model
{
@@ -1459,6 +1523,49 @@ fn resolve_date_taken_for_context(
.map(|dt| dt.format("%Y-%m-%d").to_string())
}
/// Decide whether a chat continuation may move from the backend that produced
/// the stored transcript (`stored`) to the backend requested for this turn
/// (`effective`).
///
/// Rejected cases, checked in this order, each with its own error message:
///
/// - `effective` is anything other than `"local"`, `"hybrid"` or
///   `"llamacpp"` — reported as an unknown backend.
/// - `local → hybrid` — the stored transcript has images embedded in the
///   first user message, and that shape is not relied upon across the
///   providers reached through the hybrid path. Regenerating the insight in
///   hybrid mode remains the supported answer.
/// - `llamacpp → hybrid` — the stored transcript already carries an inlined
///   visual description produced by llama-swap's vision slot; switching to
///   hybrid mid-conversation would mix description sources across turns.
///
/// Every other pair is accepted, including same-backend replays,
/// `local ↔ llamacpp` (the user is responsible for picking a vision-capable
/// chat model when the transcript has images), `hybrid → llamacpp` and
/// `hybrid → local`. Note that `stored` itself is never checked against the
/// list of known backends; only `effective` is.
fn validate_cross_replay(stored: &str, effective: &str) -> Result<()> {
    match (stored, effective) {
        // Unknown target backend wins over any transition rule.
        (_, requested) if !matches!(requested, "local" | "hybrid" | "llamacpp") => bail!(
            "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
            requested
        ),
        ("local", "hybrid") => bail!(
            "switching from local to hybrid mid-chat isn't supported yet; \
regenerate the insight in hybrid mode if you want OpenRouter chat"
        ),
        ("llamacpp", "hybrid") => bail!(
            "switching from llamacpp to hybrid mid-chat isn't supported yet; \
regenerate the insight in hybrid mode if you want OpenRouter chat"
        ),
        _ => Ok(()),
    }
}
/// Pick the backend label for bootstrap. Bootstrap has no stored insight
/// to defer to (that's continuation's behaviour), so the default is
/// `"local"`. Returns an error if the supplied label is non-empty but
@@ -1469,8 +1576,11 @@ fn resolve_bootstrap_backend(supplied: Option<&str>) -> Result<String> {
.map(|s| s.trim().to_lowercase())
.filter(|s| !s.is_empty())
.unwrap_or_else(|| "local".to_string());
if !matches!(lower.as_str(), "local" | "hybrid") {
bail!("unknown backend '{}'; expected 'local' or 'hybrid'", lower);
if !matches!(lower.as_str(), "local" | "hybrid" | "llamacpp") {
bail!(
"unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
lower
);
}
Ok(lower)
}
@@ -2074,6 +2184,10 @@ mod tests {
fn bootstrap_backend_accepts_local_and_hybrid_case_insensitively() {
assert_eq!(resolve_bootstrap_backend(Some("LOCAL")).unwrap(), "local");
assert_eq!(resolve_bootstrap_backend(Some("Hybrid")).unwrap(), "hybrid");
assert_eq!(
resolve_bootstrap_backend(Some("Llamacpp")).unwrap(),
"llamacpp"
);
assert_eq!(
resolve_bootstrap_backend(Some(" local ")).unwrap(),
"local"
@@ -2088,6 +2202,38 @@ mod tests {
assert!(msg.contains("openrouter"));
}
#[test]
fn cross_replay_rejects_local_to_hybrid() {
    // A locally generated transcript must not be continued on hybrid.
    let message = validate_cross_replay("local", "hybrid")
        .unwrap_err()
        .to_string();
    assert!(message.contains("local to hybrid"));
}
#[test]
fn cross_replay_rejects_llamacpp_to_hybrid() {
    // A llamacpp-generated transcript must not be continued on hybrid.
    let message = validate_cross_replay("llamacpp", "hybrid")
        .unwrap_err()
        .to_string();
    assert!(message.contains("llamacpp to hybrid"));
}
#[test]
fn cross_replay_allows_local_llamacpp_and_hybrid_llamacpp_transitions() {
    // (stored, effective) pairs that must pass validation.
    let permitted = [
        // Local ↔ llamacpp: user is responsible for picking a vision-capable
        // chat slot when the transcript has images.
        ("local", "llamacpp"),
        ("llamacpp", "local"),
        // Hybrid → llamacpp: the stored transcript is text-only.
        ("hybrid", "llamacpp"),
        // Same-backend replays are always fine.
        ("local", "local"),
        ("hybrid", "hybrid"),
        ("llamacpp", "llamacpp"),
    ];
    for (stored, effective) in permitted {
        assert!(
            validate_cross_replay(stored, effective).is_ok(),
            "{stored} -> {effective} should be allowed"
        );
    }
}
#[test]
fn cross_replay_rejects_unknown_effective() {
    // "openrouter" is not a backend label; only local/hybrid/llamacpp are.
    let message = validate_cross_replay("local", "openrouter")
        .unwrap_err()
        .to_string();
    assert!(message.contains("unknown backend"));
}
#[test]
fn bootstrap_system_message_includes_path_and_persona() {
let out = build_bootstrap_system_message("you are helpful", "pics/IMG.jpg", None, None, "");