ai: add llamacpp backend (llama-swap) as third LLM client

Wires a new LlamaCppClient (OpenAI-compatible /v1 wire format) alongside
OllamaClient and OpenRouterClient. Per-slot routing for chat/vision/embed
via env (LLAMA_SWAP_URL + *_MODEL vars); capability inference uses an
env allowlist since /v1/models doesn't report modality.

InsightGenerator + InsightChatService gain three-way dispatch on
chat_backend = "local" | "hybrid" | "llamacpp". Hybrid and llamacpp
share the describe-then-inline path (text-only chat after a separate
vision describe). HYBRID_VISION_BACKEND=llamacpp lets hybrid route its
describe pass through llama-swap's vision slot while chat still goes
to OpenRouter.

Cross-replay matrix added (validate_cross_replay): local<->llamacpp
and hybrid<->llamacpp allowed; local->hybrid and llamacpp->hybrid
rejected. New /insights/llamacpp/models handler mirrors the OpenRouter
shape.
This commit is contained in:
Cameron Cordes
2026-05-20 17:52:33 -04:00
parent d04b86e32c
commit f0927f5355
9 changed files with 1468 additions and 102 deletions

View File

@@ -2,6 +2,7 @@ use crate::ai::apollo_client::ApolloClient;
use crate::ai::clip_client::ClipClient;
use crate::ai::face_client::FaceClient;
use crate::ai::insight_chat::{ChatLockMap, InsightChatService};
use crate::ai::llamacpp::LlamaCppClient;
use crate::ai::openrouter::OpenRouterClient;
use crate::ai::{InsightGenerator, OllamaClient, SmsApiClient};
use crate::database::{
@@ -62,6 +63,16 @@ pub struct AppState {
/// Curated list of OpenRouter model ids exposed to clients. Sourced from
/// `OPENROUTER_ALLOWED_MODELS` (comma-separated). Empty when unset.
pub openrouter_allowed_models: Vec<String>,
/// `None` when `LLAMA_SWAP_URL` is not configured. Consulted only when a
/// request explicitly opts into `backend=llamacpp`. Same shape as the
/// `openrouter` slot — present here so handlers can route to it without
/// threading through the generator.
#[allow(dead_code)]
pub llamacpp: Option<Arc<LlamaCppClient>>,
/// Curated list of llama-swap model ids exposed to clients. Sourced from
/// `LLAMA_SWAP_ALLOWED_MODELS` (comma-separated). Empty when unset; the
/// server then falls back to `LLAMA_SWAP_PRIMARY_MODEL`.
pub llamacpp_allowed_models: Vec<String>,
pub sms_client: SmsApiClient,
pub insight_generator: InsightGenerator,
/// Chat continuation service. Hold an Arc so handlers can clone cheaply.
@@ -105,6 +116,8 @@ impl AppState {
ollama: OllamaClient,
openrouter: Option<Arc<OpenRouterClient>>,
openrouter_allowed_models: Vec<String>,
llamacpp: Option<Arc<LlamaCppClient>>,
llamacpp_allowed_models: Vec<String>,
sms_client: SmsApiClient,
insight_generator: InsightGenerator,
insight_chat: Arc<InsightChatService>,
@@ -145,6 +158,8 @@ impl AppState {
ollama,
openrouter,
openrouter_allowed_models,
llamacpp,
llamacpp_allowed_models,
sms_client,
insight_generator,
insight_chat,
@@ -186,6 +201,9 @@ impl Default for AppState {
let openrouter = build_openrouter_from_env();
let openrouter_allowed_models = parse_openrouter_allowed_models();
let llamacpp = build_llamacpp_from_env();
let llamacpp_allowed_models = parse_llamacpp_allowed_models();
let sms_api_url =
env::var("SMS_API_URL").unwrap_or_else(|_| "http://localhost:8000".to_string());
let sms_api_token = env::var("SMS_API_TOKEN").ok();
@@ -250,6 +268,7 @@ impl Default for AppState {
let insight_generator = InsightGenerator::new(
ollama.clone(),
openrouter.clone(),
llamacpp.clone(),
sms_client.clone(),
apollo_client.clone(),
insight_dao.clone(),
@@ -273,6 +292,7 @@ impl Default for AppState {
Arc::new(insight_generator.clone()),
ollama.clone(),
openrouter.clone(),
llamacpp.clone(),
insight_dao.clone(),
chat_locks,
));
@@ -294,6 +314,8 @@ impl Default for AppState {
ollama,
openrouter,
openrouter_allowed_models,
llamacpp,
llamacpp_allowed_models,
sms_client,
insight_generator,
insight_chat,
@@ -335,6 +357,50 @@ fn parse_openrouter_allowed_models() -> Vec<String> {
.collect()
}
/// Build a `LlamaCppClient` from environment variables.
///
/// Returns `None` when `LLAMA_SWAP_URL` is unset or blank (the llamacpp
/// backend is then unavailable and requests for it return a clear error).
/// The slot ids default to the names the bundled `llama-swap/config.yaml`
/// uses — `chat` / `vision` / `embed` — so a minimal deploy only needs to
/// set `LLAMA_SWAP_URL`.
///
/// Optional overrides, each ignored when unset or blank:
/// - `LLAMA_SWAP_PRIMARY_MODEL` — passed to `LlamaCppClient::new`.
/// - `LLAMA_SWAP_EMBEDDING_MODEL` — forwarded to `set_embedding_model`.
/// - `LLAMA_SWAP_VISION_MODEL` — forwarded to `set_vision_model`.
///
/// The parsed `LLAMA_SWAP_VISION_MODELS` list is always forwarded to
/// `set_vision_models`, even when it is empty.
fn build_llamacpp_from_env() -> Option<Arc<LlamaCppClient>> {
    // A variable that is present but empty (e.g. `LLAMA_SWAP_URL=` in a
    // compose/.env file) is treated exactly like an unset one. Otherwise we
    // would hand out a "configured" client with an empty base URL, or replace
    // a slot id with an empty string. Surrounding whitespace is stripped for
    // the same reason the list parsers trim their entries.
    let non_blank = |key: &str| -> Option<String> {
        env::var(key)
            .ok()
            .map(|value| value.trim().to_string())
            .filter(|value| !value.is_empty())
    };

    let base_url = non_blank("LLAMA_SWAP_URL")?;
    let primary_model = non_blank("LLAMA_SWAP_PRIMARY_MODEL");
    let mut client = LlamaCppClient::new(Some(base_url), primary_model);

    if let Some(model) = non_blank("LLAMA_SWAP_EMBEDDING_MODEL") {
        client.set_embedding_model(model);
    }
    if let Some(model) = non_blank("LLAMA_SWAP_VISION_MODEL") {
        client.set_vision_model(model);
    }
    client.set_vision_models(parse_llamacpp_vision_models());

    Some(Arc::new(client))
}
/// Read `LLAMA_SWAP_ALLOWED_MODELS` and split it on commas into model ids.
///
/// Each entry is trimmed and blank entries are dropped. The result is empty
/// when the variable is unset. Used to drive `/insights/llamacpp/models`.
fn parse_llamacpp_allowed_models() -> Vec<String> {
    // An unreadable or missing variable behaves like an empty list.
    let raw = match env::var("LLAMA_SWAP_ALLOWED_MODELS") {
        Ok(value) => value,
        Err(_) => return Vec::new(),
    };

    let mut models = Vec::new();
    for entry in raw.split(',') {
        let id = entry.trim();
        if !id.is_empty() {
            models.push(id.to_string());
        }
    }
    models
}
/// Read `LLAMA_SWAP_VISION_MODELS` (comma-separated) into a list of slot ids
/// that report `has_vision = true` in capability lookups.
///
/// Entries are trimmed and blank ones are discarded; an unset variable
/// yields an empty list. The configured `vision_model` (default `vision`) is
/// always considered vision-capable regardless of this list, so a deploy
/// that only uses the default vision slot can leave the variable unset.
fn parse_llamacpp_vision_models() -> Vec<String> {
    let configured = env::var("LLAMA_SWAP_VISION_MODELS").unwrap_or_default();
    configured
        .split(',')
        .filter_map(|piece| {
            let slot = piece.trim();
            if slot.is_empty() {
                None
            } else {
                Some(slot.to_owned())
            }
        })
        .collect()
}
#[cfg(test)]
impl AppState {
/// Creates an AppState instance for testing with temporary directories
@@ -397,6 +463,7 @@ impl AppState {
let insight_generator = InsightGenerator::new(
ollama.clone(),
None,
None,
sms_client.clone(),
apollo_client.clone(),
insight_dao.clone(),
@@ -418,6 +485,7 @@ impl AppState {
Arc::new(insight_generator.clone()),
ollama.clone(),
None,
None,
insight_dao.clone(),
chat_locks,
));
@@ -445,6 +513,8 @@ impl AppState {
ollama,
None,
Vec::new(),
None,
Vec::new(),
sms_client,
insight_generator,
insight_chat,