ai: add llamacpp backend (llama-swap) as third LLM client
Wires a new LlamaCppClient (OpenAI-compatible /v1 wire format) alongside OllamaClient and OpenRouterClient. Per-slot routing for chat/vision/embed via env (LLAMA_SWAP_URL + *_MODEL vars); capability inference uses an env allowlist since /v1/models doesn't report modality. InsightGenerator + InsightChatService gain three-way dispatch on chat_backend = "local" | "hybrid" | "llamacpp". Hybrid and llamacpp share the describe-then-inline path (text-only chat after a separate vision describe). HYBRID_VISION_BACKEND=llamacpp lets hybrid route its describe pass through llama-swap's vision slot while chat still goes to OpenRouter. Cross-replay matrix added (validate_cross_replay): local<->llamacpp and hybrid<->llamacpp allowed; local->hybrid and llamacpp->hybrid rejected. New /insights/llamacpp/models handler mirrors the OpenRouter shape.
This commit is contained in:
70
src/state.rs
70
src/state.rs
@@ -2,6 +2,7 @@ use crate::ai::apollo_client::ApolloClient;
|
||||
use crate::ai::clip_client::ClipClient;
|
||||
use crate::ai::face_client::FaceClient;
|
||||
use crate::ai::insight_chat::{ChatLockMap, InsightChatService};
|
||||
use crate::ai::llamacpp::LlamaCppClient;
|
||||
use crate::ai::openrouter::OpenRouterClient;
|
||||
use crate::ai::{InsightGenerator, OllamaClient, SmsApiClient};
|
||||
use crate::database::{
|
||||
@@ -62,6 +63,16 @@ pub struct AppState {
|
||||
/// Curated list of OpenRouter model ids exposed to clients. Sourced from
|
||||
/// `OPENROUTER_ALLOWED_MODELS` (comma-separated). Empty when unset.
|
||||
pub openrouter_allowed_models: Vec<String>,
|
||||
/// `None` when `LLAMA_SWAP_URL` is not configured. Consulted only when a
|
||||
/// request explicitly opts into `backend=llamacpp`. Same shape as the
|
||||
/// `openrouter` slot — present here so handlers can route to it without
|
||||
/// threading through the generator.
|
||||
#[allow(dead_code)]
|
||||
pub llamacpp: Option<Arc<LlamaCppClient>>,
|
||||
/// Curated list of llama-swap model ids exposed to clients. Sourced from
|
||||
/// `LLAMA_SWAP_ALLOWED_MODELS` (comma-separated). Empty when unset; the
|
||||
/// server then falls back to `LLAMA_SWAP_PRIMARY_MODEL`.
|
||||
pub llamacpp_allowed_models: Vec<String>,
|
||||
pub sms_client: SmsApiClient,
|
||||
pub insight_generator: InsightGenerator,
|
||||
/// Chat continuation service. Hold an Arc so handlers can clone cheaply.
|
||||
@@ -105,6 +116,8 @@ impl AppState {
|
||||
ollama: OllamaClient,
|
||||
openrouter: Option<Arc<OpenRouterClient>>,
|
||||
openrouter_allowed_models: Vec<String>,
|
||||
llamacpp: Option<Arc<LlamaCppClient>>,
|
||||
llamacpp_allowed_models: Vec<String>,
|
||||
sms_client: SmsApiClient,
|
||||
insight_generator: InsightGenerator,
|
||||
insight_chat: Arc<InsightChatService>,
|
||||
@@ -145,6 +158,8 @@ impl AppState {
|
||||
ollama,
|
||||
openrouter,
|
||||
openrouter_allowed_models,
|
||||
llamacpp,
|
||||
llamacpp_allowed_models,
|
||||
sms_client,
|
||||
insight_generator,
|
||||
insight_chat,
|
||||
@@ -186,6 +201,9 @@ impl Default for AppState {
|
||||
let openrouter = build_openrouter_from_env();
|
||||
let openrouter_allowed_models = parse_openrouter_allowed_models();
|
||||
|
||||
let llamacpp = build_llamacpp_from_env();
|
||||
let llamacpp_allowed_models = parse_llamacpp_allowed_models();
|
||||
|
||||
let sms_api_url =
|
||||
env::var("SMS_API_URL").unwrap_or_else(|_| "http://localhost:8000".to_string());
|
||||
let sms_api_token = env::var("SMS_API_TOKEN").ok();
|
||||
@@ -250,6 +268,7 @@ impl Default for AppState {
|
||||
let insight_generator = InsightGenerator::new(
|
||||
ollama.clone(),
|
||||
openrouter.clone(),
|
||||
llamacpp.clone(),
|
||||
sms_client.clone(),
|
||||
apollo_client.clone(),
|
||||
insight_dao.clone(),
|
||||
@@ -273,6 +292,7 @@ impl Default for AppState {
|
||||
Arc::new(insight_generator.clone()),
|
||||
ollama.clone(),
|
||||
openrouter.clone(),
|
||||
llamacpp.clone(),
|
||||
insight_dao.clone(),
|
||||
chat_locks,
|
||||
));
|
||||
@@ -294,6 +314,8 @@ impl Default for AppState {
|
||||
ollama,
|
||||
openrouter,
|
||||
openrouter_allowed_models,
|
||||
llamacpp,
|
||||
llamacpp_allowed_models,
|
||||
sms_client,
|
||||
insight_generator,
|
||||
insight_chat,
|
||||
@@ -335,6 +357,50 @@ fn parse_openrouter_allowed_models() -> Vec<String> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Build a `LlamaCppClient` from environment variables. Returns `None` when
|
||||
/// `LLAMA_SWAP_URL` is unset (the llamacpp backend is then unavailable and
|
||||
/// requests for it return a clear error). The slot ids default to the
|
||||
/// names the bundled `llama-swap/config.yaml` uses — `chat` / `vision` /
|
||||
/// `embed` — so a minimal deploy only needs to set `LLAMA_SWAP_URL`.
|
||||
fn build_llamacpp_from_env() -> Option<Arc<LlamaCppClient>> {
|
||||
let base_url = env::var("LLAMA_SWAP_URL").ok()?;
|
||||
let primary_model = env::var("LLAMA_SWAP_PRIMARY_MODEL").ok();
|
||||
let mut client = LlamaCppClient::new(Some(base_url), primary_model);
|
||||
if let Ok(model) = env::var("LLAMA_SWAP_EMBEDDING_MODEL") {
|
||||
client.set_embedding_model(model);
|
||||
}
|
||||
if let Ok(model) = env::var("LLAMA_SWAP_VISION_MODEL") {
|
||||
client.set_vision_model(model);
|
||||
}
|
||||
client.set_vision_models(parse_llamacpp_vision_models());
|
||||
Some(Arc::new(client))
|
||||
}
|
||||
|
||||
/// Read `LLAMA_SWAP_ALLOWED_MODELS` and split it on commas into model ids.
///
/// Each entry is trimmed and blank entries are discarded. The result is an
/// empty vec when the variable is unset (or cannot be read as unicode).
/// Used to drive `/insights/llamacpp/models`.
fn parse_llamacpp_allowed_models() -> Vec<String> {
    // An unreadable variable behaves exactly like an empty one.
    let raw = match env::var("LLAMA_SWAP_ALLOWED_MODELS") {
        Ok(value) => value,
        Err(_) => String::new(),
    };

    let mut models = Vec::new();
    for piece in raw.split(',') {
        let id = piece.trim();
        if !id.is_empty() {
            models.push(id.to_string());
        }
    }
    models
}
|
||||
|
||||
/// Read `LLAMA_SWAP_VISION_MODELS` (comma-separated) into a list of slot ids
/// that report `has_vision = true` in capability lookups.
///
/// Entries are trimmed and blank ones are dropped; an unset variable yields
/// an empty vec. The configured `vision_model` (default `vision`) is always
/// considered vision-capable regardless of this list, so a deploy that only
/// uses the default vision slot can leave the variable unset.
fn parse_llamacpp_vision_models() -> Vec<String> {
    match env::var("LLAMA_SWAP_VISION_MODELS") {
        Ok(raw) => raw
            .split(',')
            .map(str::trim)
            .filter(|slot| !slot.is_empty())
            .map(String::from)
            .collect(),
        // Unset or non-unicode: no extra vision-capable slots.
        Err(_) => Vec::new(),
    }
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl AppState {
|
||||
/// Creates an AppState instance for testing with temporary directories
|
||||
@@ -397,6 +463,7 @@ impl AppState {
|
||||
let insight_generator = InsightGenerator::new(
|
||||
ollama.clone(),
|
||||
None,
|
||||
None,
|
||||
sms_client.clone(),
|
||||
apollo_client.clone(),
|
||||
insight_dao.clone(),
|
||||
@@ -418,6 +485,7 @@ impl AppState {
|
||||
Arc::new(insight_generator.clone()),
|
||||
ollama.clone(),
|
||||
None,
|
||||
None,
|
||||
insight_dao.clone(),
|
||||
chat_locks,
|
||||
));
|
||||
@@ -445,6 +513,8 @@ impl AppState {
|
||||
ollama,
|
||||
None,
|
||||
Vec::new(),
|
||||
None,
|
||||
Vec::new(),
|
||||
sms_client,
|
||||
insight_generator,
|
||||
insight_chat,
|
||||
|
||||
Reference in New Issue
Block a user