From f0927f535510ccfecf91c1896041b93c2c156603 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Wed, 20 May 2026 17:52:33 -0400 Subject: [PATCH] ai: add llamacpp backend (llama-swap) as third LLM client Wires a new LlamaCppClient (OpenAI-compatible /v1 wire format) alongside OllamaClient and OpenRouterClient. Per-slot routing for chat/vision/embed via env (LLAMA_SWAP_URL + *_MODEL vars); capability inference uses an env allowlist since /v1/models doesn't report modality. InsightGenerator + InsightChatService gain three-way dispatch on chat_backend = "local" | "hybrid" | "llamacpp". Hybrid and llamacpp share the describe-then-inline path (text-only chat after a separate vision describe). HYBRID_VISION_BACKEND=llamacpp lets hybrid route its describe pass through llama-swap's vision slot while chat still goes to OpenRouter. Cross-replay matrix added (validate_cross_replay): local<->llamacpp and hybrid->llamacpp allowed; local->hybrid and llamacpp->hybrid rejected. New /insights/llamacpp/models handler mirrors the OpenRouter shape. --- CLAUDE.md | 49 +- src/ai/handlers.rs | 30 ++ src/ai/insight_chat.rs | 266 ++++++--- src/ai/insight_generator.rs | 171 ++++-- src/ai/llamacpp.rs | 978 ++++++++++++++++++++++++++++++++++ src/ai/mod.rs | 4 +- src/bin/populate_knowledge.rs | 1 + src/main.rs | 1 + src/state.rs | 70 +++ 9 files changed, 1468 insertions(+), 102 deletions(-) create mode 100644 src/ai/llamacpp.rs diff --git a/CLAUDE.md b/CLAUDE.md index 7e605cc..d3419e6 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -475,6 +475,7 @@ POST /insights/generate/agentic (tool-calling loop; body: { file_path, back GET /insights?path=...&library=... 
GET /insights/models (local Ollama models + capabilities) GET /insights/openrouter/models (curated OpenRouter allowlist) +GET /insights/llamacpp/models (curated llama-swap slot allowlist) POST /insights/rate (thumbs up/down for training data) // Insight Chat Continuation @@ -631,6 +632,23 @@ OPENROUTER_EMBEDDING_MODEL=openai/text-embedding-3-small # Optional, embeddings OPENROUTER_HTTP_REFERER=https://your-site.example # Optional attribution header OPENROUTER_APP_TITLE=ImageApi # Optional attribution header +# llama.cpp / llama-swap (Llamacpp Backend) - sibling to Ollama; OpenAI-compatible +# proxy hosting one or more llama-server processes (chat / vision / embed slots). +LLAMA_SWAP_URL=http://localhost:9292/v1 # Required to enable llamacpp backend +LLAMA_SWAP_PRIMARY_MODEL=chat # Chat slot id (matches config.yaml) +LLAMA_SWAP_VISION_MODEL=vision # Vision slot id; describe_image routes here +LLAMA_SWAP_EMBEDDING_MODEL=embed # Embedding slot id (when local embeddings via llamacpp) +LLAMA_SWAP_VISION_MODELS=qwen-vl,llava # Comma-separated slot ids known to have vision. + # Drives `has_vision` in /insights/llamacpp/models. + # `LLAMA_SWAP_VISION_MODEL` is auto-included. +LLAMA_SWAP_ALLOWED_MODELS=chat,coder # Curated allowlist exposed to clients via + # GET /insights/llamacpp/models. Empty = no picker. +LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180 # Per-request timeout; bump for slow CPU offload +HYBRID_VISION_BACKEND=llamacpp # Optional override for hybrid mode's describe_image: + # `ollama` (default) or `llamacpp`. When `llamacpp`, + # hybrid still routes chat to OpenRouter but uses + # llama-swap's vision slot to describe images. + # Insight Chat Continuation AGENTIC_CHAT_MAX_ITERATIONS=6 # Cap on tool-calling iterations per chat turn (default 6) ``` @@ -652,8 +670,11 @@ This allows runtime verification of model availability before generating insight **Hybrid Backend (OpenRouter):** - Per-request opt-in via `backend=hybrid` on `POST /insights/generate/agentic`. 
-- Local Ollama still describes the image (vision); the description is inlined - into the chat prompt and the agentic loop runs on OpenRouter. +- Vision describe happens before the agentic loop; the description is inlined + into the chat prompt and the agentic loop runs on OpenRouter. By default + vision uses local Ollama, but `HYBRID_VISION_BACKEND=llamacpp` flips it to + llama-swap's vision slot (useful when you want chat on a frontier model and + vision on a local-but-not-Ollama path). - `request.model` (if provided) overrides `OPENROUTER_DEFAULT_MODEL` for that call. The mobile picker reads from `OPENROUTER_ALLOWED_MODELS`. - No live capability precheck — the operator-curated allowlist is trusted. @@ -661,6 +682,30 @@ This allows runtime verification of model availability before generating insight - `GET /insights/openrouter/models` returns `{ models, default_model, configured }` for client picker UIs. +**Llamacpp Backend (llama-swap):** +- Per-request opt-in via `backend=llamacpp` on `POST /insights/generate/agentic`. +- Sibling to Ollama: a local OpenAI-compatible proxy (mostlygeek/llama-swap) + fronting one or more `llama-server` processes. The chat slot is text-only + by default; vision and embeddings have their own slots (`LLAMA_SWAP_VISION_MODEL`, + `LLAMA_SWAP_EMBEDDING_MODEL`) that llama-swap routes to by model id. The + bundled `docker-compose.yml` + `llama-swap/config.yaml` in the opencode root + is the reference deploy. +- Operates in the same describe-then-inline shape as hybrid: the chat model + never sees raw images. Vision describe routes through llama-swap's vision + slot (`describe_image` on `LlamaCppClient`). +- `request.model` (if provided) overrides `LLAMA_SWAP_PRIMARY_MODEL` for that + call (must match a slot id in llama-swap's `config.yaml`). The mobile picker + reads from `LLAMA_SWAP_ALLOWED_MODELS`. +- No live capability precheck — slot ids are trusted. 
Tool calling is assumed + for every slot (llama-swap entries typically launch with `--jinja`). +- `GET /insights/llamacpp/models` returns `{ models, default_model, configured }`. +- Cross-replay matrix (chat continuation): `local ↔ llamacpp` allowed (the + LlamaCppClient passes images through to the chat slot — you're responsible + for a vision-capable slot if the stored transcript carries images); + `hybrid → llamacpp` allowed (both transcripts are text-only); `local → + hybrid` and `llamacpp → hybrid` rejected (mid-conversation description + source change isn't supported). + **Insight Chat Continuation:** After an agentic insight is generated, the full `Vec` transcript is diff --git a/src/ai/handlers.rs b/src/ai/handlers.rs index 0e46057..4809b25 100644 --- a/src/ai/handlers.rs +++ b/src/ai/handlers.rs @@ -549,6 +549,36 @@ pub async fn get_openrouter_models_handler( HttpResponse::Ok().json(response) } +#[derive(serde::Serialize)] +pub struct LlamaCppModelsResponse { + pub models: Vec, + pub default_model: Option, + pub configured: bool, +} + +/// GET /insights/llamacpp/models - Curated llama-swap model ids exposed +/// to clients for the llamacpp backend. Returned verbatim from +/// `LLAMA_SWAP_ALLOWED_MODELS`; no live call to llama-swap. Use +/// `LLAMA_SWAP_URL` plus `LLAMA_SWAP_PRIMARY_MODEL` on the server side to +/// pick the actual chat slot. 
+#[get("/insights/llamacpp/models")] +pub async fn get_llamacpp_models_handler( + _claims: Claims, + app_state: web::Data, +) -> impl Responder { + let configured = app_state.llamacpp.is_some(); + let default_model = app_state + .llamacpp + .as_ref() + .map(|c| c.primary_model.clone()); + let response = LlamaCppModelsResponse { + models: app_state.llamacpp_allowed_models.clone(), + default_model, + configured, + }; + HttpResponse::Ok().json(response) +} + /// POST /insights/rate - Rate an insight (thumbs up/down for training data) #[post("/insights/rate")] pub async fn rate_insight_handler( diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index b2a7af8..4e87d52 100644 --- a/src/ai/insight_chat.rs +++ b/src/ai/insight_chat.rs @@ -9,6 +9,7 @@ use tokio::sync::Mutex as TokioMutex; use crate::ai::insight_generator::InsightGenerator; use crate::ai::llm_client::{ChatMessage, LlmClient, LlmStreamEvent, Tool}; use crate::ai::ollama::OllamaClient; +use crate::ai::llamacpp::LlamaCppClient; use crate::ai::openrouter::OpenRouterClient; use crate::database::InsightDao; use crate::database::models::InsertPhotoInsight; @@ -93,6 +94,7 @@ pub struct InsightChatService { generator: Arc, ollama: OllamaClient, openrouter: Option>, + llamacpp: Option>, insight_dao: Arc>>, chat_locks: ChatLockMap, } @@ -102,6 +104,7 @@ impl InsightChatService { generator: Arc, ollama: OllamaClient, openrouter: Option>, + llamacpp: Option>, insight_dao: Arc>>, chat_locks: ChatLockMap, ) -> Self { @@ -109,6 +112,7 @@ impl InsightChatService { generator, ollama, openrouter, + llamacpp, insight_dao, chat_locks, } @@ -303,23 +307,15 @@ impl InsightChatService { .map(|s| s.trim().to_lowercase()) .filter(|s| !s.is_empty()) .unwrap_or_else(|| stored_backend.clone()); - if !matches!(effective_backend.as_str(), "local" | "hybrid") { - bail!( - "unknown backend '{}'; expected 'local' or 'hybrid'", - effective_backend - ); - } - if stored_backend == "local" && effective_backend == "hybrid" { - bail!( - 
"switching from local to hybrid mid-chat isn't supported yet; \ - regenerate the insight in hybrid mode if you want OpenRouter chat" - ); - } + validate_cross_replay(&stored_backend, &effective_backend)?; let is_hybrid = effective_backend == "hybrid"; + let is_llamacpp = effective_backend == "llamacpp"; + let describes_then_inlines = is_hybrid || is_llamacpp; span.set_attribute(KeyValue::new("backend", effective_backend.clone())); // 4. Build the chat backend client. Ollama in local mode, a freshly - // cloned OpenRouter client in hybrid mode (clone so per-request + // cloned OpenRouter client in hybrid mode, a freshly cloned + // LlamaCppClient in llamacpp mode (clone so per-request // sampling/model overrides don't leak into shared state). let max_iterations = req .max_iterations @@ -336,6 +332,7 @@ impl InsightChatService { let mut ollama_client = self.ollama.clone(); let mut openrouter_client: Option = None; + let mut llamacpp_client: Option = None; if is_hybrid { let arc = self.openrouter.as_ref().ok_or_else(|| { @@ -356,6 +353,25 @@ impl InsightChatService { c.set_num_ctx(Some(ctx)); } openrouter_client = Some(c); + } else if is_llamacpp { + let arc = self.llamacpp.as_ref().ok_or_else(|| { + anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured") + })?; + let mut c: LlamaCppClient = (**arc).clone(); + if let Some(ref m) = custom_model { + c.primary_model = m.clone(); + } + if req.temperature.is_some() + || req.top_p.is_some() + || req.top_k.is_some() + || req.min_p.is_some() + { + c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); + } + if let Some(ctx) = req.num_ctx { + c.set_num_ctx(Some(ctx)); + } + llamacpp_client = Some(c); } else { // Local-mode model swap. Build a new client when the chat model // differs from the configured one (mirrors the agentic pattern). 
@@ -381,7 +397,9 @@ impl InsightChatService { } } - let chat_backend: &dyn LlmClient = if let Some(ref c) = openrouter_client { + let chat_backend: &dyn LlmClient = if let Some(ref c) = llamacpp_client { + c + } else if let Some(ref c) = openrouter_client { c } else { &ollama_client @@ -389,18 +407,19 @@ impl InsightChatService { let model_used = chat_backend.primary_model().to_string(); span.set_attribute(KeyValue::new("model", model_used.clone())); - // 5. Decide vision + tool set. In hybrid we always omit - // `describe_photo` (matches the original generation flow). In - // local we trust the stored history's first-user shape: if it - // carries `images`, the original model was vision-capable, and - // we keep `describe_photo` available. + // 5. Decide vision + tool set. In describe-then-inline modes + // (hybrid, llamacpp) we always omit `describe_photo` (matches the + // original generation flow). In local we trust the stored + // history's first-user shape: if it carries `images`, the + // original model was vision-capable, and we keep `describe_photo` + // available. let local_first_user_has_image = messages .iter() .find(|m| m.role == "user") .and_then(|m| m.images.as_ref()) .map(|imgs| !imgs.is_empty()) .unwrap_or(false); - let offer_describe_tool = !is_hybrid && local_first_user_has_image; + let offer_describe_tool = !describes_then_inlines && local_first_user_has_image; // current_gate_opts(has_vision) sets gate_opts.has_vision = has_vision // and probes the per-table presence flags. 
Pass `offer_describe_tool` // directly — the `!is_hybrid && local_first_user_has_image` decision @@ -799,19 +818,10 @@ impl InsightChatService { .map(|s| s.trim().to_lowercase()) .filter(|s| !s.is_empty()) .unwrap_or_else(|| stored_backend.clone()); - if !matches!(effective_backend.as_str(), "local" | "hybrid") { - bail!( - "unknown backend '{}'; expected 'local' or 'hybrid'", - effective_backend - ); - } - if stored_backend == "local" && effective_backend == "hybrid" { - bail!( - "switching from local to hybrid mid-chat isn't supported yet; \ - regenerate the insight in hybrid mode if you want OpenRouter chat" - ); - } + validate_cross_replay(&stored_backend, &effective_backend)?; let is_hybrid = effective_backend == "hybrid"; + let is_llamacpp = effective_backend == "llamacpp"; + let describes_then_inlines = is_hybrid || is_llamacpp; let max_iterations = req .max_iterations @@ -826,20 +836,21 @@ impl InsightChatService { .filter(|m| !m.is_empty()); let (chat_backend_holder, ollama_client) = - self.build_chat_clients(is_hybrid, custom_model.as_deref(), &req)?; + self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?; let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref(); let model_used = chat_backend.primary_model().to_string(); // Tool set — local mode + first user turn carries an image → - // offer describe_photo. Hybrid: visual description was inlined - // when the insight was bootstrapped, no describe tool needed. + // offer describe_photo. Describe-then-inline modes (hybrid / + // llamacpp): visual description was inlined when the insight was + // bootstrapped, no describe tool needed. 
let local_first_user_has_image = messages .iter() .find(|m| m.role == "user") .and_then(|m| m.images.as_ref()) .map(|imgs| !imgs.is_empty()) .unwrap_or(false); - let offer_describe_tool = !is_hybrid && local_first_user_has_image; + let offer_describe_tool = !describes_then_inlines && local_first_user_has_image; let gate_opts = self.generator.current_gate_opts_for_persona( offer_describe_tool, Some((req.user_id, &active_persona)), @@ -976,6 +987,8 @@ impl InsightChatService { .unwrap_or_else(|| "default".to_string()); let effective_backend = resolve_bootstrap_backend(req.backend.as_deref())?; let is_hybrid = effective_backend == "hybrid"; + let is_llamacpp = effective_backend == "llamacpp"; + let describes_then_inlines = is_hybrid || is_llamacpp; let max_iterations = req .max_iterations @@ -984,7 +997,7 @@ impl InsightChatService { let custom_model = req.model.clone().filter(|m| !m.is_empty()); let (chat_backend_holder, ollama_client) = - self.build_chat_clients(is_hybrid, custom_model.as_deref(), &req)?; + self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?; let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref(); let model_used = chat_backend.primary_model().to_string(); @@ -1007,21 +1020,48 @@ impl InsightChatService { _ => None, }); - // Hybrid backend: pre-describe the image via local Ollama vision - // so OpenRouter chat models (which can't see images directly) get - // the visual description as text. Mirrors the same pre-describe - // pass that `generate_agentic_insight_for_photo` does for hybrid. - let visual_block = if is_hybrid { + // Describe-then-inline backends (hybrid, llamacpp): pre-describe the + // image so a text-only chat model gets the visual description inline. + // Vision source: llamacpp's vision slot in llamacpp mode; in hybrid + // mode Ollama by default, llamacpp via `HYBRID_VISION_BACKEND=llamacpp`. 
+ let visual_block = if describes_then_inlines { match image_base64.as_deref() { - Some(b64) => match self.ollama.describe_image(b64).await { - Ok(desc) => { - format!("Visual description (from local vision model):\n{}\n", desc) + Some(b64) => { + let use_llamacpp_vision = if is_llamacpp { + true + } else { + matches!( + std::env::var("HYBRID_VISION_BACKEND") + .ok() + .as_deref() + .map(|s| s.trim().to_lowercase()) + .as_deref(), + Some("llamacpp") + ) + }; + let described = if use_llamacpp_vision { + match self.llamacpp.as_ref() { + Some(c) => c.describe_image(b64).await, + None => { + log::warn!( + "bootstrap: requested llamacpp vision but LLAMA_SWAP_URL unset; falling back to Ollama" + ); + self.ollama.describe_image(b64).await + } + } + } else { + self.ollama.describe_image(b64).await + }; + match described { + Ok(desc) => { + format!("Visual description (from local vision model):\n{}\n", desc) + } + Err(e) => { + log::warn!("{} bootstrap: describe_image failed: {}", effective_backend, e); + String::new() + } } - Err(e) => { - log::warn!("hybrid bootstrap: local describe_image failed: {}", e); - String::new() - } - }, + } None => String::new(), } } else { @@ -1031,7 +1071,7 @@ impl InsightChatService { // Tool gates. Local + image present → expose describe_photo so // the chat model can re-look at the photo on demand. Hybrid: // already inlined, no tool needed. 
- let offer_describe_tool = !is_hybrid && image_base64.is_some(); + let offer_describe_tool = !describes_then_inlines && image_base64.is_some(); let gate_opts = self.generator.current_gate_opts_for_persona( offer_describe_tool, Some((req.user_id, &active_persona)), @@ -1057,7 +1097,7 @@ impl InsightChatService { ); let system_msg = ChatMessage::system(system_content); let mut user_msg = ChatMessage::user(req.user_message.clone()); - if !is_hybrid && let Some(ref img) = image_base64 { + if !describes_then_inlines && let Some(ref img) = image_base64 { user_msg.images = Some(vec![img.clone()]); } let mut messages = vec![system_msg, user_msg]; @@ -1130,19 +1170,22 @@ impl InsightChatService { Ok(()) } - /// Set up chat clients (Ollama + optional OpenRouter) shared by - /// bootstrap and continuation. Returns the chat-side backend client - /// (boxed because hybrid and local return different concrete types) - /// and the Ollama client used for describe-image / local tool calls. + /// Set up chat clients (Ollama + optional OpenRouter / LlamaCpp) shared + /// by bootstrap and continuation. Returns the chat-side backend client + /// (boxed because each backend has a different concrete type) and the + /// Ollama client used for describe-image / local tool calls. + /// + /// `effective_backend` must be one of `"local"`, `"hybrid"`, `"llamacpp"` + /// (validated upstream). 
fn build_chat_clients( &self, - is_hybrid: bool, + effective_backend: &str, custom_model: Option<&str>, req: &ChatTurnRequest, ) -> Result<(Box, OllamaClient)> { let mut ollama_client = self.ollama.clone(); - if is_hybrid { + if effective_backend == "hybrid" { let arc = self.openrouter.as_ref().ok_or_else(|| { anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured") })?; @@ -1163,6 +1206,27 @@ impl InsightChatService { return Ok((Box::new(c), ollama_client)); } + if effective_backend == "llamacpp" { + let arc = self.llamacpp.as_ref().ok_or_else(|| { + anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured") + })?; + let mut c: LlamaCppClient = (**arc).clone(); + if let Some(m) = custom_model { + c.primary_model = m.to_string(); + } + if req.temperature.is_some() + || req.top_p.is_some() + || req.top_k.is_some() + || req.min_p.is_some() + { + c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); + } + if let Some(ctx) = req.num_ctx { + c.set_num_ctx(Some(ctx)); + } + return Ok((Box::new(c), ollama_client)); + } + if let Some(m) = custom_model && m != self.ollama.primary_model { @@ -1459,6 +1523,49 @@ fn resolve_date_taken_for_context( .map(|dt| dt.format("%Y-%m-%d").to_string()) } +/// Validate a stored→effective backend transition for a chat continuation. +/// Continuation runs against a transcript that was generated with a specific +/// backend; some transitions break the conversation shape: +/// +/// - `local → hybrid` — the stored transcript has images embedded in the +/// first user message; the openrouter chat client surfaces them through +/// the wire, but vision-only models routed via the hybrid path may not +/// accept that shape consistently across providers. Reject to keep the +/// `regenerate-in-hybrid-mode` workflow as the supported answer. +/// - `llamacpp → hybrid` — the stored transcript already has an inlined +/// visual description produced by llama-swap's vision slot. 
Switching +/// to hybrid mid-conversation would mix description sources across +/// subsequent turns (any new image in the chat continuation would be +/// described by ollama-vision while the original was described by +/// llama-vision). Reject for consistency. +/// +/// All other transitions are allowed. `local ↔ llamacpp` works because +/// LlamaCppClient passes image content-parts through to the chat slot — +/// the user is responsible for picking a vision-capable chat model in +/// that case. `hybrid → llamacpp` works because both transcripts are +/// text-only (visual description inlined at bootstrap). +fn validate_cross_replay(stored: &str, effective: &str) -> Result<()> { + if !matches!(effective, "local" | "hybrid" | "llamacpp") { + bail!( + "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'", + effective + ); + } + if stored == "local" && effective == "hybrid" { + bail!( + "switching from local to hybrid mid-chat isn't supported yet; \ + regenerate the insight in hybrid mode if you want OpenRouter chat" + ); + } + if stored == "llamacpp" && effective == "hybrid" { + bail!( + "switching from llamacpp to hybrid mid-chat isn't supported yet; \ + regenerate the insight in hybrid mode if you want OpenRouter chat" + ); + } + Ok(()) +} + /// Pick the backend label for bootstrap. Bootstrap has no stored insight /// to defer to (that's continuation's behaviour), so the default is /// `"local"`. 
Returns an error if the supplied label is non-empty but @@ -1469,8 +1576,11 @@ fn resolve_bootstrap_backend(supplied: Option<&str>) -> Result { .map(|s| s.trim().to_lowercase()) .filter(|s| !s.is_empty()) .unwrap_or_else(|| "local".to_string()); - if !matches!(lower.as_str(), "local" | "hybrid") { - bail!("unknown backend '{}'; expected 'local' or 'hybrid'", lower); + if !matches!(lower.as_str(), "local" | "hybrid" | "llamacpp") { + bail!( + "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'", + lower + ); } Ok(lower) } @@ -2074,6 +2184,10 @@ mod tests { fn bootstrap_backend_accepts_local_and_hybrid_case_insensitively() { assert_eq!(resolve_bootstrap_backend(Some("LOCAL")).unwrap(), "local"); assert_eq!(resolve_bootstrap_backend(Some("Hybrid")).unwrap(), "hybrid"); + assert_eq!( + resolve_bootstrap_backend(Some("Llamacpp")).unwrap(), + "llamacpp" + ); assert_eq!( resolve_bootstrap_backend(Some(" local ")).unwrap(), "local" @@ -2088,6 +2202,38 @@ mod tests { assert!(msg.contains("openrouter")); } + #[test] + fn cross_replay_rejects_local_to_hybrid() { + let err = validate_cross_replay("local", "hybrid").unwrap_err(); + assert!(format!("{}", err).contains("local to hybrid")); + } + + #[test] + fn cross_replay_rejects_llamacpp_to_hybrid() { + let err = validate_cross_replay("llamacpp", "hybrid").unwrap_err(); + assert!(format!("{}", err).contains("llamacpp to hybrid")); + } + + #[test] + fn cross_replay_allows_local_llamacpp_and_hybrid_llamacpp_transitions() { + // Local ↔ llamacpp: user is responsible for picking a vision-capable + // chat slot when the transcript has images. + assert!(validate_cross_replay("local", "llamacpp").is_ok()); + assert!(validate_cross_replay("llamacpp", "local").is_ok()); + // Hybrid → llamacpp: both transcripts are text-only. + assert!(validate_cross_replay("hybrid", "llamacpp").is_ok()); + // Same-backend replays are always fine. 
+ assert!(validate_cross_replay("local", "local").is_ok()); + assert!(validate_cross_replay("hybrid", "hybrid").is_ok()); + assert!(validate_cross_replay("llamacpp", "llamacpp").is_ok()); + } + + #[test] + fn cross_replay_rejects_unknown_effective() { + let err = validate_cross_replay("local", "openrouter").unwrap_err(); + assert!(format!("{}", err).contains("unknown backend")); + } + #[test] fn bootstrap_system_message_includes_path_and_persona() { let out = build_bootstrap_system_message("you are helpful", "pics/IMG.jpg", None, None, ""); diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index 2e2da33..2a11e29 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -12,6 +12,7 @@ use std::sync::{Arc, Mutex}; use crate::ai::apollo_client::{ApolloClient, ApolloPlace}; use crate::ai::llm_client::LlmClient; use crate::ai::ollama::{ChatMessage, OllamaClient, Tool}; +use crate::ai::llamacpp::LlamaCppClient; use crate::ai::openrouter::OpenRouterClient; use crate::ai::sms_client::{SmsApiClient, SmsSearchHit, SmsSearchParams}; use crate::ai::user_display_name; @@ -68,6 +69,9 @@ pub struct InsightGenerator { /// Optional OpenRouter client, used when `backend=hybrid` is requested. /// `None` when `OPENROUTER_API_KEY` is not configured. openrouter: Option>, + /// Optional llama-swap client, used when `backend=llamacpp` is requested. + /// `None` when `LLAMA_SWAP_URL` is not configured. + llamacpp: Option>, sms_client: SmsApiClient, /// Optional integration with Apollo's user-defined Places. 
When the /// integration is disabled (`APOLLO_API_BASE_URL` unset), every @@ -120,6 +124,7 @@ impl InsightGenerator { pub fn new( ollama: OllamaClient, openrouter: Option>, + llamacpp: Option>, sms_client: SmsApiClient, apollo_client: ApolloClient, insight_dao: Arc>>, @@ -137,6 +142,7 @@ impl InsightGenerator { Self { ollama, openrouter, + llamacpp, sms_client, apollo_client, insight_dao, @@ -3574,23 +3580,31 @@ Return ONLY the summary, nothing else."#, .map(|s| s.trim().to_lowercase()) .filter(|s| !s.is_empty()) .unwrap_or_else(|| "local".to_string()); - if !matches!(backend_label.as_str(), "local" | "hybrid") { + if !matches!(backend_label.as_str(), "local" | "hybrid" | "llamacpp") { return Err(anyhow::anyhow!( - "unknown backend '{}'; expected 'local' or 'hybrid'", + "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'", backend_label )); } span.set_attribute(KeyValue::new("backend", backend_label.clone())); let is_hybrid = backend_label == "hybrid"; + let is_llamacpp = backend_label == "llamacpp"; + // In hybrid + llamacpp modes the chat model never sees the image + // directly; we describe-then-inline locally before the agentic loop + // starts. Tracked as a single flag so vision/tool-gate logic doesn't + // have to branch twice. + let describes_then_inlines = is_hybrid || is_llamacpp; // 1b. Always build an Ollama client. In local mode it owns the chat - // loop; in hybrid mode it still handles describe_image + any - // tool-local calls (e.g. if a future tool needs embeddings). - // Sampling overrides only apply in local mode — in hybrid the - // user's params belong to the OpenRouter chat client. - let apply_sampling_to_ollama = !is_hybrid; + // loop; in hybrid/llamacpp mode it still handles tool-local calls + // (e.g. future embedding-backed tools). The chat backend is + // selected separately below. + // Sampling overrides only apply in local mode — in + // hybrid/llamacpp the user's params belong to the alternate chat + // client. 
+ let apply_sampling_to_ollama = !describes_then_inlines; let mut ollama_client = if let Some(ref model) = custom_model - && !is_hybrid + && !describes_then_inlines { log::info!("Using custom model for agentic: {}", model); span.set_attribute(KeyValue::new("custom_model", model.clone())); @@ -3601,7 +3615,7 @@ Return ONLY the summary, nothing else."#, Some(model.clone()), ) } else { - if !is_hybrid { + if !describes_then_inlines { span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone())); } self.ollama.clone() @@ -3674,6 +3688,44 @@ Return ONLY the summary, nothing else."#, None }; + // 1d. In llamacpp mode, clone the configured LlamaCpp client and + // apply per-request overrides. Same shape as the openrouter + // branch above; describe_image will route through the vision + // slot configured on the client. + let llamacpp_client: Option = if is_llamacpp { + let arc = self.llamacpp.as_ref().ok_or_else(|| { + anyhow::anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured") + })?; + let mut c: LlamaCppClient = (**arc).clone(); + if let Some(ref m) = custom_model { + c.primary_model = m.clone(); + span.set_attribute(KeyValue::new("custom_model", m.clone())); + } + span.set_attribute(KeyValue::new("llamacpp_model", c.primary_model.clone())); + if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() { + if let Some(t) = temperature { + span.set_attribute(KeyValue::new("temperature", t as f64)); + } + if let Some(p) = top_p { + span.set_attribute(KeyValue::new("top_p", p as f64)); + } + if let Some(k) = top_k { + span.set_attribute(KeyValue::new("top_k", k as i64)); + } + if let Some(m) = min_p { + span.set_attribute(KeyValue::new("min_p", m as f64)); + } + c.set_sampling_params(temperature, top_p, top_k, min_p); + } + if let Some(ctx) = num_ctx { + span.set_attribute(KeyValue::new("num_ctx", ctx as i64)); + c.set_num_ctx(Some(ctx)); + } + Some(c) + } else { + None + }; + let insight_cx = 
current_cx.with_span(span); // 2. Verify chat model supports tool calling. @@ -3681,10 +3733,11 @@ Return ONLY the summary, nothing else."#, // - hybrid: trust the operator's curated allowlist // (OPENROUTER_ALLOWED_MODELS) — no live precheck. A bad model id // surfaces as a chat-call error on the next step. - let has_vision = if is_hybrid { - // In hybrid mode the chat model never sees images directly — we - // describe-then-inject, so `has_vision` drives only whether we - // bother loading the image to describe it, which we always do. + let has_vision = if describes_then_inlines { + // In hybrid + llamacpp modes the chat model never sees images + // directly — we describe-then-inject, so `has_vision` drives only + // whether we bother loading the image to describe it, which we + // always do. true } else { if let Some(ref model_name) = custom_model { @@ -3864,24 +3917,61 @@ Return ONLY the summary, nothing else."#, None }; - let hybrid_visual_description: Option = if is_hybrid { + // describe-then-inline path. In hybrid mode the vision backend + // defaults to Ollama but can be flipped to llamacpp via + // `HYBRID_VISION_BACKEND=llamacpp` (so chat goes to OpenRouter while + // vision/audio routes through llama-swap). In llamacpp mode we always + // use the llamacpp client's configured vision slot. 
+ let inlined_visual_description: Option = if describes_then_inlines { match image_base64.as_deref() { - Some(b64) => match self.ollama.describe_image(b64).await { - Ok(desc) => { - log::info!( - "Hybrid: local vision describe succeeded ({} chars)", - desc.len() - ); - Some(desc) + Some(b64) => { + let use_llamacpp_vision = if is_llamacpp { + true + } else { + // is_hybrid branch — consult env switch + matches!( + std::env::var("HYBRID_VISION_BACKEND") + .ok() + .as_deref() + .map(|s| s.trim().to_lowercase()) + .as_deref(), + Some("llamacpp") + ) + }; + + let described = if use_llamacpp_vision { + match self.llamacpp.as_ref() { + Some(c) => c.describe_image(b64).await, + None => { + log::warn!( + "describe-then-inline: requested llamacpp vision but LLAMA_SWAP_URL is unset, falling back to Ollama" + ); + self.ollama.describe_image(b64).await + } + } + } else { + self.ollama.describe_image(b64).await + }; + + match described { + Ok(desc) => { + log::info!( + "{}: vision describe succeeded ({} chars)", + backend_label, + desc.len() + ); + Some(desc) + } + Err(e) => { + log::warn!( + "{}: vision describe failed, continuing without: {}", + backend_label, + e + ); + None + } } - Err(e) => { - log::warn!( - "Hybrid: local vision describe failed, continuing without: {}", - e - ); - None - } - }, + } None => None, } } else { @@ -3934,7 +4024,7 @@ Return ONLY the summary, nothing else."#, .map(|c| format!("Contact/Person: {}", c)) .unwrap_or_else(|| "Contact/Person: unknown".to_string()); - let visual_block = hybrid_visual_description + let visual_block = inlined_visual_description .as_deref() .map(|d| format!("Visual description (from local vision model):\n{}\n\n", d)) .unwrap_or_default(); @@ -3954,25 +4044,28 @@ Return ONLY the summary, nothing else."#, ); // 10. Define tools. 
Gate flags computed from current data presence; - // hybrid mode omits describe_photo since the chat model receives - // the visual description inline (so we pass `false` for has_vision - // in hybrid mode regardless of the model's actual capability). - let gate_opts = self.current_gate_opts(has_vision && !is_hybrid); + // describe-then-inline modes (hybrid, llamacpp) omit describe_photo + // since the chat model receives the visual description inline (so + // we pass `false` for has_vision in those modes regardless of the + // model's actual capability). + let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines); let tools = Self::build_tool_definitions(gate_opts); - // 11. Build initial messages. In hybrid mode images are never - // attached to the wire message — the description is part of - // `user_content`. + // 11. Build initial messages. In describe-then-inline modes images + // are never attached to the wire message — the description is part + // of `user_content`. let system_msg = ChatMessage::system(system_content); let mut user_msg = ChatMessage::user(user_content); - if !is_hybrid && let Some(ref img) = image_base64 { + if !describes_then_inlines && let Some(ref img) = image_base64 { user_msg.images = Some(vec![img.clone()]); } let mut messages = vec![system_msg, user_msg]; // 12. Agentic loop — dispatch through the selected backend. - let chat_backend: &dyn LlmClient = if let Some(ref or_c) = openrouter_client { + let chat_backend: &dyn LlmClient = if let Some(ref lc_c) = llamacpp_client { + lc_c + } else if let Some(ref or_c) = openrouter_client { or_c } else { &ollama_client diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs new file mode 100644 index 0000000..100020c --- /dev/null +++ b/src/ai/llamacpp.rs @@ -0,0 +1,978 @@ +// LlamaCppClient — talks to a llama-swap proxy that fronts one or more +// llama-server processes. 
llama-swap exposes an OpenAI-compatible HTTP +// surface (`/v1/chat/completions`, `/v1/embeddings`, `/v1/models`), so the +// wire translation mirrors `OpenRouterClient` almost exactly. +// +// Differences from OpenRouter: +// - No bearer auth or attribution headers; llama-swap is LAN-only. +// - Three model slots (`primary_model` = chat, `vision_model`, `embedding_model`) +// each map to a model id in the llama-swap config. `describe_image` and +// `generate_embeddings` issue requests with the appropriate slot id in the +// `model` field, which is how llama-swap selects which backend process to +// run. +// - `/v1/models` returns only the configured slot ids — capabilities aren't +// reported by the API, so `vision_models` is a config-time allowlist (env +// `LLAMA_SWAP_VISION_MODELS`) used to set `has_vision` on responses. +// `has_tool_calling` is assumed true for every slot, since llama-swap entries +// default to launching llama-server with `--jinja`. +// +// First consumer lands alongside the three-way backend dispatch in +// insight_generator / insight_chat. +#![allow(dead_code)] + +use anyhow::{Context, Result, anyhow, bail}; +use async_trait::async_trait; +use reqwest::Client; +use serde::Deserialize; +use serde_json::{Value, json}; +use std::time::Duration; + +use crate::ai::llm_client::{ + ChatMessage, LlmClient, LlmStreamEvent, ModelCapabilities, Tool, ToolCall, ToolCallFunction, +}; +use futures::stream::{BoxStream, StreamExt}; + +const DEFAULT_BASE_URL: &str = "http://localhost:9292/v1"; +const DEFAULT_PRIMARY_MODEL: &str = "chat"; +const DEFAULT_VISION_MODEL: &str = "vision"; +const DEFAULT_EMBEDDING_MODEL: &str = "embed"; +const DEFAULT_REQUEST_TIMEOUT_SECS: u64 = 180; + +/// OpenAI-compatible client targeting a llama-swap proxy in front of one or +/// more llama-server processes. See the module doc-comment for the slot model. +#[derive(Clone)] +pub struct LlamaCppClient { + client: Client, + pub base_url: String, + /// Chat model slot id (e.g. 
`"chat"`). Used for `generate` / + /// `chat_with_tools` / `chat_with_tools_stream`. + pub primary_model: String, + /// Embedding model slot id (e.g. `"embed"`). Used for + /// `generate_embeddings`. + pub embedding_model: String, + /// Vision model slot id (e.g. `"vision"`). Used for `describe_image` and + /// included in `vision_models` automatically so capability lookups for + /// the default vision slot report `has_vision = true` even when the env + /// allowlist is empty. + pub vision_model: String, + /// Operator-curated set of slot ids known to be multimodal. Drives the + /// `has_vision` field in `list_models` / `model_capabilities`, since + /// llama-swap's `/v1/models` doesn't report modality. Empty allowlist + /// still marks `vision_model` as vision-capable. + pub vision_models: Vec, + num_ctx: Option, + temperature: Option, + top_p: Option, + top_k: Option, + min_p: Option, +} + +impl LlamaCppClient { + pub fn new(base_url: Option, primary_model: Option) -> Self { + let timeout_secs = std::env::var("LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS") + .ok() + .and_then(|v| v.parse::().ok()) + .unwrap_or(DEFAULT_REQUEST_TIMEOUT_SECS); + Self { + client: Client::builder() + .connect_timeout(Duration::from_secs(10)) + .timeout(Duration::from_secs(timeout_secs)) + .build() + .unwrap_or_else(|_| Client::new()), + base_url: base_url.unwrap_or_else(|| DEFAULT_BASE_URL.to_string()), + primary_model: primary_model.unwrap_or_else(|| DEFAULT_PRIMARY_MODEL.to_string()), + embedding_model: DEFAULT_EMBEDDING_MODEL.to_string(), + vision_model: DEFAULT_VISION_MODEL.to_string(), + vision_models: Vec::new(), + num_ctx: None, + temperature: None, + top_p: None, + top_k: None, + min_p: None, + } + } + + pub fn set_embedding_model(&mut self, model: String) { + self.embedding_model = model; + } + + pub fn set_vision_model(&mut self, model: String) { + self.vision_model = model; + } + + pub fn set_vision_models(&mut self, models: Vec) { + self.vision_models = models; + } + + pub fn 
set_num_ctx(&mut self, num_ctx: Option) { + self.num_ctx = num_ctx; + } + + pub fn set_sampling_params( + &mut self, + temperature: Option, + top_p: Option, + top_k: Option, + min_p: Option, + ) { + self.temperature = temperature; + self.top_p = top_p; + self.top_k = top_k; + self.min_p = min_p; + } + + /// Translate canonical messages to the OpenAI-compatible wire shape. + /// Behaviorally identical to `OpenRouterClient::messages_to_openai` — + /// stringify tool-call arguments, rewrite images into content-parts, attach + /// `tool_call_id` to `role=tool` messages based on the preceding assistant + /// turn's tool calls. + fn messages_to_openai(messages: &[ChatMessage]) -> Vec { + let mut out = Vec::with_capacity(messages.len()); + let mut last_tool_call_ids: Vec = Vec::new(); + let mut next_tool_result_idx: usize = 0; + + for msg in messages { + let mut obj = serde_json::Map::new(); + obj.insert("role".into(), Value::String(msg.role.clone())); + + match &msg.images { + Some(images) if !images.is_empty() => { + let mut parts: Vec = Vec::new(); + if !msg.content.is_empty() { + parts.push(json!({"type": "text", "text": msg.content})); + } + for img in images { + let url = image_to_data_url(img); + parts.push(json!({ + "type": "image_url", + "image_url": { "url": url } + })); + } + obj.insert("content".into(), Value::Array(parts)); + } + _ => { + obj.insert("content".into(), Value::String(msg.content.clone())); + } + } + + if let Some(tcs) = &msg.tool_calls + && msg.role == "assistant" + { + let converted: Vec = tcs + .iter() + .enumerate() + .map(|(i, call)| { + let id = call.id.clone().unwrap_or_else(|| format!("call_{}", i)); + let args_str = serde_json::to_string(&call.function.arguments) + .unwrap_or_else(|_| "{}".to_string()); + json!({ + "id": id, + "type": "function", + "function": { + "name": call.function.name, + "arguments": args_str, + } + }) + }) + .collect(); + last_tool_call_ids = converted + .iter() + .filter_map(|v| v.get("id").and_then(|x| 
x.as_str()).map(String::from)) + .collect(); + next_tool_result_idx = 0; + obj.insert("tool_calls".into(), Value::Array(converted)); + } + + if msg.role == "tool" { + let id = last_tool_call_ids + .get(next_tool_result_idx) + .cloned() + .unwrap_or_else(|| "call_0".to_string()); + obj.insert("tool_call_id".into(), Value::String(id)); + next_tool_result_idx += 1; + } + + out.push(Value::Object(obj)); + } + + out + } + + /// Parse an OpenAI-compatible assistant message back into canonical shape. + /// llama.cpp emits `reasoning_content` on thinking models; we drop it for + /// parity with OpenRouter (which also strips upstream reasoning fields). + fn openai_message_to_chat(msg: &Value) -> Result { + let obj = msg + .as_object() + .ok_or_else(|| anyhow!("response message is not an object"))?; + let role = obj + .get("role") + .and_then(|v| v.as_str()) + .unwrap_or("assistant") + .to_string(); + let content = obj + .get("content") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + + let tool_calls = if let Some(tcs) = obj.get("tool_calls").and_then(|v| v.as_array()) { + let mut parsed = Vec::with_capacity(tcs.len()); + for tc in tcs { + let id = tc.get("id").and_then(|v| v.as_str()).map(String::from); + let function = tc + .get("function") + .ok_or_else(|| anyhow!("tool_call missing function field"))?; + let name = function + .get("name") + .and_then(|v| v.as_str()) + .unwrap_or_default() + .to_string(); + let args_value = match function.get("arguments") { + Some(Value::String(s)) => { + serde_json::from_str::(s).unwrap_or_else(|_| json!({})) + } + Some(v @ Value::Object(_)) => v.clone(), + _ => json!({}), + }; + parsed.push(ToolCall { + id, + function: ToolCallFunction { + name, + arguments: args_value, + }, + }); + } + Some(parsed) + } else { + None + }; + + Ok(ChatMessage { + role, + content, + tool_calls, + images: None, + }) + } + + fn build_options(&self) -> Vec<(&'static str, Value)> { + let mut v = Vec::new(); + if let Some(t) = self.temperature { + 
v.push(("temperature", json!(t))); + } + if let Some(p) = self.top_p { + v.push(("top_p", json!(p))); + } + if let Some(k) = self.top_k { + v.push(("top_k", json!(k))); + } + if let Some(m) = self.min_p { + v.push(("min_p", json!(m))); + } + // num_ctx isn't an OpenAI param; llama-server bakes ctx in at launch + // via -c, so we silently drop the override here. The config.yaml + // entry is the source of truth for context size. + let _ = self.num_ctx; + v + } + + /// Issue a chat request with an explicit model id override. Used by + /// `describe_image` to route through the vision slot without mutating + /// `self.primary_model`. + async fn chat_completion_with_model( + &self, + model: &str, + messages: Vec, + tools: Vec, + ) -> Result<(ChatMessage, Option, Option)> { + let url = format!("{}/chat/completions", self.base_url); + let mut body = serde_json::Map::new(); + body.insert("model".into(), Value::String(model.to_string())); + body.insert( + "messages".into(), + Value::Array(Self::messages_to_openai(&messages)), + ); + body.insert("stream".into(), Value::Bool(false)); + if !tools.is_empty() { + body.insert( + "tools".into(), + serde_json::to_value(&tools).context("serializing tools")?, + ); + } + for (k, v) in self.build_options() { + body.insert(k.into(), v); + } + + let resp = self + .client + .post(&url) + .json(&Value::Object(body)) + .send() + .await + .with_context(|| format!("POST {} failed", url))?; + + if !resp.status().is_success() { + let status = resp.status(); + let body = resp.text().await.unwrap_or_default(); + bail!("llama-swap chat request failed: {} — {}", status, body); + } + + let parsed: Value = resp.json().await.context("parsing chat response")?; + let choice = parsed + .get("choices") + .and_then(|v| v.as_array()) + .and_then(|a| a.first()) + .ok_or_else(|| { + anyhow!( + "response missing choices[0]: {}", + extract_error_detail(&parsed) + ) + })?; + let msg = choice.get("message").ok_or_else(|| { + anyhow!( + "choices[0] missing 
message: {}", + extract_error_detail(&parsed) + ) + })?; + let chat_msg = Self::openai_message_to_chat(msg)?; + + let usage = parsed.get("usage"); + let prompt_tokens = usage + .and_then(|u| u.get("prompt_tokens")) + .and_then(|v| v.as_i64()) + .map(|n| n as i32); + let completion_tokens = usage + .and_then(|u| u.get("completion_tokens")) + .and_then(|v| v.as_i64()) + .map(|n| n as i32); + + Ok((chat_msg, prompt_tokens, completion_tokens)) + } +} + +#[async_trait] +impl LlmClient for LlamaCppClient { + async fn generate( + &self, + prompt: &str, + system: Option<&str>, + images: Option>, + ) -> Result { + let mut messages: Vec = Vec::new(); + if let Some(sys) = system { + messages.push(ChatMessage::system(sys)); + } + let mut user = ChatMessage::user(prompt); + user.images = images; + messages.push(user); + + let (reply, _, _) = self.chat_with_tools(messages, Vec::new()).await?; + Ok(reply.content) + } + + async fn chat_with_tools( + &self, + messages: Vec, + tools: Vec, + ) -> Result<(ChatMessage, Option, Option)> { + log::info!( + "llama-swap chat_with_tools: model={} messages={} tools={}", + self.primary_model, + messages.len(), + tools.len() + ); + self.chat_completion_with_model(&self.primary_model.clone(), messages, tools) + .await + } + + async fn chat_with_tools_stream( + &self, + messages: Vec, + tools: Vec, + ) -> Result>> { + let url = format!("{}/chat/completions", self.base_url); + let mut body = serde_json::Map::new(); + body.insert( + "model".into(), + Value::String(self.primary_model.clone()), + ); + body.insert( + "messages".into(), + Value::Array(Self::messages_to_openai(&messages)), + ); + body.insert("stream".into(), Value::Bool(true)); + body.insert( + "stream_options".into(), + serde_json::json!({ "include_usage": true }), + ); + if !tools.is_empty() { + body.insert( + "tools".into(), + serde_json::to_value(&tools).context("serializing tools")?, + ); + } + for (k, v) in self.build_options() { + body.insert(k.into(), v); + } + + let resp = self 
+ .client + .post(&url) + .json(&Value::Object(body)) + .send() + .await + .with_context(|| format!("POST {} failed", url))?; + + if !resp.status().is_success() { + let status = resp.status(); + let body = resp.text().await.unwrap_or_default(); + bail!("llama-swap stream request failed: {} — {}", status, body); + } + + let byte_stream = resp.bytes_stream(); + let stream = async_stream::stream! { + let mut byte_stream = byte_stream; + let mut buf: Vec = Vec::new(); + let mut accumulated_content = String::new(); + let mut tool_state: std::collections::BTreeMap< + usize, + (Option, Option, String), + > = std::collections::BTreeMap::new(); + let mut role = "assistant".to_string(); + let mut prompt_tokens: Option = None; + let mut completion_tokens: Option = None; + let mut done_seen = false; + + while let Some(chunk) = byte_stream.next().await { + let chunk = match chunk { + Ok(b) => b, + Err(e) => { + yield Err(anyhow!("stream read failed: {}", e)); + return; + } + }; + buf.extend_from_slice(&chunk); + + while let Some(sep) = find_double_newline(&buf) { + let frame = buf.drain(..sep + 2).collect::>(); + let frame_str = match std::str::from_utf8(&frame) { + Ok(s) => s, + Err(_) => continue, + }; + for line in frame_str.lines() { + let line = line.trim_end_matches('\r'); + let payload = match line.strip_prefix("data: ") { + Some(p) => p, + None => continue, + }; + if payload == "[DONE]" { + done_seen = true; + break; + } + let v: Value = match serde_json::from_str(payload) { + Ok(v) => v, + Err(e) => { + log::warn!( + "malformed llama-swap SSE frame: {} ({})", + payload, + e + ); + continue; + } + }; + + if let Some(usage) = v.get("usage") { + prompt_tokens = usage + .get("prompt_tokens") + .and_then(|n| n.as_i64()) + .map(|n| n as i32); + completion_tokens = usage + .get("completion_tokens") + .and_then(|n| n.as_i64()) + .map(|n| n as i32); + } + + let Some(choices) = v.get("choices").and_then(|c| c.as_array()) + else { + continue; + }; + let Some(choice) = 
choices.first() else { continue }; + let delta = match choice.get("delta") { + Some(d) => d, + None => continue, + }; + if let Some(r) = delta.get("role").and_then(|v| v.as_str()) { + role = r.to_string(); + } + if let Some(content) = + delta.get("content").and_then(|v| v.as_str()) + && !content.is_empty() + { + accumulated_content.push_str(content); + yield Ok(LlmStreamEvent::TextDelta(content.to_string())); + } + if let Some(tcs) = delta.get("tool_calls").and_then(|v| v.as_array()) { + for tc_delta in tcs { + let idx = tc_delta + .get("index") + .and_then(|n| n.as_u64()) + .unwrap_or(0) as usize; + let entry = tool_state + .entry(idx) + .or_insert((None, None, String::new())); + if let Some(id) = + tc_delta.get("id").and_then(|v| v.as_str()) + { + entry.0 = Some(id.to_string()); + } + if let Some(func) = tc_delta.get("function") { + if let Some(name) = + func.get("name").and_then(|v| v.as_str()) + { + entry.1 = Some(name.to_string()); + } + if let Some(args) = + func.get("arguments").and_then(|v| v.as_str()) + { + entry.2.push_str(args); + } + } + } + } + } + if done_seen { + break; + } + } + if done_seen { + break; + } + } + + let tool_calls: Option> = if tool_state.is_empty() { + None + } else { + let mut v = Vec::with_capacity(tool_state.len()); + for (_idx, (id, name, args)) in tool_state { + let arguments: Value = if args.trim().is_empty() { + Value::Object(Default::default()) + } else { + serde_json::from_str(&args).unwrap_or_else(|_| { + Value::Object(Default::default()) + }) + }; + v.push(ToolCall { + id, + function: ToolCallFunction { + name: name.unwrap_or_default(), + arguments, + }, + }); + } + Some(v) + }; + + let message = ChatMessage { + role, + content: accumulated_content, + tool_calls, + images: None, + }; + yield Ok(LlmStreamEvent::Done { + message, + prompt_eval_count: prompt_tokens, + eval_count: completion_tokens, + }); + }; + + Ok(Box::pin(stream)) + } + + async fn generate_embeddings(&self, texts: &[&str]) -> Result>> { + let url = 
format!("{}/embeddings", self.base_url); + let body = json!({ + "model": self.embedding_model, + "input": texts, + }); + + let resp = self + .client + .post(&url) + .json(&body) + .send() + .await + .with_context(|| format!("POST {} failed", url))?; + + if !resp.status().is_success() { + let status = resp.status(); + let body = resp.text().await.unwrap_or_default(); + bail!("llama-swap embedding request failed: {} — {}", status, body); + } + + #[derive(Deserialize)] + struct EmbedResponse { + data: Vec, + } + #[derive(Deserialize)] + struct EmbedItem { + embedding: Vec, + } + + let parsed: EmbedResponse = resp.json().await.context("parsing embed response")?; + Ok(parsed.data.into_iter().map(|i| i.embedding).collect()) + } + + async fn describe_image(&self, image_base64: &str) -> Result { + let prompt = "Briefly describe what you see in this image in 1-2 sentences. \ + Focus on the people, location, and activity."; + let system = "You are a scene description assistant. Be concise and factual."; + + let messages = vec![ + ChatMessage::system(system), + ChatMessage { + role: "user".to_string(), + content: prompt.to_string(), + tool_calls: None, + images: Some(vec![image_base64.to_string()]), + }, + ]; + + let (reply, _, _) = self + .chat_completion_with_model(&self.vision_model.clone(), messages, Vec::new()) + .await?; + Ok(reply.content) + } + + async fn list_models(&self) -> Result> { + let url = format!("{}/models", self.base_url); + let resp = self + .client + .get(&url) + .send() + .await + .with_context(|| format!("GET {} failed", url))?; + + if !resp.status().is_success() { + let status = resp.status(); + let body = resp.text().await.unwrap_or_default(); + bail!("llama-swap list_models failed: {} — {}", status, body); + } + + let parsed: Value = resp.json().await.context("parsing models response")?; + let data = parsed + .get("data") + .and_then(|v| v.as_array()) + .ok_or_else(|| anyhow!("models response missing data[]"))?; + + let caps: Vec = data + .iter() + 
.map(|m| self.parse_model_capabilities(m)) + .collect(); + + Ok(caps) + } + + async fn model_capabilities(&self, model: &str) -> Result { + let all = self.list_models().await?; + all.into_iter() + .find(|m| m.name == model) + .ok_or_else(|| anyhow!("model '{}' not found on llama-swap", model)) + } + + fn primary_model(&self) -> &str { + &self.primary_model + } +} + +impl LlamaCppClient { + fn parse_model_capabilities(&self, m: &Value) -> ModelCapabilities { + let name = m + .get("id") + .and_then(|v| v.as_str()) + .unwrap_or_default() + .to_string(); + let has_vision = name == self.vision_model || self.vision_models.iter().any(|v| v == &name); + // Tool calling is the default for llama-swap entries we configure + // (--jinja flag); no negative-list mechanism yet, so report true. + ModelCapabilities { + name, + has_vision, + has_tool_calling: true, + } + } +} + +/// Extract a diagnostic fragment from a llama-swap / llama-server response +/// that doesn't match the expected `{choices: [...]}` shape. llama-server +/// returns errors as `{"error": {"message": "...", "code": N, "type": "..."}}`; +/// llama-swap itself sometimes wraps subprocess failures with its own +/// `{"error": "..."}` flat shape. Surface either when present, otherwise fall +/// back to a truncated raw-JSON view. 
+fn extract_error_detail(parsed: &Value) -> String { + if let Some(err) = parsed.get("error") { + match err { + Value::Object(_) => { + let message = err + .get("message") + .and_then(|v| v.as_str()) + .unwrap_or("(no message)"); + let code = err + .get("code") + .map(|v| match v { + Value::String(s) => s.clone(), + other => other.to_string(), + }) + .unwrap_or_else(|| "?".to_string()); + let short_message: String = message.chars().take(240).collect(); + return format!("error code={} message=\"{}\"", code, short_message); + } + Value::String(s) => { + let short: String = s.chars().take(240).collect(); + return format!("error=\"{}\"", short); + } + _ => {} + } + } + let raw = parsed.to_string(); + raw.chars().take(300).collect() +} + +fn find_double_newline(buf: &[u8]) -> Option { + for i in 0..buf.len().saturating_sub(1) { + if buf[i] == b'\n' && buf[i + 1] == b'\n' { + return Some(i); + } + if i + 3 < buf.len() + && buf[i] == b'\r' + && buf[i + 1] == b'\n' + && buf[i + 2] == b'\r' + && buf[i + 3] == b'\n' + { + return Some(i + 1); + } + } + None +} + +fn image_to_data_url(img: &str) -> String { + if img.starts_with("data:") { + img.to_string() + } else { + format!("data:image/jpeg;base64,{}", img) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn tool_call_arguments_stringified_on_send() { + let msg = ChatMessage { + role: "assistant".into(), + content: String::new(), + tool_calls: Some(vec![ToolCall { + id: Some("call_abc".into()), + function: ToolCallFunction { + name: "search_sms".into(), + arguments: json!({"query": "hello", "limit": 5}), + }, + }]), + images: None, + }; + + let wire = LlamaCppClient::messages_to_openai(&[msg]); + let tcs = wire[0] + .get("tool_calls") + .and_then(|v| v.as_array()) + .expect("tool_calls present"); + let args = tcs[0] + .get("function") + .and_then(|f| f.get("arguments")) + .and_then(|a| a.as_str()) + .expect("arguments stringified"); + let parsed: Value = serde_json::from_str(args).unwrap(); + 
assert_eq!(parsed["query"], "hello"); + assert_eq!(parsed["limit"], 5); + } + + #[test] + fn tool_call_arguments_parsed_on_receive() { + let response_msg = json!({ + "role": "assistant", + "content": "", + "tool_calls": [{ + "id": "call_xyz", + "type": "function", + "function": { + "name": "get_weather", + "arguments": "{\"city\":\"Boston\",\"units\":\"celsius\"}" + } + }] + }); + let parsed = LlamaCppClient::openai_message_to_chat(&response_msg).unwrap(); + let tcs = parsed.tool_calls.unwrap(); + assert_eq!(tcs.len(), 1); + assert_eq!(tcs[0].function.name, "get_weather"); + assert_eq!(tcs[0].function.arguments["city"], "Boston"); + assert_eq!(tcs[0].function.arguments["units"], "celsius"); + assert_eq!(tcs[0].id.as_deref(), Some("call_xyz")); + } + + #[test] + fn tool_call_arguments_accept_native_json_on_receive() { + // Some llama.cpp builds emit arguments as a JSON object directly when + // jinja's tool-output strict-string rule isn't applied — accept both. + let response_msg = json!({ + "role": "assistant", + "content": "", + "tool_calls": [{ + "id": "call_1", + "type": "function", + "function": { + "name": "foo", + "arguments": {"nested": {"k": 1}} + } + }] + }); + let parsed = LlamaCppClient::openai_message_to_chat(&response_msg).unwrap(); + let tc = &parsed.tool_calls.unwrap()[0]; + assert_eq!(tc.function.arguments["nested"]["k"], 1); + } + + #[test] + fn images_become_content_parts() { + let mut msg = ChatMessage::user("What is in this photo?"); + msg.images = Some(vec!["BASE64DATA".into()]); + + let wire = LlamaCppClient::messages_to_openai(&[msg]); + let content = wire[0].get("content").and_then(|v| v.as_array()).unwrap(); + assert_eq!(content.len(), 2); + assert_eq!(content[0]["type"], "text"); + assert_eq!(content[0]["text"], "What is in this photo?"); + assert_eq!(content[1]["type"], "image_url"); + assert_eq!( + content[1]["image_url"]["url"], + "data:image/jpeg;base64,BASE64DATA" + ); + } + + #[test] + fn data_url_images_pass_through_unchanged() { + 
let mut msg = ChatMessage::user(""); + msg.images = Some(vec!["data:image/png;base64,ABCDEF".into()]); + let wire = LlamaCppClient::messages_to_openai(&[msg]); + let content = wire[0].get("content").and_then(|v| v.as_array()).unwrap(); + assert_eq!(content.len(), 1); + assert_eq!( + content[0]["image_url"]["url"], + "data:image/png;base64,ABCDEF" + ); + } + + #[test] + fn text_only_message_stays_string() { + let msg = ChatMessage::user("hello"); + let wire = LlamaCppClient::messages_to_openai(&[msg]); + assert_eq!(wire[0]["content"], "hello"); + assert!(wire[0]["content"].as_str().is_some()); + } + + #[test] + fn tool_result_inherits_tool_call_id_from_prior_assistant() { + let assistant = ChatMessage { + role: "assistant".into(), + content: String::new(), + tool_calls: Some(vec![ToolCall { + id: Some("call_42".into()), + function: ToolCallFunction { + name: "lookup".into(), + arguments: json!({}), + }, + }]), + images: None, + }; + let tool_result = ChatMessage::tool_result("found it"); + + let wire = LlamaCppClient::messages_to_openai(&[assistant, tool_result]); + assert_eq!(wire[1]["role"], "tool"); + assert_eq!(wire[1]["tool_call_id"], "call_42"); + } + + #[test] + fn multiple_tool_results_map_to_sequential_call_ids() { + let assistant = ChatMessage { + role: "assistant".into(), + content: String::new(), + tool_calls: Some(vec![ + ToolCall { + id: Some("call_A".into()), + function: ToolCallFunction { + name: "a".into(), + arguments: json!({}), + }, + }, + ToolCall { + id: Some("call_B".into()), + function: ToolCallFunction { + name: "b".into(), + arguments: json!({}), + }, + }, + ]), + images: None, + }; + let r1 = ChatMessage::tool_result("a result"); + let r2 = ChatMessage::tool_result("b result"); + + let wire = LlamaCppClient::messages_to_openai(&[assistant, r1, r2]); + assert_eq!(wire[1]["tool_call_id"], "call_A"); + assert_eq!(wire[2]["tool_call_id"], "call_B"); + } + + #[test] + fn missing_tool_call_id_gets_synthetic_fallback() { + let assistant = 
ChatMessage { + role: "assistant".into(), + content: String::new(), + tool_calls: Some(vec![ToolCall { + id: None, + function: ToolCallFunction { + name: "noid".into(), + arguments: json!({}), + }, + }]), + images: None, + }; + let wire = LlamaCppClient::messages_to_openai(&[assistant]); + let tcs = wire[0] + .get("tool_calls") + .and_then(|v| v.as_array()) + .unwrap(); + assert_eq!(tcs[0]["id"], "call_0"); + } + + #[test] + fn capability_inference_uses_vision_model_and_allowlist() { + let mut c = LlamaCppClient::new(None, Some("chat".into())); + c.set_vision_model("vision".into()); + c.set_vision_models(vec!["qwen-vl".into()]); + + let m_chat = json!({ "id": "chat" }); + let m_vision = json!({ "id": "vision" }); + let m_qwen = json!({ "id": "qwen-vl" }); + let m_other = json!({ "id": "embed" }); + + let chat = c.parse_model_capabilities(&m_chat); + let vision = c.parse_model_capabilities(&m_vision); + let qwen = c.parse_model_capabilities(&m_qwen); + let other = c.parse_model_capabilities(&m_other); + + assert!(!chat.has_vision); + assert!(chat.has_tool_calling); + assert!(vision.has_vision); + assert!(qwen.has_vision); + assert!(!other.has_vision); + } +} diff --git a/src/ai/mod.rs b/src/ai/mod.rs index 3468325..204da04 100644 --- a/src/ai/mod.rs +++ b/src/ai/mod.rs @@ -5,6 +5,7 @@ pub mod face_client; pub mod handlers; pub mod insight_chat; pub mod insight_generator; +pub mod llamacpp; pub mod llm_client; pub mod ollama; pub mod openrouter; @@ -20,7 +21,8 @@ pub use handlers::{ chat_history_handler, chat_rewind_handler, chat_stream_handler, chat_turn_handler, delete_insight_handler, export_training_data_handler, generate_agentic_insight_handler, generate_insight_handler, get_all_insights_handler, get_available_models_handler, - get_insight_handler, get_openrouter_models_handler, rate_insight_handler, + get_insight_handler, get_llamacpp_models_handler, get_openrouter_models_handler, + rate_insight_handler, }; pub use insight_generator::InsightGenerator; 
#[allow(unused_imports)] diff --git a/src/bin/populate_knowledge.rs b/src/bin/populate_knowledge.rs index 29945d7..71f2f8a 100644 --- a/src/bin/populate_knowledge.rs +++ b/src/bin/populate_knowledge.rs @@ -195,6 +195,7 @@ async fn main() -> anyhow::Result<()> { let generator = InsightGenerator::new( ollama, None, + None, sms_client, apollo_client, insight_dao.clone(), diff --git a/src/main.rs b/src/main.rs index 63013ce..3bd3656 100644 --- a/src/main.rs +++ b/src/main.rs @@ -313,6 +313,7 @@ fn main() -> std::io::Result<()> { .service(ai::get_all_insights_handler) .service(ai::get_available_models_handler) .service(ai::get_openrouter_models_handler) + .service(ai::get_llamacpp_models_handler) .service(ai::chat_turn_handler) .service(ai::chat_stream_handler) .service(ai::chat_history_handler) diff --git a/src/state.rs b/src/state.rs index 8f1bd4e..96d1c22 100644 --- a/src/state.rs +++ b/src/state.rs @@ -2,6 +2,7 @@ use crate::ai::apollo_client::ApolloClient; use crate::ai::clip_client::ClipClient; use crate::ai::face_client::FaceClient; use crate::ai::insight_chat::{ChatLockMap, InsightChatService}; +use crate::ai::llamacpp::LlamaCppClient; use crate::ai::openrouter::OpenRouterClient; use crate::ai::{InsightGenerator, OllamaClient, SmsApiClient}; use crate::database::{ @@ -62,6 +63,16 @@ pub struct AppState { /// Curated list of OpenRouter model ids exposed to clients. Sourced from /// `OPENROUTER_ALLOWED_MODELS` (comma-separated). Empty when unset. pub openrouter_allowed_models: Vec, + /// `None` when `LLAMA_SWAP_URL` is not configured. Consulted only when a + /// request explicitly opts into `backend=llamacpp`. Same shape as the + /// `openrouter` slot — present here so handlers can route to it without + /// threading through the generator. + #[allow(dead_code)] + pub llamacpp: Option>, + /// Curated list of llama-swap model ids exposed to clients. Sourced from + /// `LLAMA_SWAP_ALLOWED_MODELS` (comma-separated). 
Empty when unset; the + /// server then falls back to `LLAMA_SWAP_PRIMARY_MODEL`. + pub llamacpp_allowed_models: Vec, pub sms_client: SmsApiClient, pub insight_generator: InsightGenerator, /// Chat continuation service. Hold an Arc so handlers can clone cheaply. @@ -105,6 +116,8 @@ impl AppState { ollama: OllamaClient, openrouter: Option>, openrouter_allowed_models: Vec, + llamacpp: Option>, + llamacpp_allowed_models: Vec, sms_client: SmsApiClient, insight_generator: InsightGenerator, insight_chat: Arc, @@ -145,6 +158,8 @@ impl AppState { ollama, openrouter, openrouter_allowed_models, + llamacpp, + llamacpp_allowed_models, sms_client, insight_generator, insight_chat, @@ -186,6 +201,9 @@ impl Default for AppState { let openrouter = build_openrouter_from_env(); let openrouter_allowed_models = parse_openrouter_allowed_models(); + let llamacpp = build_llamacpp_from_env(); + let llamacpp_allowed_models = parse_llamacpp_allowed_models(); + let sms_api_url = env::var("SMS_API_URL").unwrap_or_else(|_| "http://localhost:8000".to_string()); let sms_api_token = env::var("SMS_API_TOKEN").ok(); @@ -250,6 +268,7 @@ impl Default for AppState { let insight_generator = InsightGenerator::new( ollama.clone(), openrouter.clone(), + llamacpp.clone(), sms_client.clone(), apollo_client.clone(), insight_dao.clone(), @@ -273,6 +292,7 @@ impl Default for AppState { Arc::new(insight_generator.clone()), ollama.clone(), openrouter.clone(), + llamacpp.clone(), insight_dao.clone(), chat_locks, )); @@ -294,6 +314,8 @@ impl Default for AppState { ollama, openrouter, openrouter_allowed_models, + llamacpp, + llamacpp_allowed_models, sms_client, insight_generator, insight_chat, @@ -335,6 +357,50 @@ fn parse_openrouter_allowed_models() -> Vec { .collect() } +/// Build a `LlamaCppClient` from environment variables. Returns `None` when +/// `LLAMA_SWAP_URL` is unset (the llamacpp backend is then unavailable and +/// requests for it return a clear error). 
The slot ids default to the +/// names the bundled `llama-swap/config.yaml` uses — `chat` / `vision` / +/// `embed` — so a minimal deploy only needs to set `LLAMA_SWAP_URL`. +fn build_llamacpp_from_env() -> Option<Arc<LlamaCppClient>> { + let base_url = env::var("LLAMA_SWAP_URL").ok()?; + let primary_model = env::var("LLAMA_SWAP_PRIMARY_MODEL").ok(); + let mut client = LlamaCppClient::new(Some(base_url), primary_model); + if let Ok(model) = env::var("LLAMA_SWAP_EMBEDDING_MODEL") { + client.set_embedding_model(model); + } + if let Ok(model) = env::var("LLAMA_SWAP_VISION_MODEL") { + client.set_vision_model(model); + } + client.set_vision_models(parse_llamacpp_vision_models()); + Some(Arc::new(client)) +} + +/// Parse `LLAMA_SWAP_ALLOWED_MODELS` (comma-separated) into a vec. Used to +/// drive `/insights/llamacpp/models`; empty when unset. +fn parse_llamacpp_allowed_models() -> Vec<String> { + env::var("LLAMA_SWAP_ALLOWED_MODELS") + .unwrap_or_default() + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect() +} + +/// Parse `LLAMA_SWAP_VISION_MODELS` (comma-separated) — slot ids that report +/// `has_vision = true` in capability lookups. The configured `vision_model` +/// (default `vision`) is always considered vision-capable regardless of this +/// list, so a deploy that only uses the default vision slot can leave it +/// unset.
+fn parse_llamacpp_vision_models() -> Vec<String> { + env::var("LLAMA_SWAP_VISION_MODELS") + .unwrap_or_default() + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect() +} + #[cfg(test)] impl AppState { /// Creates an AppState instance for testing with temporary directories @@ -397,6 +463,7 @@ impl AppState { let insight_generator = InsightGenerator::new( ollama.clone(), None, + None, sms_client.clone(), apollo_client.clone(), insight_dao.clone(), @@ -418,6 +485,7 @@ impl AppState { Arc::new(insight_generator.clone()), ollama.clone(), None, + None, insight_dao.clone(), chat_locks, )); @@ -445,6 +513,8 @@ impl AppState { ollama, None, Vec::new(), + None, + Vec::new(), sms_client, insight_generator, insight_chat,