diff --git a/.env.example b/.env.example index 63d8436..81fab4e 100644 --- a/.env.example +++ b/.env.example @@ -53,26 +53,30 @@ AGENTIC_CHAT_MAX_ITERATIONS=6 # OPENROUTER_HTTP_REFERER=https://your-site.example # OPENROUTER_APP_TITLE=ImageApi +# ── AI Insights — local backend switch ────────────────────────────────── +# Picks which local LLM stack the server uses for chat, vision describe, +# and embeddings. `ollama` (default) uses the OLLAMA_* settings above; +# `llamacpp` uses the LLAMA_SWAP_* settings below. The switch is global +# and applies to both `backend=local` and `backend=hybrid` (hybrid keeps +# chat on OpenRouter but still uses this stack for the describe pass). +# Don't flip mid-deploy without re-embedding existing index rows — +# mixed vector spaces break similarity search. +# LLM_BACKEND=ollama + # ── AI Insights — llama.cpp / llama-swap (optional) ───────────────────── -# Set LLAMA_SWAP_URL to enable the `llamacpp` chat_backend. Talks -# OpenAI-compatible /v1 to a llama-swap proxy that fronts per-slot -# llama-server instances (chat / vision / embed). Like hybrid, the -# agentic loop describes images via the vision slot then inlines the -# text into the chat slot — so the chat slot itself can be text-only. +# Set LLAMA_SWAP_URL plus LLM_BACKEND=llamacpp to swap the local stack +# off Ollama. Talks OpenAI-compatible /v1 to a llama-swap proxy fronting +# per-slot llama-server instances (chat / vision / embed). The chat slot +# is treated as text-only — images are pre-described via the vision slot +# and inlined into the prompt. # LLAMA_SWAP_URL=http://localhost:9292/v1 # LLAMA_SWAP_PRIMARY_MODEL=chat # LLAMA_SWAP_VISION_MODEL=vision # LLAMA_SWAP_EMBEDDING_MODEL=embed -# Comma-separated allowlist of model ids the /v1/models endpoint should -# advertise as vision-capable (llama-swap doesn't report modality). -# LLAMA_SWAP_VISION_MODELS=vision -# Comma-separated allowlist surfaced by /insights/llamacpp/models. +# Comma-separated allowlist surfaced by /insights/models when +# LLM_BACKEND=llamacpp. # LLAMA_SWAP_ALLOWED_MODELS=chat,vision,embed -# LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=120 -# Routes hybrid mode's vision-describe pass through llama-swap's vision -# slot instead of Ollama (chat still goes to OpenRouter). Values: -# `ollama` (default) | `llamacpp`. -# HYBRID_VISION_BACKEND=ollama +# LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180 # ── AI Insights — sibling services (optional) ─────────────────────────── # Apollo (places, face inference, CLIP encoders). Single-Apollo deploys diff --git a/CLAUDE.md b/CLAUDE.md index d3419e6..d06b29f 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -473,9 +473,8 @@ GET /memories?path=...&recursive=true POST /insights/generate (non-agentic single-shot) POST /insights/generate/agentic (tool-calling loop; body: { file_path, backend?, model?, ... }) GET /insights?path=...&library=... -GET /insights/models (local Ollama models + capabilities) +GET /insights/models (local-backend models + capabilities; Ollama OR llama-swap based on LLM_BACKEND) GET /insights/openrouter/models (curated OpenRouter allowlist) -GET /insights/llamacpp/models (curated llama-swap slot allowlist) POST /insights/rate (thumbs up/down for training data) // Insight Chat Continuation @@ -632,22 +631,27 @@ OPENROUTER_EMBEDDING_MODEL=openai/text-embedding-3-small # Optional, embeddings OPENROUTER_HTTP_REFERER=https://your-site.example # Optional attribution header OPENROUTER_APP_TITLE=ImageApi # Optional attribution header -# llama.cpp / llama-swap (Llamacpp Backend) - sibling to Ollama; OpenAI-compatible +# Local LLM backend switch. `ollama` (default) keeps the OLLAMA_* settings +# above; `llamacpp` swaps the entire local stack (chat + vision describe + +# embeddings) over to llama-swap. The switch is global and applies to +# `backend=local` requests and to `backend=hybrid`'s describe pass (hybrid +# chat still goes to OpenRouter). Don't flip mid-deploy without +# re-embedding — mixed vector spaces break similarity search. +LLM_BACKEND=ollama + +# llama.cpp / llama-swap (used when LLM_BACKEND=llamacpp). OpenAI-compatible # proxy hosting one or more llama-server processes (chat / vision / embed slots). -LLAMA_SWAP_URL=http://localhost:9292/v1 # Required to enable llamacpp backend +LLAMA_SWAP_URL=http://localhost:9292/v1 # Required when LLM_BACKEND=llamacpp LLAMA_SWAP_PRIMARY_MODEL=chat # Chat slot id (matches config.yaml) -LLAMA_SWAP_VISION_MODEL=vision # Vision slot id; describe_image routes here -LLAMA_SWAP_EMBEDDING_MODEL=embed # Embedding slot id (when local embeddings via llamacpp) -LLAMA_SWAP_VISION_MODELS=qwen-vl,llava # Comma-separated slot ids known to have vision. - # Drives `has_vision` in /insights/llamacpp/models. - # `LLAMA_SWAP_VISION_MODEL` is auto-included. -LLAMA_SWAP_ALLOWED_MODELS=chat,coder # Curated allowlist exposed to clients via - # GET /insights/llamacpp/models. Empty = no picker. +LLAMA_SWAP_VISION_MODEL=vision # Vision slot id; describe_image routes here. + # The only slot reported as has_vision=true in + # /insights/models — chat slots are treated as + # text-only (images pre-described and inlined). +LLAMA_SWAP_EMBEDDING_MODEL=embed # Embedding slot id +LLAMA_SWAP_ALLOWED_MODELS=chat,coder # Curated allowlist surfaced by GET /insights/models + # when LLM_BACKEND=llamacpp. Empty = picker shows + # only the configured primary model. LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180 # Per-request timeout; bump for slow CPU offload -HYBRID_VISION_BACKEND=llamacpp # Optional override for hybrid mode's describe_image: - # `ollama` (default) or `llamacpp`. When `llamacpp`, - # hybrid still routes chat to OpenRouter but uses - # llama-swap's vision slot to describe images. # Insight Chat Continuation AGENTIC_CHAT_MAX_ITERATIONS=6 # Cap on tool-calling iterations per chat turn (default 6) @@ -668,13 +672,36 @@ The `OllamaClient` provides methods to query available models: This allows runtime verification of model availability before generating insights. +**Local backend switch (`LLM_BACKEND`):** + +One env var decides which "local" stack the server runs against — `ollama` +(default) or `llamacpp`. It's global on purpose: chat, vision describe, and +embeddings all route through the same backend, so the embedding-vector +column in SQLite stays in one vector space. Don't flip mid-deploy without +re-embedding the affected rows — similarity search will collapse. + +- `LLM_BACKEND=ollama`: chat and embeddings use Ollama; vision describe + uses Ollama's multimodal model. +- `LLM_BACKEND=llamacpp`: chat hits llama-swap's `chat` slot (which is + treated as text-only — images are pre-described via the `vision` slot + and inlined), embeddings hit the `embed` slot, vision describe hits the + `vision` slot. Requires `LLAMA_SWAP_URL`. + +The per-request `backend=hybrid` override is orthogonal: it always sends +chat to OpenRouter, but the describe pass still routes through whichever +`LLM_BACKEND` is configured. + +`GET /insights/models` returns the local-backend models with capabilities +in the same envelope shape regardless of `LLM_BACKEND`: Ollama servers +when `ollama`, llama-swap slots (from `LLAMA_SWAP_ALLOWED_MODELS`) when +`llamacpp`. No `/insights/llamacpp/models` — the picker reads a single +endpoint. + **Hybrid Backend (OpenRouter):** - Per-request opt-in via `backend=hybrid` on `POST /insights/generate/agentic`. - Vision describe happens before the agentic loop; the description is inlined - into the chat prompt and the agentic loop runs on OpenRouter. By default - vision uses local Ollama, but `HYBRID_VISION_BACKEND=llamacpp` flips it to - llama-swap's vision slot (useful when you want chat on a frontier model and - vision on a local-but-not-Ollama path). + into the chat prompt and the agentic loop runs on OpenRouter. Vision + routes through whichever `LLM_BACKEND` is configured. - `request.model` (if provided) overrides `OPENROUTER_DEFAULT_MODEL` for that call. The mobile picker reads from `OPENROUTER_ALLOWED_MODELS`. - No live capability precheck — the operator-curated allowlist is trusted. @@ -682,29 +709,14 @@ This allows runtime verification of model availability before generating insight - `GET /insights/openrouter/models` returns `{ models, default_model, configured }` for client picker UIs. -**Llamacpp Backend (llama-swap):** -- Per-request opt-in via `backend=llamacpp` on `POST /insights/generate/agentic`. -- Sibling to Ollama: a local OpenAI-compatible proxy (mostlygeek/llama-swap) - fronting one or more `llama-server` processes. The chat slot is text-only - by default; vision and embeddings have their own slots (`LLAMA_SWAP_VISION_MODEL`, - `LLAMA_SWAP_EMBEDDING_MODEL`) that llama-swap routes to by model id. The - bundled `docker-compose.yml` + `llama-swap/config.yaml` in the opencode root - is the reference deploy. -- Operates in the same describe-then-inline shape as hybrid: the chat model - never sees raw images. Vision describe routes through llama-swap's vision - slot (`describe_image` on `LlamaCppClient`). -- `request.model` (if provided) overrides `LLAMA_SWAP_PRIMARY_MODEL` for that - call (must match a slot id in llama-swap's `config.yaml`). The mobile picker - reads from `LLAMA_SWAP_ALLOWED_MODELS`. -- No live capability precheck — slot ids are trusted. Tool calling is assumed - for every slot (llama-swap entries typically launch with `--jinja`). -- `GET /insights/llamacpp/models` returns `{ models, default_model, configured }`. -- Cross-replay matrix (chat continuation): `local ↔ llamacpp` allowed (the - LlamaCppClient passes images through to the chat slot — you're responsible - for a vision-capable slot if the stored transcript carries images); - `hybrid ↔ llamacpp` allowed (both transcripts are text-only); `local → - hybrid` and `llamacpp → hybrid` rejected (mid-conversation description - source change isn't supported). +**Cross-replay matrix (chat continuation):** +- `local → local` allowed (whether served by Ollama or llama-swap; that's + a deploy-time decision, not a request-time one). +- `hybrid → hybrid` allowed. +- `hybrid → local` allowed (the inlined description replays as text). +- `local → hybrid` rejected — the stored transcript has raw images in the + first user message and OpenRouter providers don't accept that shape + consistently. Regenerate the insight in hybrid mode instead. **Insight Chat Continuation:** diff --git a/src/ai/handlers.rs b/src/ai/handlers.rs index 4809b25..c86a5c8 100644 --- a/src/ai/handlers.rs +++ b/src/ai/handlers.rs @@ -470,7 +470,16 @@ pub async fn generate_agentic_insight_handler( } } -/// GET /insights/models - List available models from both servers with capabilities +/// GET /insights/models - Local-backend models with capabilities. Returns +/// Ollama servers when `LLM_BACKEND=ollama` (default), or llama-swap slots +/// when `LLM_BACKEND=llamacpp`. Same envelope shape either way so the +/// client picker doesn't have to branch on backend kind. +/// +/// For llama-swap: `models` comes verbatim from `LLAMA_SWAP_ALLOWED_MODELS` +/// (no live `/v1/models` probe), `has_vision` is true only for the +/// configured `LLAMA_SWAP_VISION_MODEL` slot id, and `has_tool_calling` is +/// reported as true for every slot (llama-server is launched with `--jinja` +/// by convention — a misconfigured slot surfaces as a chat-call error). #[get("/insights/models")] pub async fn get_available_models_handler( _claims: Claims, @@ -478,6 +487,29 @@ pub async fn get_available_models_handler( ) -> impl Responder { log::debug!("Fetching available models with capabilities"); + if crate::ai::local_backend_is_llamacpp() + && let Some(lc) = app_state.llamacpp.as_ref() + { + let models: Vec = app_state + .llamacpp_allowed_models + .iter() + .map(|name| ModelCapabilities { + name: name.clone(), + has_vision: name == &lc.vision_model, + has_tool_calling: true, + }) + .collect(); + let primary = ServerModels { + url: lc.base_url.clone(), + models, + default_model: lc.primary_model.clone(), + }; + return HttpResponse::Ok().json(AvailableModelsResponse { + primary, + fallback: None, + }); + } + let ollama_client = &app_state.ollama; // Fetch models with capabilities from primary server @@ -549,36 +581,6 @@ pub async fn get_openrouter_models_handler( HttpResponse::Ok().json(response) } -#[derive(serde::Serialize)] -pub struct LlamaCppModelsResponse { - pub models: Vec, - pub default_model: Option, - pub configured: bool, -} - -/// GET /insights/llamacpp/models - Curated llama-swap model ids exposed -/// to clients for the llamacpp backend. Returned verbatim from -/// `LLAMA_SWAP_ALLOWED_MODELS`; no live call to llama-swap. Use -/// `LLAMA_SWAP_URL` plus `LLAMA_SWAP_PRIMARY_MODEL` on the server side to -/// pick the actual chat slot. -#[get("/insights/llamacpp/models")] -pub async fn get_llamacpp_models_handler( - _claims: Claims, - app_state: web::Data, -) -> impl Responder { - let configured = app_state.llamacpp.is_some(); - let default_model = app_state - .llamacpp - .as_ref() - .map(|c| c.primary_model.clone()); - let response = LlamaCppModelsResponse { - models: app_state.llamacpp_allowed_models.clone(), - default_model, - configured, - }; - HttpResponse::Ok().json(response) -} - /// POST /insights/rate - Rate an insight (thumbs up/down for training data) #[post("/insights/rate")] pub async fn rate_insight_handler( diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index 4e87d52..b549dab 100644 --- a/src/ai/insight_chat.rs +++ b/src/ai/insight_chat.rs @@ -309,14 +309,15 @@ impl InsightChatService { .unwrap_or_else(|| stored_backend.clone()); validate_cross_replay(&stored_backend, &effective_backend)?; let is_hybrid = effective_backend == "hybrid"; - let is_llamacpp = effective_backend == "llamacpp"; - let describes_then_inlines = is_hybrid || is_llamacpp; + let local_via_llamacpp = + crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); + let describes_then_inlines = is_hybrid || local_via_llamacpp; span.set_attribute(KeyValue::new("backend", effective_backend.clone())); - // 4. Build the chat backend client. Ollama in local mode, a freshly - // cloned OpenRouter client in hybrid mode, a freshly cloned - // LlamaCppClient in llamacpp mode (clone so per-request - // sampling/model overrides don't leak into shared state). + // 4. Build the chat backend client. Hybrid → OpenRouter; local with + // `LLM_BACKEND=llamacpp` → llama-swap; otherwise Ollama. Clones + // so per-request sampling/model overrides don't leak into shared + // state. let max_iterations = req .max_iterations .unwrap_or(DEFAULT_MAX_ITERATIONS) @@ -353,9 +354,9 @@ impl InsightChatService { c.set_num_ctx(Some(ctx)); } openrouter_client = Some(c); - } else if is_llamacpp { + } else if local_via_llamacpp { let arc = self.llamacpp.as_ref().ok_or_else(|| { - anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured") + anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured") })?; let mut c: LlamaCppClient = (**arc).clone(); if let Some(ref m) = custom_model { @@ -373,8 +374,8 @@ impl InsightChatService { } llamacpp_client = Some(c); } else { - // Local-mode model swap. Build a new client when the chat model - // differs from the configured one (mirrors the agentic pattern). + // Pure local (Ollama): model swap. Build a new client when the + // chat model differs from the configured one. if let Some(ref m) = custom_model && m != &self.ollama.primary_model { @@ -820,8 +821,9 @@ impl InsightChatService { .unwrap_or_else(|| stored_backend.clone()); validate_cross_replay(&stored_backend, &effective_backend)?; let is_hybrid = effective_backend == "hybrid"; - let is_llamacpp = effective_backend == "llamacpp"; - let describes_then_inlines = is_hybrid || is_llamacpp; + let local_via_llamacpp = + crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); + let describes_then_inlines = is_hybrid || local_via_llamacpp; let max_iterations = req .max_iterations @@ -841,9 +843,9 @@ impl InsightChatService { let model_used = chat_backend.primary_model().to_string(); // Tool set — local mode + first user turn carries an image → - // offer describe_photo. Describe-then-inline modes (hybrid / - // llamacpp): visual description was inlined when the insight was - // bootstrapped, no describe tool needed. + // offer describe_photo. Describe-then-inline modes (hybrid OR + // local_via_llamacpp): visual description was inlined when the + // insight was bootstrapped, no describe tool needed. let local_first_user_has_image = messages .iter() .find(|m| m.role == "user") @@ -987,8 +989,9 @@ impl InsightChatService { .unwrap_or_else(|| "default".to_string()); let effective_backend = resolve_bootstrap_backend(req.backend.as_deref())?; let is_hybrid = effective_backend == "hybrid"; - let is_llamacpp = effective_backend == "llamacpp"; - let describes_then_inlines = is_hybrid || is_llamacpp; + let local_via_llamacpp = + crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); + let describes_then_inlines = is_hybrid || local_via_llamacpp; let max_iterations = req .max_iterations @@ -1020,35 +1023,19 @@ impl InsightChatService { _ => None, }); - // Describe-then-inline backends (hybrid, llamacpp): pre-describe the - // image so a text-only chat model gets the visual description inline. - // Vision source: llamacpp's vision slot in llamacpp mode; in hybrid - // mode Ollama by default, llamacpp via `HYBRID_VISION_BACKEND=llamacpp`. + // Describe-then-inline (hybrid OR local_via_llamacpp): pre-describe + // the image so a text-only chat model gets the visual description + // inline. Vision source follows `LLM_BACKEND`: llama-swap when + // `local_via_llamacpp`, else Ollama. let visual_block = if describes_then_inlines { match image_base64.as_deref() { Some(b64) => { - let use_llamacpp_vision = if is_llamacpp { - true - } else { - matches!( - std::env::var("HYBRID_VISION_BACKEND") - .ok() - .as_deref() - .map(|s| s.trim().to_lowercase()) - .as_deref(), - Some("llamacpp") - ) - }; - let described = if use_llamacpp_vision { - match self.llamacpp.as_ref() { - Some(c) => c.describe_image(b64).await, - None => { - log::warn!( - "bootstrap: requested llamacpp vision but LLAMA_SWAP_URL unset; falling back to Ollama" - ); - self.ollama.describe_image(b64).await - } - } + let described = if local_via_llamacpp { + self.llamacpp + .as_ref() + .expect("local_via_llamacpp guarantees Some") + .describe_image(b64) + .await } else { self.ollama.describe_image(b64).await }; @@ -1175,8 +1162,11 @@ impl InsightChatService { /// (boxed because each backend has a different concrete type) and the /// Ollama client used for describe-image / local tool calls. /// - /// `effective_backend` must be one of `"local"`, `"hybrid"`, `"llamacpp"` - /// (validated upstream). + /// `effective_backend` must be one of `"local"` or `"hybrid"` (validated + /// upstream). Hybrid → OpenRouter; local with `LLM_BACKEND=llamacpp` → + /// llama-swap; pure local → Ollama. Returns the dispatched chat client + /// plus the (possibly per-request) Ollama client that the caller uses + /// for non-chat helpers (image describe in non-llamacpp mode, tool ops). fn build_chat_clients( &self, effective_backend: &str, @@ -1206,10 +1196,10 @@ impl InsightChatService { return Ok((Box::new(c), ollama_client)); } - if effective_backend == "llamacpp" { - let arc = self.llamacpp.as_ref().ok_or_else(|| { - anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured") - })?; + // Local mode — env switch decides between Ollama and llama-swap. + if crate::ai::local_backend_is_llamacpp() + && let Some(arc) = self.llamacpp.as_ref() + { let mut c: LlamaCppClient = (**arc).clone(); if let Some(m) = custom_model { c.primary_model = m.to_string(); @@ -1525,41 +1515,26 @@ fn resolve_date_taken_for_context( /// Validate a stored→effective backend transition for a chat continuation. /// Continuation runs against a transcript that was generated with a specific -/// backend; some transitions break the conversation shape: +/// backend; the only blocked transition is `local → hybrid`, because the +/// stored transcript has images embedded in the first user message and the +/// hybrid path (OpenRouter chat with describe-then-inline) can't replay +/// raw image bytes through OpenRouter consistently across providers. +/// `hybrid → local` is allowed (the inlined description replays verbatim +/// as text). /// -/// - `local → hybrid` — the stored transcript has images embedded in the -/// first user message; the openrouter chat client surfaces them through -/// the wire, but vision-only models routed via the hybrid path may not -/// accept that shape consistently across providers. Reject to keep the -/// `regenerate-in-hybrid-mode` workflow as the supported answer. -/// - `llamacpp → hybrid` — the stored transcript already has an inlined -/// visual description produced by llama-swap's vision slot. Switching -/// to hybrid mid-conversation would mix description sources across -/// subsequent turns (any new image in the chat continuation would be -/// described by ollama-vision while the original was described by -/// llama-vision). Reject for consistency. -/// -/// All other transitions are allowed. `local ↔ llamacpp` works because -/// LlamaCppClient passes image content-parts through to the chat slot — -/// the user is responsible for picking a vision-capable chat model in -/// that case. `hybrid ↔ llamacpp` works because both transcripts are -/// text-only (visual description inlined at bootstrap). +/// Whether "local" routes through Ollama or llama-swap is decided at +/// startup by `LLM_BACKEND`; both share the same transcript shape from +/// the chat-replay perspective. fn validate_cross_replay(stored: &str, effective: &str) -> Result<()> { - if !matches!(effective, "local" | "hybrid" | "llamacpp") { + if !matches!(effective, "local" | "hybrid") { bail!( - "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'", + "unknown backend '{}'; expected 'local' or 'hybrid'", effective ); } if stored == "local" && effective == "hybrid" { bail!( - "switching from local to hybrid mid-chat isn't supported yet; \ - regenerate the insight in hybrid mode if you want OpenRouter chat" - ); - } - if stored == "llamacpp" && effective == "hybrid" { - bail!( - "switching from llamacpp to hybrid mid-chat isn't supported yet; \ + "switching from local to hybrid mid-chat isn't supported; \ regenerate the insight in hybrid mode if you want OpenRouter chat" ); } @@ -1576,9 +1551,9 @@ fn resolve_bootstrap_backend(supplied: Option<&str>) -> Result { .map(|s| s.trim().to_lowercase()) .filter(|s| !s.is_empty()) .unwrap_or_else(|| "local".to_string()); - if !matches!(lower.as_str(), "local" | "hybrid" | "llamacpp") { + if !matches!(lower.as_str(), "local" | "hybrid") { bail!( - "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'", + "unknown backend '{}'; expected 'local' or 'hybrid'", lower ); } @@ -2184,10 +2159,6 @@ mod tests { fn bootstrap_backend_accepts_local_and_hybrid_case_insensitively() { assert_eq!(resolve_bootstrap_backend(Some("LOCAL")).unwrap(), "local"); assert_eq!(resolve_bootstrap_backend(Some("Hybrid")).unwrap(), "hybrid"); - assert_eq!( - resolve_bootstrap_backend(Some("Llamacpp")).unwrap(), - "llamacpp" - ); assert_eq!( resolve_bootstrap_backend(Some(" local ")).unwrap(), "local" @@ -2196,10 +2167,13 @@ mod tests { #[test] fn bootstrap_backend_rejects_unknown_label() { - let err = resolve_bootstrap_backend(Some("openrouter")).unwrap_err(); - let msg = format!("{}", err); - assert!(msg.contains("unknown backend")); - assert!(msg.contains("openrouter")); + // `llamacpp` is no longer a per-request backend value — it's chosen + // at deploy time via `LLM_BACKEND`. + for label in &["openrouter", "llamacpp", "ollama"] { + let err = resolve_bootstrap_backend(Some(label)).unwrap_err(); + let msg = format!("{}", err); + assert!(msg.contains("unknown backend"), "label={}", label); + } } #[test] @@ -2209,29 +2183,20 @@ mod tests { } #[test] - fn cross_replay_rejects_llamacpp_to_hybrid() { - let err = validate_cross_replay("llamacpp", "hybrid").unwrap_err(); - assert!(format!("{}", err).contains("llamacpp to hybrid")); - } - - #[test] - fn cross_replay_allows_local_llamacpp_and_hybrid_llamacpp_transitions() { - // Local ↔ llamacpp: user is responsible for picking a vision-capable - // chat slot when the transcript has images. - assert!(validate_cross_replay("local", "llamacpp").is_ok()); - assert!(validate_cross_replay("llamacpp", "local").is_ok()); - // Hybrid ↔ llamacpp: both transcripts are text-only. - assert!(validate_cross_replay("hybrid", "llamacpp").is_ok()); - // Same-backend replays are always fine. + fn cross_replay_allows_supported_transitions() { assert!(validate_cross_replay("local", "local").is_ok()); assert!(validate_cross_replay("hybrid", "hybrid").is_ok()); - assert!(validate_cross_replay("llamacpp", "llamacpp").is_ok()); + // Hybrid → local replays the inlined description as plain text. + assert!(validate_cross_replay("hybrid", "local").is_ok()); } #[test] fn cross_replay_rejects_unknown_effective() { - let err = validate_cross_replay("local", "openrouter").unwrap_err(); - assert!(format!("{}", err).contains("unknown backend")); + // Both "openrouter" and the former "llamacpp" value are unknown now. + for label in &["openrouter", "llamacpp"] { + let err = validate_cross_replay("local", label).unwrap_err(); + assert!(format!("{}", err).contains("unknown backend"), "label={}", label); + } } #[test] diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index 2a11e29..08fcfef 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -471,8 +471,11 @@ impl InsightGenerator { log::info!("RAG QUERY: {}", query); log::info!("========================================"); - // Generate embedding for the query - let query_embedding = self.ollama.generate_embedding(&query).await?; + // Generate embedding for the query via the configured local backend + // (`LLM_BACKEND` switch). Must match the backend that populated the + // daily-summary embeddings or similarity search will be garbage. + let query_embedding = + crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &query).await?; // Search for similar daily summaries with time-based weighting // This prioritizes summaries temporally close to the query date @@ -563,7 +566,7 @@ impl InsightGenerator { let calendar_cx = parent_cx.with_span(span); let query_embedding = if let Some(loc) = location { - match self.ollama.generate_embedding(loc).await { + match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), loc).await { Ok(emb) => Some(emb), Err(e) => { log::warn!("Failed to generate embedding for location '{}': {}", loc, e); @@ -734,16 +737,17 @@ impl InsightGenerator { ) }; - let query_embedding = match self.ollama.generate_embedding(&query_text).await { - Ok(emb) => emb, - Err(e) => { - log::warn!("Failed to generate search embedding: {}", e); - search_cx.span().set_status(Status::Error { - description: e.to_string().into(), - }); - return Ok(None); - } - }; + let query_embedding = + match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &query_text).await { + Ok(emb) => emb, + Err(e) => { + log::warn!("Failed to generate search embedding: {}", e); + search_cx.span().set_status(Status::Error { + description: e.to_string().into(), + }); + return Ok(None); + } + }; let searches = { let mut dao = self @@ -2608,11 +2612,13 @@ Return ONLY the summary, nothing else."#, } } - /// Tool: store_entity — upsert an entity into the knowledge memory + /// Tool: store_entity — upsert an entity into the knowledge memory. + /// Embeddings go through the configured local backend (`LLM_BACKEND`), + /// independent of the per-request chat backend in the caller. async fn tool_store_entity( &self, args: &serde_json::Value, - ollama: &OllamaClient, + _ollama: &OllamaClient, cx: &opentelemetry::Context, ) -> String { use crate::database::models::InsertEntity; @@ -2672,9 +2678,16 @@ Return ONLY the summary, nothing else."#, .collect() }; - // Generate embedding for name + description (best-effort) + // Generate embedding for name + description (best-effort) via the + // configured local backend. let embed_text = format!("{} {}", name, description); - let embedding: Option> = match ollama.generate_embedding(&embed_text).await { + let embedding: Option> = match crate::ai::embed_one( + &self.ollama, + self.llamacpp.as_deref(), + &embed_text, + ) + .await + { Ok(vec) => { let bytes: Vec = vec.iter().flat_map(|f| f.to_le_bytes()).collect(); Some(bytes) @@ -3580,20 +3593,24 @@ Return ONLY the summary, nothing else."#, .map(|s| s.trim().to_lowercase()) .filter(|s| !s.is_empty()) .unwrap_or_else(|| "local".to_string()); - if !matches!(backend_label.as_str(), "local" | "hybrid" | "llamacpp") { + if !matches!(backend_label.as_str(), "local" | "hybrid") { return Err(anyhow::anyhow!( - "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'", + "unknown backend '{}'; expected 'local' or 'hybrid'", backend_label )); } span.set_attribute(KeyValue::new("backend", backend_label.clone())); let is_hybrid = backend_label == "hybrid"; - let is_llamacpp = backend_label == "llamacpp"; - // In hybrid + llamacpp modes the chat model never sees the image - // directly; we describe-then-inline locally before the agentic loop - // starts. Tracked as a single flag so vision/tool-gate logic doesn't - // have to branch twice. - let describes_then_inlines = is_hybrid || is_llamacpp; + // `LLM_BACKEND=llamacpp` swaps Ollama out for llama-swap as the + // "local" stack — chat + vision describe + embeddings all route + // through llama-swap. In hybrid mode this still applies to vision + // describe (chat continues to go to OpenRouter). The chat slot is + // text-only in either case, so we describe-then-inline. + let local_via_llamacpp = + crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); + // Describe-then-inline: hybrid (chat is OpenRouter, text-only) or + // any path where chat goes through llama-swap (chat slot is text-only). + let describes_then_inlines = is_hybrid || local_via_llamacpp; // 1b. Always build an Ollama client. In local mode it owns the chat // loop; in hybrid/llamacpp mode it still handles tool-local calls @@ -3688,13 +3705,14 @@ Return ONLY the summary, nothing else."#, None }; - // 1d. In llamacpp mode, clone the configured LlamaCpp client and - // apply per-request overrides. Same shape as the openrouter - // branch above; describe_image will route through the vision - // slot configured on the client. - let llamacpp_client: Option = if is_llamacpp { + // 1d. When `LLM_BACKEND=llamacpp` and we're in local mode (not + // hybrid — hybrid keeps chat on OpenRouter), clone the llamacpp + // client and apply per-request overrides. Same shape as the + // openrouter branch above; describe_image will route through + // the vision slot configured on the client. + let llamacpp_client: Option = if local_via_llamacpp && !is_hybrid { let arc = self.llamacpp.as_ref().ok_or_else(|| { - anyhow::anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured") + anyhow::anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured") })?; let mut c: LlamaCppClient = (**arc).clone(); if let Some(ref m) = custom_model { @@ -3917,38 +3935,19 @@ Return ONLY the summary, nothing else."#, None }; - // describe-then-inline path. In hybrid mode the vision backend - // defaults to Ollama but can be flipped to llamacpp via - // `HYBRID_VISION_BACKEND=llamacpp` (so chat goes to OpenRouter while - // vision/audio routes through llama-swap). In llamacpp mode we always - // use the llamacpp client's configured vision slot. + // describe-then-inline path. Vision describe routes through whichever + // `LLM_BACKEND` is configured — llama-swap when `local_via_llamacpp` + // is set (even in hybrid mode, since chat is OpenRouter but vision + // stays on the local stack), otherwise Ollama. let inlined_visual_description: Option = if describes_then_inlines { match image_base64.as_deref() { Some(b64) => { - let use_llamacpp_vision = if is_llamacpp { - true - } else { - // is_hybrid branch — consult env switch - matches!( - std::env::var("HYBRID_VISION_BACKEND") - .ok() - .as_deref() - .map(|s| s.trim().to_lowercase()) - .as_deref(), - Some("llamacpp") - ) - }; - - let described = if use_llamacpp_vision { - match self.llamacpp.as_ref() { - Some(c) => c.describe_image(b64).await, - None => { - log::warn!( - "describe-then-inline: requested llamacpp vision but LLAMA_SWAP_URL is unset, falling back to Ollama" - ); - self.ollama.describe_image(b64).await - } - } + let described = if local_via_llamacpp { + self.llamacpp + .as_ref() + .expect("local_via_llamacpp guarantees Some") + .describe_image(b64) + .await } else { self.ollama.describe_image(b64).await }; @@ -4044,10 +4043,10 @@ Return ONLY the summary, nothing else."#, ); // 10. Define tools. Gate flags computed from current data presence; - // describe-then-inline modes (hybrid, llamacpp) omit describe_photo - // since the chat model receives the visual description inline (so - // we pass `false` for has_vision in those modes regardless of the - // model's actual capability). + // describe-then-inline modes (hybrid OR local_via_llamacpp) omit + // describe_photo since the chat model receives the visual + // description inline (so we pass `false` for has_vision in + // those modes regardless of the model's actual capability). let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines); let tools = Self::build_tool_definitions(gate_opts); diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs index 100020c..d23fad5 100644 --- a/src/ai/llamacpp.rs +++ b/src/ai/llamacpp.rs @@ -11,10 +11,10 @@ // `model` field, which is how llama-swap selects which backend process to // run. // - `/v1/models` returns only the configured slot ids — capabilities aren't -// reported by the API, so `vision_models` is a config-time allowlist (env -// `LLAMA_SWAP_VISION_MODELS`) used to set `has_vision` on responses. -// `has_tool_calling` is assumed true for every slot, since llama-swap entries -// default to launching llama-server with `--jinja`. +// reported by the API, so we infer `has_vision` from a single config field +// (`vision_model`, defaulting to `"vision"`) and assume `has_tool_calling` +// is true for every slot, since llama-swap entries default to launching +// llama-server with `--jinja`. // // First consumer lands alongside the three-way backend dispatch in // insight_generator / insight_chat. @@ -50,16 +50,10 @@ pub struct LlamaCppClient { /// Embedding model slot id (e.g. `"embed"`). Used for /// `generate_embeddings`. pub embedding_model: String, - /// Vision model slot id (e.g. `"vision"`). Used for `describe_image` and - /// included in `vision_models` automatically so capability lookups for - /// the default vision slot report `has_vision = true` even when the env - /// allowlist is empty. + /// Vision model slot id (e.g. `"vision"`). Used for `describe_image`, + /// and the only slot that reports `has_vision = true` in capability + /// lookups (llama-swap's `/v1/models` doesn't surface modality). pub vision_model: String, - /// Operator-curated set of slot ids known to be multimodal. Drives the - /// `has_vision` field in `list_models` / `model_capabilities`, since - /// llama-swap's `/v1/models` doesn't report modality. Empty allowlist - /// still marks `vision_model` as vision-capable. - pub vision_models: Vec, num_ctx: Option, temperature: Option, top_p: Option, @@ -83,7 +77,6 @@ impl LlamaCppClient { primary_model: primary_model.unwrap_or_else(|| DEFAULT_PRIMARY_MODEL.to_string()), embedding_model: DEFAULT_EMBEDDING_MODEL.to_string(), vision_model: DEFAULT_VISION_MODEL.to_string(), - vision_models: Vec::new(), num_ctx: None, temperature: None, top_p: None, @@ -100,10 +93,6 @@ impl LlamaCppClient { self.vision_model = model; } - pub fn set_vision_models(&mut self, models: Vec) { - self.vision_models = models; - } - pub fn set_num_ctx(&mut self, num_ctx: Option) { self.num_ctx = num_ctx; } @@ -692,7 +681,7 @@ impl LlamaCppClient { .and_then(|v| v.as_str()) .unwrap_or_default() .to_string(); - let has_vision = name == self.vision_model || self.vision_models.iter().any(|v| v == &name); + let has_vision = name == self.vision_model; // Tool calling is the default for llama-swap entries we configure // (--jinja flag); no negative-list mechanism yet, so report true. ModelCapabilities { @@ -954,25 +943,21 @@ mod tests { } #[test] - fn capability_inference_uses_vision_model_and_allowlist() { + fn capability_inference_marks_only_vision_slot() { let mut c = LlamaCppClient::new(None, Some("chat".into())); c.set_vision_model("vision".into()); - c.set_vision_models(vec!["qwen-vl".into()]); let m_chat = json!({ "id": "chat" }); let m_vision = json!({ "id": "vision" }); - let m_qwen = json!({ "id": "qwen-vl" }); let m_other = json!({ "id": "embed" }); let chat = c.parse_model_capabilities(&m_chat); let vision = c.parse_model_capabilities(&m_vision); - let qwen = c.parse_model_capabilities(&m_qwen); let other = c.parse_model_capabilities(&m_other); assert!(!chat.has_vision); assert!(chat.has_tool_calling); assert!(vision.has_vision); - assert!(qwen.has_vision); assert!(!other.has_vision); } } diff --git a/src/ai/mod.rs b/src/ai/mod.rs index 204da04..8d634fd 100644 --- a/src/ai/mod.rs +++ b/src/ai/mod.rs @@ -21,14 +21,14 @@ pub use handlers::{ chat_history_handler, chat_rewind_handler, chat_stream_handler, chat_turn_handler, delete_insight_handler, export_training_data_handler, generate_agentic_insight_handler, generate_insight_handler, get_all_insights_handler, get_available_models_handler, - get_insight_handler, get_llamacpp_models_handler, get_openrouter_models_handler, - rate_insight_handler, + get_insight_handler, get_openrouter_models_handler, rate_insight_handler, }; pub use insight_generator::InsightGenerator; #[allow(unused_imports)] pub use llm_client::{ ChatMessage, LlmClient, ModelCapabilities, Tool, ToolCall, ToolCallFunction, ToolFunction, }; +pub use llamacpp::LlamaCppClient; pub use ollama::{EMBEDDING_MODEL, OllamaClient}; pub use sms_client::{SmsApiClient, SmsMessage}; @@ -40,3 +40,87 @@ pub use sms_client::{SmsApiClient, SmsMessage}; pub fn user_display_name() -> String { std::env::var("USER_NAME").unwrap_or_else(|_| "Me".to_string()) } + +/// One switch for the "local" LLM stack: when `LLM_BACKEND=llamacpp` is +/// set, chat / vision describe / embeddings all route through llama-swap +/// instead of Ollama. Any other value (including unset, the default) is +/// Ollama. This is intentionally global — embeddings must be drawn from +/// a single source or similarity search across the index breaks (mixed +/// vector spaces, possibly mixed dims). The `backend=hybrid` per-request +/// override remains orthogonal: it always sends chat to OpenRouter, and +/// uses `LLM_BACKEND` for the describe-then-inline vision pass. +pub fn local_backend_is_llamacpp() -> bool { + matches!( + std::env::var("LLM_BACKEND") + .ok() + .as_deref() + .map(|s| s.trim().to_lowercase()) + .as_deref(), + Some("llamacpp") + ) +} + +/// Embed one string via the configured local backend. Routes through +/// llama-swap when `LLM_BACKEND=llamacpp` (and a client is configured), +/// else Ollama. Returns the single embedding vector. See +/// [`local_backend_is_llamacpp`] for the rationale on consistency. +pub async fn embed_one( + ollama: &OllamaClient, + llamacpp: Option<&LlamaCppClient>, + text: &str, +) -> anyhow::Result> { + if local_backend_is_llamacpp() { + if let Some(lc) = llamacpp { + let mut vecs = ::generate_embeddings(lc, &[text]).await?; + return vecs + .pop() + .ok_or_else(|| anyhow::anyhow!("llama-swap returned no embeddings")); + } + log::warn!( + "LLM_BACKEND=llamacpp but LlamaCppClient is unconfigured; falling back to Ollama embeddings" + ); + } + ollama.generate_embedding(text).await +} + +#[cfg(test)] +mod env_dispatch_tests { + use super::*; + + fn with_env(key: &str, val: Option<&str>, f: F) { + let prev = std::env::var(key).ok(); + match val { + Some(v) => unsafe { std::env::set_var(key, v) }, + None => unsafe { std::env::remove_var(key) }, + } + f(); + match prev { + Some(v) => unsafe { std::env::set_var(key, v) }, + None => unsafe { std::env::remove_var(key) }, + } + } + + #[test] + fn llm_backend_defaults_to_ollama() { + with_env("LLM_BACKEND", None, || { + assert!(!local_backend_is_llamacpp()); + }); + } + + #[test] + fn llm_backend_llamacpp_case_insensitive() { + with_env("LLM_BACKEND", Some("LlamaCpp"), || { + assert!(local_backend_is_llamacpp()); + }); + with_env("LLM_BACKEND", Some(" llamacpp "), || { + assert!(local_backend_is_llamacpp()); + }); + } + + #[test] + fn llm_backend_unknown_value_is_ollama() { + with_env("LLM_BACKEND", Some("vllm"), || { + assert!(!local_backend_is_llamacpp()); + }); + } +} diff --git a/src/main.rs b/src/main.rs index 3bd3656..63013ce 100644 --- a/src/main.rs +++ b/src/main.rs @@ -313,7 +313,6 @@ fn main() -> std::io::Result<()> { .service(ai::get_all_insights_handler) .service(ai::get_available_models_handler) .service(ai::get_openrouter_models_handler) - .service(ai::get_llamacpp_models_handler) .service(ai::chat_turn_handler) .service(ai::chat_stream_handler) .service(ai::chat_history_handler) diff --git a/src/state.rs b/src/state.rs index 96d1c22..c4f810a 100644 --- a/src/state.rs +++ b/src/state.rs @@ -358,10 +358,11 @@ fn parse_openrouter_allowed_models() -> Vec { } /// Build a `LlamaCppClient` from environment variables. Returns `None` when -/// `LLAMA_SWAP_URL` is unset (the llamacpp backend is then unavailable and -/// requests for it return a clear error). The slot ids default to the -/// names the bundled `llama-swap/config.yaml` uses — `chat` / `vision` / -/// `embed` — so a minimal deploy only needs to set `LLAMA_SWAP_URL`. +/// `LLAMA_SWAP_URL` is unset. The client is constructed unconditionally +/// when the URL is set (so it's available even under `LLM_BACKEND=ollama` +/// for ad-hoc tooling), but the agentic / chat paths only route through it +/// when `LLM_BACKEND=llamacpp`. Slot ids default to the names the bundled +/// `llama-swap/config.yaml` uses — `chat` / `vision` / `embed`. fn build_llamacpp_from_env() -> Option> { let base_url = env::var("LLAMA_SWAP_URL").ok()?; let primary_model = env::var("LLAMA_SWAP_PRIMARY_MODEL").ok(); @@ -372,12 +373,12 @@ fn build_llamacpp_from_env() -> Option> { if let Ok(model) = env::var("LLAMA_SWAP_VISION_MODEL") { client.set_vision_model(model); } - client.set_vision_models(parse_llamacpp_vision_models()); Some(Arc::new(client)) } /// Parse `LLAMA_SWAP_ALLOWED_MODELS` (comma-separated) into a vec. Used to -/// drive `/insights/llamacpp/models`; empty when unset. +/// populate the model picker when `LLM_BACKEND=llamacpp` — `/insights/models` +/// surfaces these slots with capabilities. Empty when unset. fn parse_llamacpp_allowed_models() -> Vec { env::var("LLAMA_SWAP_ALLOWED_MODELS") .unwrap_or_default() @@ -387,20 +388,6 @@ fn parse_llamacpp_allowed_models() -> Vec { .collect() } -/// Parse `LLAMA_SWAP_VISION_MODELS` (comma-separated) — slot ids that report -/// `has_vision = true` in capability lookups. The configured `vision_model` -/// (default `vision`) is always considered vision-capable regardless of this -/// list, so a deploy that only uses the default vision slot can leave it -/// unset. -fn parse_llamacpp_vision_models() -> Vec { - env::var("LLAMA_SWAP_VISION_MODELS") - .unwrap_or_default() - .split(',') - .map(|s| s.trim().to_string()) - .filter(|s| !s.is_empty()) - .collect() -} - #[cfg(test)] impl AppState { /// Creates an AppState instance for testing with temporary directories