ai: collapse llamacpp into LLM_BACKEND env switch

Reverts the per-request backend="llamacpp" value. Chat/vision/embedding backend is now a deploy-time decision (LLM_BACKEND=ollama|llamacpp), applied globally across chat, vision describe, and embeddings — so embedding vectors stay in one space across the index.

- Per-request backend whitelist is back to "local"|"hybrid". A request arriving with backend="llamacpp" is rejected.
- LLM_BACKEND=llamacpp swaps the entire local stack to llama-swap: chat hits the chat slot, describe hits the vision slot, embeddings hit the embed slot. Hybrid mode still routes chat to OpenRouter but uses LLM_BACKEND for the describe pass.
- Drops env vars HYBRID_VISION_BACKEND, LLAMA_SWAP_VISION_MODELS, EMBEDDING_BACKEND (the last never shipped). Drops the LlamaCppClient.vision_models allowlist — capability inference now reports has_vision only for the configured vision_model slot.
- Drops the /insights/llamacpp/models handler. /insights/models is the single endpoint; it returns Ollama servers under LLM_BACKEND=ollama and llama-swap slots (from LLAMA_SWAP_ALLOWED_MODELS) under LLM_BACKEND=llamacpp. Same envelope shape either way.
- New ai::embed_one helper routes embeddings through llama-swap when LLM_BACKEND=llamacpp (else Ollama). Wires it into the four insight_generator embedding sites.
- Cross-replay matrix simplifies to the pre-llamacpp shape (local↔local, hybrid↔hybrid, hybrid→local allowed; local→hybrid rejected).
2026-05-21 11:36:58 -04:00
parent d14df63f19
commit be51421b38
9 changed files with 338 additions and 301 deletions
@@ -309,14 +309,15 @@ impl InsightChatService {
            .unwrap_or_else(|| stored_backend.clone());
        validate_cross_replay(&stored_backend, &effective_backend)?;
        let is_hybrid = effective_backend == "hybrid";
-        let is_llamacpp = effective_backend == "llamacpp";
-        let describes_then_inlines = is_hybrid || is_llamacpp;
+        let local_via_llamacpp =
+            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
+        let describes_then_inlines = is_hybrid || local_via_llamacpp;
        span.set_attribute(KeyValue::new("backend", effective_backend.clone()));

-        // 4. Build the chat backend client. Ollama in local mode, a freshly
-        //    cloned OpenRouter client in hybrid mode, a freshly cloned
-        //    LlamaCppClient in llamacpp mode (clone so per-request
-        //    sampling/model overrides don't leak into shared state).
+        // 4. Build the chat backend client. Hybrid → OpenRouter; local with
+        //    `LLM_BACKEND=llamacpp` → llama-swap; otherwise Ollama. Clones
+        //    so per-request sampling/model overrides don't leak into shared
+        //    state.
        let max_iterations = req
            .max_iterations
            .unwrap_or(DEFAULT_MAX_ITERATIONS)
@@ -353,9 +354,9 @@ impl InsightChatService {
                c.set_num_ctx(Some(ctx));
            }
            openrouter_client = Some(c);
-        } else if is_llamacpp {
+        } else if local_via_llamacpp {
            let arc = self.llamacpp.as_ref().ok_or_else(|| {
-                anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
+                anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured")
            })?;
            let mut c: LlamaCppClient = (**arc).clone();
            if let Some(ref m) = custom_model {
@@ -373,8 +374,8 @@ impl InsightChatService {
            }
            llamacpp_client = Some(c);
        } else {
-            // Local-mode model swap. Build a new client when the chat model
-            // differs from the configured one (mirrors the agentic pattern).
+            // Pure local (Ollama): model swap. Build a new client when the
+            // chat model differs from the configured one.
            if let Some(ref m) = custom_model
                && m != &self.ollama.primary_model
            {
@@ -820,8 +821,9 @@ impl InsightChatService {
            .unwrap_or_else(|| stored_backend.clone());
        validate_cross_replay(&stored_backend, &effective_backend)?;
        let is_hybrid = effective_backend == "hybrid";
-        let is_llamacpp = effective_backend == "llamacpp";
-        let describes_then_inlines = is_hybrid || is_llamacpp;
+        let local_via_llamacpp =
+            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
+        let describes_then_inlines = is_hybrid || local_via_llamacpp;

        let max_iterations = req
            .max_iterations
@@ -841,9 +843,9 @@ impl InsightChatService {
        let model_used = chat_backend.primary_model().to_string();

        // Tool set — local mode + first user turn carries an image →
-        // offer describe_photo. Describe-then-inline modes (hybrid /
-        // llamacpp): visual description was inlined when the insight was
-        // bootstrapped, no describe tool needed.
+        // offer describe_photo. Describe-then-inline modes (hybrid OR
+        // local_via_llamacpp): visual description was inlined when the
+        // insight was bootstrapped, no describe tool needed.
        let local_first_user_has_image = messages
            .iter()
            .find(|m| m.role == "user")
@@ -987,8 +989,9 @@ impl InsightChatService {
            .unwrap_or_else(|| "default".to_string());
        let effective_backend = resolve_bootstrap_backend(req.backend.as_deref())?;
        let is_hybrid = effective_backend == "hybrid";
-        let is_llamacpp = effective_backend == "llamacpp";
-        let describes_then_inlines = is_hybrid || is_llamacpp;
+        let local_via_llamacpp =
+            crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
+        let describes_then_inlines = is_hybrid || local_via_llamacpp;

        let max_iterations = req
            .max_iterations
@@ -1020,35 +1023,19 @@ impl InsightChatService {
                _ => None,
            });

-        // Describe-then-inline backends (hybrid, llamacpp): pre-describe the
-        // image so a text-only chat model gets the visual description inline.
-        // Vision source: llamacpp's vision slot in llamacpp mode; in hybrid
-        // mode Ollama by default, llamacpp via `HYBRID_VISION_BACKEND=llamacpp`.
+        // Describe-then-inline (hybrid OR local_via_llamacpp): pre-describe
+        // the image so a text-only chat model gets the visual description
+        // inline. Vision source follows `LLM_BACKEND`: llama-swap when
+        // `local_via_llamacpp`, else Ollama.
        let visual_block = if describes_then_inlines {
            match image_base64.as_deref() {
                Some(b64) => {
-                    let use_llamacpp_vision = if is_llamacpp {
-                        true
-                    } else {
-                        matches!(
-                            std::env::var("HYBRID_VISION_BACKEND")
-                                .ok()
-                                .as_deref()
-                                .map(|s| s.trim().to_lowercase())
-                                .as_deref(),
-                            Some("llamacpp")
-                        )
-                    };
-                    let described = if use_llamacpp_vision {
-                        match self.llamacpp.as_ref() {
-                            Some(c) => c.describe_image(b64).await,
-                            None => {
-                                log::warn!(
-                                    "bootstrap: requested llamacpp vision but LLAMA_SWAP_URL unset; falling back to Ollama"
-                                );
-                                self.ollama.describe_image(b64).await
-                            }
-                        }
+                    let described = if local_via_llamacpp {
+                        self.llamacpp
+                            .as_ref()
+                            .expect("local_via_llamacpp guarantees Some")
+                            .describe_image(b64)
+                            .await
                    } else {
                        self.ollama.describe_image(b64).await
                    };
@@ -1175,8 +1162,11 @@ impl InsightChatService {
    /// (boxed because each backend has a different concrete type) and the
    /// Ollama client used for describe-image / local tool calls.
    ///
-    /// `effective_backend` must be one of `"local"`, `"hybrid"`, `"llamacpp"`
-    /// (validated upstream).
+    /// `effective_backend` must be one of `"local"` or `"hybrid"` (validated
+    /// upstream). Hybrid → OpenRouter; local with `LLM_BACKEND=llamacpp` →
+    /// llama-swap; pure local → Ollama. Returns the dispatched chat client
+    /// plus the (possibly per-request) Ollama client that the caller uses
+    /// for non-chat helpers (image describe in non-llamacpp mode, tool ops).
    fn build_chat_clients(
        &self,
        effective_backend: &str,
@@ -1206,10 +1196,10 @@ impl InsightChatService {
            return Ok((Box::new(c), ollama_client));
        }

-        if effective_backend == "llamacpp" {
-            let arc = self.llamacpp.as_ref().ok_or_else(|| {
-                anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
-            })?;
+        // Local mode — env switch decides between Ollama and llama-swap.
+        if crate::ai::local_backend_is_llamacpp()
+            && let Some(arc) = self.llamacpp.as_ref()
+        {
            let mut c: LlamaCppClient = (**arc).clone();
            if let Some(m) = custom_model {
                c.primary_model = m.to_string();
@@ -1525,41 +1515,26 @@ fn resolve_date_taken_for_context(

 /// Validate a stored→effective backend transition for a chat continuation.
 /// Continuation runs against a transcript that was generated with a specific
-/// backend; some transitions break the conversation shape:
+/// backend; the only blocked transition is `local → hybrid`, because the
+/// stored transcript has images embedded in the first user message and the
+/// hybrid path (OpenRouter chat with describe-then-inline) can't replay
+/// raw image bytes through OpenRouter consistently across providers.
+/// `hybrid → local` is allowed (the inlined description replays verbatim
+/// as text).
 ///
-/// - `local → hybrid` — the stored transcript has images embedded in the
-///   first user message; the openrouter chat client surfaces them through
-///   the wire, but vision-only models routed via the hybrid path may not
-///   accept that shape consistently across providers. Reject to keep the
-///   `regenerate-in-hybrid-mode` workflow as the supported answer.
-/// - `llamacpp → hybrid` — the stored transcript already has an inlined
-///   visual description produced by llama-swap's vision slot. Switching
-///   to hybrid mid-conversation would mix description sources across
-///   subsequent turns (any new image in the chat continuation would be
-///   described by ollama-vision while the original was described by
-///   llama-vision). Reject for consistency.
-///
-/// All other transitions are allowed. `local ↔ llamacpp` works because
-/// LlamaCppClient passes image content-parts through to the chat slot —
-/// the user is responsible for picking a vision-capable chat model in
-/// that case. `hybrid ↔ llamacpp` works because both transcripts are
-/// text-only (visual description inlined at bootstrap).
+/// Whether "local" routes through Ollama or llama-swap is decided at
+/// startup by `LLM_BACKEND`; both share the same transcript shape from
+/// the chat-replay perspective.
 fn validate_cross_replay(stored: &str, effective: &str) -> Result<()> {
-    if !matches!(effective, "local" | "hybrid" | "llamacpp") {
+    if !matches!(effective, "local" | "hybrid") {
        bail!(
-            "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
+            "unknown backend '{}'; expected 'local' or 'hybrid'",
            effective
        );
    }
    if stored == "local" && effective == "hybrid" {
        bail!(
-            "switching from local to hybrid mid-chat isn't supported yet; \
-             regenerate the insight in hybrid mode if you want OpenRouter chat"
-        );
-    }
-    if stored == "llamacpp" && effective == "hybrid" {
-        bail!(
-            "switching from llamacpp to hybrid mid-chat isn't supported yet; \
+            "switching from local to hybrid mid-chat isn't supported; \
             regenerate the insight in hybrid mode if you want OpenRouter chat"
        );
    }
@@ -1576,9 +1551,9 @@ fn resolve_bootstrap_backend(supplied: Option<&str>) -> Result<String> {
        .map(|s| s.trim().to_lowercase())
        .filter(|s| !s.is_empty())
        .unwrap_or_else(|| "local".to_string());
-    if !matches!(lower.as_str(), "local" | "hybrid" | "llamacpp") {
+    if !matches!(lower.as_str(), "local" | "hybrid") {
        bail!(
-            "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
+            "unknown backend '{}'; expected 'local' or 'hybrid'",
            lower
        );
    }
@@ -2184,10 +2159,6 @@ mod tests {
    fn bootstrap_backend_accepts_local_and_hybrid_case_insensitively() {
        assert_eq!(resolve_bootstrap_backend(Some("LOCAL")).unwrap(), "local");
        assert_eq!(resolve_bootstrap_backend(Some("Hybrid")).unwrap(), "hybrid");
-        assert_eq!(
-            resolve_bootstrap_backend(Some("Llamacpp")).unwrap(),
-            "llamacpp"
-        );
        assert_eq!(
            resolve_bootstrap_backend(Some("  local  ")).unwrap(),
            "local"
@@ -2196,10 +2167,13 @@ mod tests {

    #[test]
    fn bootstrap_backend_rejects_unknown_label() {
-        let err = resolve_bootstrap_backend(Some("openrouter")).unwrap_err();
-        let msg = format!("{}", err);
-        assert!(msg.contains("unknown backend"));
-        assert!(msg.contains("openrouter"));
+        // `llamacpp` is no longer a per-request backend value — it's chosen
+        // at deploy time via `LLM_BACKEND`.
+        for label in &["openrouter", "llamacpp", "ollama"] {
+            let err = resolve_bootstrap_backend(Some(label)).unwrap_err();
+            let msg = format!("{}", err);
+            assert!(msg.contains("unknown backend"), "label={}", label);
+        }
    }

    #[test]
@@ -2209,29 +2183,20 @@ mod tests {
    }

    #[test]
-    fn cross_replay_rejects_llamacpp_to_hybrid() {
-        let err = validate_cross_replay("llamacpp", "hybrid").unwrap_err();
-        assert!(format!("{}", err).contains("llamacpp to hybrid"));
-    }
-
-    #[test]
-    fn cross_replay_allows_local_llamacpp_and_hybrid_llamacpp_transitions() {
-        // Local ↔ llamacpp: user is responsible for picking a vision-capable
-        // chat slot when the transcript has images.
-        assert!(validate_cross_replay("local", "llamacpp").is_ok());
-        assert!(validate_cross_replay("llamacpp", "local").is_ok());
-        // Hybrid ↔ llamacpp: both transcripts are text-only.
-        assert!(validate_cross_replay("hybrid", "llamacpp").is_ok());
-        // Same-backend replays are always fine.
+    fn cross_replay_allows_supported_transitions() {
        assert!(validate_cross_replay("local", "local").is_ok());
        assert!(validate_cross_replay("hybrid", "hybrid").is_ok());
-        assert!(validate_cross_replay("llamacpp", "llamacpp").is_ok());
+        // Hybrid → local replays the inlined description as plain text.
+        assert!(validate_cross_replay("hybrid", "local").is_ok());
    }

    #[test]
    fn cross_replay_rejects_unknown_effective() {
-        let err = validate_cross_replay("local", "openrouter").unwrap_err();
-        assert!(format!("{}", err).contains("unknown backend"));
+        // Both "openrouter" and the former "llamacpp" value are unknown now.
+        for label in &["openrouter", "llamacpp"] {
+            let err = validate_cross_replay("local", label).unwrap_err();
+            assert!(format!("{}", err).contains("unknown backend"), "label={}", label);
+        }
    }

    #[test]