ai: add llamacpp backend (llama-swap) as third LLM client

Wires a new LlamaCppClient (OpenAI-compatible /v1 wire format) alongside OllamaClient and OpenRouterClient. Per-slot routing for chat/vision/embed via env (LLAMA_SWAP_URL + *_MODEL vars); capability inference uses an env allowlist since /v1/models doesn't report modality. InsightGenerator + InsightChatService gain three-way dispatch on chat_backend = "local" | "hybrid" | "llamacpp". Hybrid and llamacpp share the describe-then-inline path (text-only chat after a separate vision describe). HYBRID_VISION_BACKEND=llamacpp lets hybrid route its describe pass through llama-swap's vision slot while chat still goes to OpenRouter. Cross-replay matrix added (validate_cross_replay): local<->llamacpp and hybrid->llamacpp allowed; local->hybrid and llamacpp->hybrid rejected. New /insights/llamacpp/models handler mirrors the OpenRouter shape.
2026-05-20 17:52:33 -04:00
parent d04b86e32c
commit f0927f5355
9 changed files with 1468 additions and 102 deletions
@@ -9,6 +9,7 @@ use tokio::sync::Mutex as TokioMutex;
 use crate::ai::insight_generator::InsightGenerator;
 use crate::ai::llm_client::{ChatMessage, LlmClient, LlmStreamEvent, Tool};
 use crate::ai::ollama::OllamaClient;
+use crate::ai::llamacpp::LlamaCppClient;
 use crate::ai::openrouter::OpenRouterClient;
 use crate::database::InsightDao;
 use crate::database::models::InsertPhotoInsight;
@@ -93,6 +94,7 @@ pub struct InsightChatService {
    generator: Arc<InsightGenerator>,
    ollama: OllamaClient,
    openrouter: Option<Arc<OpenRouterClient>>,
+    llamacpp: Option<Arc<LlamaCppClient>>,
    insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
    chat_locks: ChatLockMap,
 }
@@ -102,6 +104,7 @@ impl InsightChatService {
        generator: Arc<InsightGenerator>,
        ollama: OllamaClient,
        openrouter: Option<Arc<OpenRouterClient>>,
+        llamacpp: Option<Arc<LlamaCppClient>>,
        insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
        chat_locks: ChatLockMap,
    ) -> Self {
@@ -109,6 +112,7 @@ impl InsightChatService {
            generator,
            ollama,
            openrouter,
+            llamacpp,
            insight_dao,
            chat_locks,
        }
@@ -303,23 +307,15 @@ impl InsightChatService {
            .map(|s| s.trim().to_lowercase())
            .filter(|s| !s.is_empty())
            .unwrap_or_else(|| stored_backend.clone());
-        if !matches!(effective_backend.as_str(), "local" | "hybrid") {
-            bail!(
-                "unknown backend '{}'; expected 'local' or 'hybrid'",
-                effective_backend
-            );
-        }
-        if stored_backend == "local" && effective_backend == "hybrid" {
-            bail!(
-                "switching from local to hybrid mid-chat isn't supported yet; \
-                 regenerate the insight in hybrid mode if you want OpenRouter chat"
-            );
-        }
+        validate_cross_replay(&stored_backend, &effective_backend)?;
        let is_hybrid = effective_backend == "hybrid";
+        let is_llamacpp = effective_backend == "llamacpp";
+        let describes_then_inlines = is_hybrid || is_llamacpp;
        span.set_attribute(KeyValue::new("backend", effective_backend.clone()));

        // 4. Build the chat backend client. Ollama in local mode, a freshly
-        //    cloned OpenRouter client in hybrid mode (clone so per-request
+        //    cloned OpenRouter client in hybrid mode, a freshly cloned
+        //    LlamaCppClient in llamacpp mode (clone so per-request
        //    sampling/model overrides don't leak into shared state).
        let max_iterations = req
            .max_iterations
@@ -336,6 +332,7 @@ impl InsightChatService {

        let mut ollama_client = self.ollama.clone();
        let mut openrouter_client: Option<OpenRouterClient> = None;
+        let mut llamacpp_client: Option<LlamaCppClient> = None;

        if is_hybrid {
            let arc = self.openrouter.as_ref().ok_or_else(|| {
@@ -356,6 +353,25 @@ impl InsightChatService {
                c.set_num_ctx(Some(ctx));
            }
            openrouter_client = Some(c);
+        } else if is_llamacpp {
+            let arc = self.llamacpp.as_ref().ok_or_else(|| {
+                anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
+            })?;
+            let mut c: LlamaCppClient = (**arc).clone();
+            if let Some(ref m) = custom_model {
+                c.primary_model = m.clone();
+            }
+            if req.temperature.is_some()
+                || req.top_p.is_some()
+                || req.top_k.is_some()
+                || req.min_p.is_some()
+            {
+                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
+            }
+            if let Some(ctx) = req.num_ctx {
+                c.set_num_ctx(Some(ctx));
+            }
+            llamacpp_client = Some(c);
        } else {
            // Local-mode model swap. Build a new client when the chat model
            // differs from the configured one (mirrors the agentic pattern).
@@ -381,7 +397,9 @@ impl InsightChatService {
            }
        }

-        let chat_backend: &dyn LlmClient = if let Some(ref c) = openrouter_client {
+        let chat_backend: &dyn LlmClient = if let Some(ref c) = llamacpp_client {
+            c
+        } else if let Some(ref c) = openrouter_client {
            c
        } else {
            &ollama_client
@@ -389,18 +407,19 @@ impl InsightChatService {
        let model_used = chat_backend.primary_model().to_string();
        span.set_attribute(KeyValue::new("model", model_used.clone()));

-        // 5. Decide vision + tool set. In hybrid we always omit
-        //    `describe_photo` (matches the original generation flow). In
-        //    local we trust the stored history's first-user shape: if it
-        //    carries `images`, the original model was vision-capable, and
-        //    we keep `describe_photo` available.
+        // 5. Decide vision + tool set. In describe-then-inline modes
+        //    (hybrid, llamacpp) we always omit `describe_photo` (matches the
+        //    original generation flow). In local we trust the stored
+        //    history's first-user shape: if it carries `images`, the
+        //    original model was vision-capable, and we keep `describe_photo`
+        //    available.
        let local_first_user_has_image = messages
            .iter()
            .find(|m| m.role == "user")
            .and_then(|m| m.images.as_ref())
            .map(|imgs| !imgs.is_empty())
            .unwrap_or(false);
-        let offer_describe_tool = !is_hybrid && local_first_user_has_image;
+        let offer_describe_tool = !describes_then_inlines && local_first_user_has_image;
        // current_gate_opts(has_vision) sets gate_opts.has_vision = has_vision
        // and probes the per-table presence flags. Pass `offer_describe_tool`
        // directly — the `!describes_then_inlines && local_first_user_has_image` decision
@@ -799,19 +818,10 @@ impl InsightChatService {
            .map(|s| s.trim().to_lowercase())
            .filter(|s| !s.is_empty())
            .unwrap_or_else(|| stored_backend.clone());
-        if !matches!(effective_backend.as_str(), "local" | "hybrid") {
-            bail!(
-                "unknown backend '{}'; expected 'local' or 'hybrid'",
-                effective_backend
-            );
-        }
-        if stored_backend == "local" && effective_backend == "hybrid" {
-            bail!(
-                "switching from local to hybrid mid-chat isn't supported yet; \
-                 regenerate the insight in hybrid mode if you want OpenRouter chat"
-            );
-        }
+        validate_cross_replay(&stored_backend, &effective_backend)?;
        let is_hybrid = effective_backend == "hybrid";
+        let is_llamacpp = effective_backend == "llamacpp";
+        let describes_then_inlines = is_hybrid || is_llamacpp;

        let max_iterations = req
            .max_iterations
@@ -826,20 +836,21 @@ impl InsightChatService {
            .filter(|m| !m.is_empty());

        let (chat_backend_holder, ollama_client) =
-            self.build_chat_clients(is_hybrid, custom_model.as_deref(), &req)?;
+            self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?;
        let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
        let model_used = chat_backend.primary_model().to_string();

        // Tool set — local mode + first user turn carries an image →
-        // offer describe_photo. Hybrid: visual description was inlined
-        // when the insight was bootstrapped, no describe tool needed.
+        // offer describe_photo. Describe-then-inline modes (hybrid /
+        // llamacpp): visual description was inlined when the insight was
+        // bootstrapped, no describe tool needed.
        let local_first_user_has_image = messages
            .iter()
            .find(|m| m.role == "user")
            .and_then(|m| m.images.as_ref())
            .map(|imgs| !imgs.is_empty())
            .unwrap_or(false);
-        let offer_describe_tool = !is_hybrid && local_first_user_has_image;
+        let offer_describe_tool = !describes_then_inlines && local_first_user_has_image;
        let gate_opts = self.generator.current_gate_opts_for_persona(
            offer_describe_tool,
            Some((req.user_id, &active_persona)),
@@ -976,6 +987,8 @@ impl InsightChatService {
            .unwrap_or_else(|| "default".to_string());
        let effective_backend = resolve_bootstrap_backend(req.backend.as_deref())?;
        let is_hybrid = effective_backend == "hybrid";
+        let is_llamacpp = effective_backend == "llamacpp";
+        let describes_then_inlines = is_hybrid || is_llamacpp;

        let max_iterations = req
            .max_iterations
@@ -984,7 +997,7 @@ impl InsightChatService {

        let custom_model = req.model.clone().filter(|m| !m.is_empty());
        let (chat_backend_holder, ollama_client) =
-            self.build_chat_clients(is_hybrid, custom_model.as_deref(), &req)?;
+            self.build_chat_clients(&effective_backend, custom_model.as_deref(), &req)?;
        let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
        let model_used = chat_backend.primary_model().to_string();

@@ -1007,21 +1020,48 @@ impl InsightChatService {
                _ => None,
            });

-        // Hybrid backend: pre-describe the image via local Ollama vision
-        // so OpenRouter chat models (which can't see images directly) get
-        // the visual description as text. Mirrors the same pre-describe
-        // pass that `generate_agentic_insight_for_photo` does for hybrid.
-        let visual_block = if is_hybrid {
+        // Describe-then-inline backends (hybrid, llamacpp): pre-describe the
+        // image so a text-only chat model gets the visual description inline.
+        // Vision source: llamacpp's vision slot in llamacpp mode; in hybrid
+        // mode Ollama by default, llamacpp via `HYBRID_VISION_BACKEND=llamacpp`.
+        let visual_block = if describes_then_inlines {
            match image_base64.as_deref() {
-                Some(b64) => match self.ollama.describe_image(b64).await {
-                    Ok(desc) => {
-                        format!("Visual description (from local vision model):\n{}\n", desc)
+                Some(b64) => {
+                    let use_llamacpp_vision = if is_llamacpp {
+                        true
+                    } else {
+                        matches!(
+                            std::env::var("HYBRID_VISION_BACKEND")
+                                .ok()
+                                .as_deref()
+                                .map(|s| s.trim().to_lowercase())
+                                .as_deref(),
+                            Some("llamacpp")
+                        )
+                    };
+                    let described = if use_llamacpp_vision {
+                        match self.llamacpp.as_ref() {
+                            Some(c) => c.describe_image(b64).await,
+                            None => {
+                                log::warn!(
+                                    "bootstrap: requested llamacpp vision but LLAMA_SWAP_URL unset; falling back to Ollama"
+                                );
+                                self.ollama.describe_image(b64).await
+                            }
+                        }
+                    } else {
+                        self.ollama.describe_image(b64).await
+                    };
+                    match described {
+                        Ok(desc) => {
+                            format!("Visual description (from local vision model):\n{}\n", desc)
+                        }
+                        Err(e) => {
+                            log::warn!("{} bootstrap: describe_image failed: {}", effective_backend, e);
+                            String::new()
+                        }
                    }
-                    Err(e) => {
-                        log::warn!("hybrid bootstrap: local describe_image failed: {}", e);
-                        String::new()
-                    }
-                },
+                }
                None => String::new(),
            }
        } else {
@@ -1031,7 +1071,7 @@ impl InsightChatService {
        // Tool gates. Local + image present → expose describe_photo so
        // the chat model can re-look at the photo on demand. Hybrid /
        // llamacpp: already inlined, no tool needed.
-        let offer_describe_tool = !is_hybrid && image_base64.is_some();
+        let offer_describe_tool = !describes_then_inlines && image_base64.is_some();
        let gate_opts = self.generator.current_gate_opts_for_persona(
            offer_describe_tool,
            Some((req.user_id, &active_persona)),
@@ -1057,7 +1097,7 @@ impl InsightChatService {
        );
        let system_msg = ChatMessage::system(system_content);
        let mut user_msg = ChatMessage::user(req.user_message.clone());
-        if !is_hybrid && let Some(ref img) = image_base64 {
+        if !describes_then_inlines && let Some(ref img) = image_base64 {
            user_msg.images = Some(vec![img.clone()]);
        }
        let mut messages = vec![system_msg, user_msg];
@@ -1130,19 +1170,22 @@ impl InsightChatService {
        Ok(())
    }

-    /// Set up chat clients (Ollama + optional OpenRouter) shared by
-    /// bootstrap and continuation. Returns the chat-side backend client
-    /// (boxed because hybrid and local return different concrete types)
-    /// and the Ollama client used for describe-image / local tool calls.
+    /// Set up chat clients (Ollama + optional OpenRouter / LlamaCpp) shared
+    /// by bootstrap and continuation. Returns the chat-side backend client
+    /// (boxed because each backend has a different concrete type) and the
+    /// Ollama client used for describe-image / local tool calls.
+    ///
+    /// `effective_backend` must be one of `"local"`, `"hybrid"`, `"llamacpp"`
+    /// (validated upstream).
    fn build_chat_clients(
        &self,
-        is_hybrid: bool,
+        effective_backend: &str,
        custom_model: Option<&str>,
        req: &ChatTurnRequest,
    ) -> Result<(Box<dyn LlmClient>, OllamaClient)> {
        let mut ollama_client = self.ollama.clone();

-        if is_hybrid {
+        if effective_backend == "hybrid" {
            let arc = self.openrouter.as_ref().ok_or_else(|| {
                anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
            })?;
@@ -1163,6 +1206,27 @@ impl InsightChatService {
            return Ok((Box::new(c), ollama_client));
        }

+        if effective_backend == "llamacpp" {
+            let arc = self.llamacpp.as_ref().ok_or_else(|| {
+                anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
+            })?;
+            let mut c: LlamaCppClient = (**arc).clone();
+            if let Some(m) = custom_model {
+                c.primary_model = m.to_string();
+            }
+            if req.temperature.is_some()
+                || req.top_p.is_some()
+                || req.top_k.is_some()
+                || req.min_p.is_some()
+            {
+                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
+            }
+            if let Some(ctx) = req.num_ctx {
+                c.set_num_ctx(Some(ctx));
+            }
+            return Ok((Box::new(c), ollama_client));
+        }
+
        if let Some(m) = custom_model
            && m != self.ollama.primary_model
        {
@@ -1459,6 +1523,49 @@ fn resolve_date_taken_for_context(
        .map(|dt| dt.format("%Y-%m-%d").to_string())
 }

+/// Validate a stored→effective backend transition for a chat continuation.
+/// Continuation runs against a transcript that was generated with a specific
+/// backend; some transitions break the conversation shape:
+///
+/// - `local → hybrid` — the stored transcript has images embedded in the
+///   first user message; the openrouter chat client surfaces them through
+///   the wire, but vision-only models routed via the hybrid path may not
+///   accept that shape consistently across providers. Reject to keep the
+///   `regenerate-in-hybrid-mode` workflow as the supported answer.
+/// - `llamacpp → hybrid` — the stored transcript already has an inlined
+///   visual description produced by llama-swap's vision slot. Switching
+///   to hybrid mid-conversation would mix description sources across
+///   subsequent turns (any new image in the chat continuation would be
+///   described by ollama-vision while the original was described by
+///   llama-vision). Reject for consistency.
+///
+/// Unknown `effective` labels are rejected; `stored` is not validated. All
+/// other transitions are allowed. `local ↔ llamacpp` works because
+/// LlamaCppClient passes image content-parts through to the chat slot (the
+/// user must pick a vision-capable chat model). `hybrid → llamacpp` works
+/// because both transcripts are text-only (description inlined at bootstrap).
+fn validate_cross_replay(stored: &str, effective: &str) -> Result<()> {
+    if !matches!(effective, "local" | "hybrid" | "llamacpp") {
+        bail!(
+            "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
+            effective
+        );
+    }
+    if stored == "local" && effective == "hybrid" {
+        bail!(
+            "switching from local to hybrid mid-chat isn't supported yet; \
+             regenerate the insight in hybrid mode if you want OpenRouter chat"
+        );
+    }
+    if stored == "llamacpp" && effective == "hybrid" {
+        bail!(
+            "switching from llamacpp to hybrid mid-chat isn't supported yet; \
+             regenerate the insight in hybrid mode if you want OpenRouter chat"
+        );
+    }
+    Ok(())
+}
+
 /// Pick the backend label for bootstrap. Bootstrap has no stored insight
 /// to defer to (that's continuation's behaviour), so the default is
 /// `"local"`. Returns an error if the supplied label is non-empty but
@@ -1469,8 +1576,11 @@ fn resolve_bootstrap_backend(supplied: Option<&str>) -> Result<String> {
        .map(|s| s.trim().to_lowercase())
        .filter(|s| !s.is_empty())
        .unwrap_or_else(|| "local".to_string());
-    if !matches!(lower.as_str(), "local" | "hybrid") {
-        bail!("unknown backend '{}'; expected 'local' or 'hybrid'", lower);
+    if !matches!(lower.as_str(), "local" | "hybrid" | "llamacpp") {
+        bail!(
+            "unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
+            lower
+        );
    }
    Ok(lower)
 }
@@ -2074,6 +2184,10 @@ mod tests {
    fn bootstrap_backend_accepts_local_and_hybrid_case_insensitively() {
        assert_eq!(resolve_bootstrap_backend(Some("LOCAL")).unwrap(), "local");
        assert_eq!(resolve_bootstrap_backend(Some("Hybrid")).unwrap(), "hybrid");
+        assert_eq!(
+            resolve_bootstrap_backend(Some("Llamacpp")).unwrap(),
+            "llamacpp"
+        );
        assert_eq!(
            resolve_bootstrap_backend(Some("  local  ")).unwrap(),
            "local"
@@ -2088,6 +2202,38 @@ mod tests {
        assert!(msg.contains("openrouter"));
    }

+    #[test]
+    fn cross_replay_rejects_local_to_hybrid() {
+        let rejection = validate_cross_replay("local", "hybrid").unwrap_err();
+        assert!(rejection.to_string().contains("local to hybrid"));
+    }
+
+    #[test]
+    fn cross_replay_rejects_llamacpp_to_hybrid() {
+        let rejection = validate_cross_replay("llamacpp", "hybrid").unwrap_err();
+        assert!(rejection.to_string().contains("llamacpp to hybrid"));
+    }
+
+    #[test]
+    fn cross_replay_allows_local_llamacpp_and_hybrid_llamacpp_transitions() {
+        // Local ↔ llamacpp: user is responsible for picking a vision-capable
+        // chat slot when the transcript has images.
+        assert!(validate_cross_replay("local", "llamacpp").is_ok());
+        assert!(validate_cross_replay("llamacpp", "local").is_ok());
+        // Hybrid → llamacpp only (llamacpp → hybrid is rejected); text-only.
+        assert!(validate_cross_replay("hybrid", "llamacpp").is_ok());
+        // Same-backend replays are always fine.
+        assert!(validate_cross_replay("local", "local").is_ok());
+        assert!(validate_cross_replay("hybrid", "hybrid").is_ok());
+        assert!(validate_cross_replay("llamacpp", "llamacpp").is_ok());
+    }
+
+    #[test]
+    fn cross_replay_rejects_unknown_effective() {
+        let rejection = validate_cross_replay("local", "openrouter").unwrap_err();
+        assert!(rejection.to_string().contains("unknown backend"));
+    }
+
    #[test]
    fn bootstrap_system_message_includes_path_and_persona() {
        let out = build_bootstrap_system_message("you are helpful", "pics/IMG.jpg", None, None, "");