2026-05-26 18:58:48 +00:00
17 changed files with 2359 additions and 605 deletions
--- a/.env.example
+++ b/.env.example
@@ -53,6 +53,33 @@ AGENTIC_CHAT_MAX_ITERATIONS=6
 # OPENROUTER_HTTP_REFERER=https://your-site.example
 # OPENROUTER_APP_TITLE=ImageApi

+# ── AI Insights — local backend switch ──────────────────────────────────
+# Picks which local LLM stack the server uses for chat, vision describe,
+# and embeddings. `ollama` (default) uses the OLLAMA_* settings above;
+# `llamacpp` uses the LLAMA_SWAP_* settings below. The switch is global
+# and applies to both `backend=local` and `backend=hybrid` (hybrid keeps
+# chat on OpenRouter but still uses this stack for the describe pass).
+# Don't flip mid-deploy without re-embedding existing index rows —
+# mixed vector spaces break similarity search.
+# LLM_BACKEND=ollama
+
+# ── AI Insights — llama.cpp / llama-swap (optional) ─────────────────────
+# Set LLAMA_SWAP_URL plus LLM_BACKEND=llamacpp to swap the local stack
+# off Ollama. Talks OpenAI-compatible /v1 to a llama-swap proxy fronting
+# per-slot llama-server instances. Chat models receive images directly
+# via content-parts (vision-capable models assumed); a separate vision
+# slot is used only by the describe_photo tool and describe-image utility.
+# LLAMA_SWAP_URL=http://localhost:9292/v1
+# LLAMA_SWAP_PRIMARY_MODEL=chat
+# Optional dedicated vision slot for describe_image. Defaults to
+# LLAMA_SWAP_PRIMARY_MODEL so describe_photo works without extra config.
+# LLAMA_SWAP_VISION_MODEL=vision
+# LLAMA_SWAP_EMBEDDING_MODEL=embed
+# Comma-separated allowlist surfaced by /insights/models when
+# LLM_BACKEND=llamacpp. All report has_vision=true.
+# LLAMA_SWAP_ALLOWED_MODELS=chat,vision,embed
+# LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180
+
 # ── AI Insights — sibling services (optional) ───────────────────────────
 # Apollo (places, face inference, CLIP encoders). Single-Apollo deploys
 # typically set only APOLLO_API_BASE_URL and let the face + CLIP
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -473,7 +473,7 @@ GET /memories?path=...&recursive=true
 POST /insights/generate              (non-agentic single-shot)
 POST /insights/generate/agentic      (tool-calling loop; body: { file_path, backend?, model?, ... })
 GET  /insights?path=...&library=...
-GET  /insights/models                (local Ollama models + capabilities)
+GET  /insights/models                (local-backend models + capabilities; Ollama OR llama-swap based on LLM_BACKEND)
 GET  /insights/openrouter/models     (curated OpenRouter allowlist)
 POST /insights/rate                  (thumbs up/down for training data)

@@ -631,6 +631,27 @@ OPENROUTER_EMBEDDING_MODEL=openai/text-embedding-3-small  # Optional, embeddings
 OPENROUTER_HTTP_REFERER=https://your-site.example    # Optional attribution header
 OPENROUTER_APP_TITLE=ImageApi                  # Optional attribution header

+# Local LLM backend switch. `ollama` (default) keeps the OLLAMA_* settings
+# above; `llamacpp` swaps the entire local stack (chat + vision describe +
+# embeddings) over to llama-swap. The switch is global and applies to
+# `backend=local` requests and to `backend=hybrid`'s describe pass (hybrid
+# chat still goes to OpenRouter). Don't flip mid-deploy without
+# re-embedding — mixed vector spaces break similarity search.
+LLM_BACKEND=ollama
+
+# llama.cpp / llama-swap (used when LLM_BACKEND=llamacpp). OpenAI-compatible
+# proxy hosting one or more llama-server processes. Chat models receive
+# images directly via content-parts (all models assumed vision-capable).
+LLAMA_SWAP_URL=http://localhost:9292/v1         # Required when LLM_BACKEND=llamacpp
+LLAMA_SWAP_PRIMARY_MODEL=chat                   # Chat slot id (matches config.yaml)
+LLAMA_SWAP_VISION_MODEL=                        # Dedicated vision slot for describe_image / describe_photo
+                                                # tool. Defaults to PRIMARY_MODEL when unset.
+LLAMA_SWAP_EMBEDDING_MODEL=embed                # Embedding slot id
+LLAMA_SWAP_ALLOWED_MODELS=chat,coder            # Curated allowlist surfaced by GET /insights/models
+                                                # when LLM_BACKEND=llamacpp. All report has_vision=true.
+                                                # Empty = picker shows only the configured primary model.
+LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180          # Per-request timeout; bump for slow CPU offload
+
 # Insight Chat Continuation
 AGENTIC_CHAT_MAX_ITERATIONS=6                  # Cap on tool-calling iterations per chat turn (default 6)
 ```
@@ -650,10 +671,50 @@ The `OllamaClient` provides methods to query available models:

 This allows runtime verification of model availability before generating insights.

+**Local backend switch (`LLM_BACKEND`):**
+
+One env var decides which "local" stack the server runs against — `ollama`
+(default) or `llamacpp`. It's global on purpose: chat, vision, and
+embeddings all route through the same backend, so the embedding-vector
+column in SQLite stays in one vector space. Don't flip mid-deploy without
+re-embedding the affected rows — similarity search will collapse.
+
+- `LLM_BACKEND=ollama`: chat, vision, and embeddings use Ollama. Vision
+  capability is probed per-model via `/api/show`.
+- `LLM_BACKEND=llamacpp`: chat models receive images directly via OpenAI
+  content-parts (all models assumed vision-capable). Embeddings hit the
+  `LLAMA_SWAP_EMBEDDING_MODEL` slot. A dedicated `LLAMA_SWAP_VISION_MODEL`
+  slot (defaults to the chat model) handles `describe_image` for the
+  `describe_photo` tool. Requires `LLAMA_SWAP_URL`.
+
+The per-request `backend=hybrid` override is orthogonal: it always sends
+chat to OpenRouter (text-only, images are pre-described and inlined), but
+the describe + embed passes still route through whichever `LLM_BACKEND`
+is configured.
+
+**Backend dispatch (`ResolvedBackend`):**
+
+`InsightGenerator::resolve_backend(kind, overrides)` is the single entry
+point that builds clients for a request. Returns a `ResolvedBackend` with
+two roles: `.chat()` (the agentic/chat client) and `.local()` (local-only
+utility calls: rerank, describe_image, embeddings). `BackendKind` is an
+enum (`Local` | `Hybrid`) replacing the stringly-typed `"local"` /
+`"hybrid"` labels. `SamplingOverrides` groups model/ctx/temp/top_p/top_k/
+min_p per-request overrides. All downstream code (`execute_tool`,
+`run_streaming_agentic_loop`, etc.) takes `&ResolvedBackend` rather than
+individual client references.
+
+`GET /insights/models` returns the local-backend models with capabilities
+in the same envelope shape regardless of `LLM_BACKEND`: Ollama servers
+when `ollama`, llama-swap slots (from `LLAMA_SWAP_ALLOWED_MODELS`) when
+`llamacpp`. No `/insights/llamacpp/models` — the picker reads a single
+endpoint.
+
 **Hybrid Backend (OpenRouter):**
 - Per-request opt-in via `backend=hybrid` on `POST /insights/generate/agentic`.
- Local Ollama still describes the image (vision); the description is inlined
-  into the chat prompt and the agentic loop runs on OpenRouter.
+- Vision describe happens before the agentic loop; the description is inlined
+  into the chat prompt and the agentic loop runs on OpenRouter. Vision
+  routes through whichever `LLM_BACKEND` is configured.
 - `request.model` (if provided) overrides `OPENROUTER_DEFAULT_MODEL` for that
  call. The mobile picker reads from `OPENROUTER_ALLOWED_MODELS`.
 - No live capability precheck — the operator-curated allowlist is trusted.
@@ -661,6 +722,15 @@ This allows runtime verification of model availability before generating insight
 - `GET /insights/openrouter/models` returns `{ models, default_model, configured }`
  for client picker UIs.

+**Cross-replay matrix (chat continuation):**
+- `local → local` allowed (whether served by Ollama or llama-swap; that's
+  a deploy-time decision, not a request-time one).
+- `hybrid → hybrid` allowed.
+- `hybrid → local` allowed (the inlined description replays as text).
+- `local → hybrid` rejected — the stored transcript has raw images in the
+  first user message and OpenRouter providers don't accept that shape
+  consistently. Regenerate the insight in hybrid mode instead.
+
 **Insight Chat Continuation:**

 After an agentic insight is generated, the full `Vec<ChatMessage>` transcript is
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2051,7 +2051,7 @@ dependencies = [

 [[package]]
 name = "image-api"
-version = "1.1.0"
+version = "1.2.0"
 dependencies = [
 "actix",
 "actix-cors",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "image-api"
-version = "1.1.0"
+version = "1.2.0"
 authors = ["Cameron Cordes <cameronc.dev@gmail.com>"]
 edition = "2024"

--- a/src/ai/backend.rs
+++ b/src/ai/backend.rs
@@ -0,0 +1,140 @@
+use anyhow::{Result, anyhow};
+
+use crate::ai::llm_client::LlmClient;
+
+/// Per-request backend selector: `Local` or `Hybrid`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+pub enum BackendKind {
+    Local,
+    Hybrid,
+}
+
+impl BackendKind {
+    /// Parses a label, ignoring case and surrounding whitespace; `""` means `Local`.
+    pub fn parse(s: &str) -> Result<Self> {
+        match s.trim().to_lowercase().as_str() {
+            "hybrid" => Ok(Self::Hybrid),
+            "local" | "" => Ok(Self::Local),
+            other => Err(anyhow!("unknown backend '{other}'; expected 'local' or 'hybrid'")),
+        }
+    }
+
+    /// Returns the canonical lowercase label; `parse` maps it back to the same variant.
+    pub fn as_str(&self) -> &'static str {
+        match *self {
+            Self::Hybrid => "hybrid",
+            Self::Local => "local",
+        }
+    }
+}
+
+impl std::fmt::Display for BackendKind {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        f.pad(self.as_str()) // `pad` (unlike `write_str`) honors width/fill/alignment flags
+    }
+}
+
+/// Optional per-request overrides for the model, context size, and sampling parameters.
+#[derive(Debug, Clone, Default, PartialEq)]
+pub struct SamplingOverrides {
+    pub model: Option<String>,
+    pub num_ctx: Option<i32>,
+    pub temperature: Option<f32>,
+    pub top_p: Option<f32>,
+    pub top_k: Option<i32>,
+    pub min_p: Option<f32>,
+}
+
+impl SamplingOverrides {
+    /// Whether any sampling knob (`temperature`, `top_p`, `top_k`, `min_p`) is set.
+    /// `model` and `num_ctx` are not counted.
+    pub fn has_sampling(&self) -> bool {
+        self.temperature.is_some()
+            || self.top_p.is_some()
+            || self.top_k.is_some()
+            || self.min_p.is_some()
+    }
+}
+
+/// Clients resolved for one request: a chat client plus a local utility client.
+pub struct ResolvedBackend {
+    chat: Box<dyn LlmClient>,
+    local: Box<dyn LlmClient>,
+    pub kind: BackendKind,
+    /// `true` when the chat model receives images directly (Ollama with
+    /// vision, or llamacpp). `false` for hybrid where we describe-then-inline.
+    pub images_inline: bool,
+}
+
+impl ResolvedBackend {
+    /// Bundles already-built clients with the backend kind and image-handling mode.
+    pub fn new(
+        chat: Box<dyn LlmClient>,
+        local: Box<dyn LlmClient>,
+        kind: BackendKind,
+        images_inline: bool,
+    ) -> Self {
+        Self {
+            chat,
+            local,
+            kind,
+            images_inline,
+        }
+    }
+
+    /// Borrows the client used for the chat / agentic loop.
+    pub fn chat(&self) -> &dyn LlmClient {
+        self.chat.as_ref()
+    }
+
+    /// Borrows the client used for local-only utility calls.
+    pub fn local(&self) -> &dyn LlmClient {
+        self.local.as_ref()
+    }
+
+    /// Returns the chat client's primary model name.
+    pub fn model(&self) -> &str {
+        self.chat.primary_model()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn parse_backend_kind() {
+        assert_eq!(BackendKind::parse("local").unwrap(), BackendKind::Local);
+        assert_eq!(BackendKind::parse("hybrid").unwrap(), BackendKind::Hybrid);
+        assert_eq!(BackendKind::parse("  Local ").unwrap(), BackendKind::Local);
+        assert_eq!(BackendKind::parse("HYBRID").unwrap(), BackendKind::Hybrid);
+        assert_eq!(BackendKind::parse("").unwrap(), BackendKind::Local);
+        assert!(BackendKind::parse("vllm").is_err());
+    }
+
+    #[test]
+    fn backend_kind_as_str_roundtrips() {
+        assert_eq!(
+            BackendKind::parse(BackendKind::Local.as_str()).unwrap(),
+            BackendKind::Local
+        );
+        assert_eq!(
+            BackendKind::parse(BackendKind::Hybrid.as_str()).unwrap(),
+            BackendKind::Hybrid
+        );
+    }
+
+    #[test]
+    fn sampling_overrides_has_sampling() {
+        assert!(!SamplingOverrides::default().has_sampling());
+        // `num_ctx` alone must not count as a sampling override.
+        let mut overrides = SamplingOverrides {
+            num_ctx: Some(4096),
+            ..Default::default()
+        };
+        assert!(!overrides.has_sampling());
+
+        overrides.temperature = Some(0.7);
+        assert!(overrides.has_sampling());
+    }
+}
--- a/src/ai/handlers.rs
+++ b/src/ai/handlers.rs
@@ -470,7 +470,16 @@ pub async fn generate_agentic_insight_handler(
    }
 }

-/// GET /insights/models - List available models from both servers with capabilities
+/// GET /insights/models - Local-backend models with capabilities. Returns
+/// Ollama servers when `LLM_BACKEND=ollama` (default), or llama-swap slots
+/// when `LLM_BACKEND=llamacpp`. Same envelope shape either way so the
+/// client picker doesn't have to branch on backend kind.
+///
+/// For llama-swap: `models` comes verbatim from `LLAMA_SWAP_ALLOWED_MODELS`
+/// (no live `/v1/models` probe), and both `has_vision` and
+/// `has_tool_calling` are reported as true for every slot (all models are
+/// assumed vision-capable, and llama-server is launched with `--jinja` by
+/// convention — a misconfigured slot surfaces as a chat-call error).
 #[get("/insights/models")]
 pub async fn get_available_models_handler(
    _claims: Claims,
@@ -478,6 +487,29 @@ pub async fn get_available_models_handler(
 ) -> impl Responder {
    log::debug!("Fetching available models with capabilities");

+    if crate::ai::local_backend_is_llamacpp()
+        && let Some(lc) = app_state.llamacpp.as_ref()
+    {
+        let models: Vec<ModelCapabilities> = app_state
+            .llamacpp_allowed_models
+            .iter()
+            .map(|name| ModelCapabilities {
+                name: name.clone(),
+                has_vision: true,
+                has_tool_calling: true,
+            })
+            .collect();
+        let primary = ServerModels {
+            url: lc.base_url.clone(),
+            models,
+            default_model: lc.primary_model.clone(),
+        };
+        return HttpResponse::Ok().json(AvailableModelsResponse {
+            primary,
+            fallback: None,
+        });
+    }
+
    let ollama_client = &app_state.ollama;

    // Fetch models with capabilities from primary server
--- a/src/ai/insight_chat.rs
+++ b/src/ai/insight_chat.rs
@@ -6,10 +6,9 @@ use std::collections::HashMap;
 use std::sync::{Arc, Mutex};
 use tokio::sync::Mutex as TokioMutex;

+use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides};
 use crate::ai::insight_generator::InsightGenerator;
-use crate::ai::llm_client::{ChatMessage, LlmClient, LlmStreamEvent, Tool};
-use crate::ai::ollama::OllamaClient;
-use crate::ai::openrouter::OpenRouterClient;
+use crate::ai::llm_client::{ChatMessage, LlmStreamEvent, Tool};
 use crate::database::InsightDao;
 use crate::database::models::InsertPhotoInsight;
 use crate::otel::global_tracer;
@@ -91,8 +90,6 @@ pub struct ChatTurnResult {
 #[derive(Clone)]
 pub struct InsightChatService {
    generator: Arc<InsightGenerator>,
-    ollama: OllamaClient,
-    openrouter: Option<Arc<OpenRouterClient>>,
    insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
    chat_locks: ChatLockMap,
 }
@@ -100,15 +97,11 @@ pub struct InsightChatService {
 impl InsightChatService {
    pub fn new(
        generator: Arc<InsightGenerator>,
-        ollama: OllamaClient,
-        openrouter: Option<Arc<OpenRouterClient>>,
        insight_dao: Arc<Mutex<Box<dyn InsightDao>>>,
        chat_locks: ChatLockMap,
    ) -> Self {
        Self {
            generator,
-            ollama,
-            openrouter,
            insight_dao,
            chat_locks,
        }
@@ -303,24 +296,10 @@ impl InsightChatService {
            .map(|s| s.trim().to_lowercase())
            .filter(|s| !s.is_empty())
            .unwrap_or_else(|| stored_backend.clone());
-        if !matches!(effective_backend.as_str(), "local" | "hybrid") {
-            bail!(
-                "unknown backend '{}'; expected 'local' or 'hybrid'",
-                effective_backend
-            );
-        }
-        if stored_backend == "local" && effective_backend == "hybrid" {
-            bail!(
-                "switching from local to hybrid mid-chat isn't supported yet; \
-                 regenerate the insight in hybrid mode if you want OpenRouter chat"
-            );
-        }
-        let is_hybrid = effective_backend == "hybrid";
-        span.set_attribute(KeyValue::new("backend", effective_backend.clone()));
+        validate_cross_replay(&stored_backend, &effective_backend)?;
+        let kind = BackendKind::parse(&effective_backend)?;
+        span.set_attribute(KeyValue::new("backend", kind.as_str()));

-        // 4. Build the chat backend client. Ollama in local mode, a freshly
-        //    cloned OpenRouter client in hybrid mode (clone so per-request
-        //    sampling/model overrides don't leak into shared state).
        let max_iterations = req
            .max_iterations
            .unwrap_or(DEFAULT_MAX_ITERATIONS)
@@ -328,91 +307,38 @@ impl InsightChatService {
        span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64));

        let stored_model = insight.model_version.clone();
-        let custom_model = req
-            .model
-            .clone()
-            .or_else(|| Some(stored_model.clone()))
-            .filter(|m| !m.is_empty());
-
-        let mut ollama_client = self.ollama.clone();
-        let mut openrouter_client: Option<OpenRouterClient> = None;
-
-        if is_hybrid {
-            let arc = self.openrouter.as_ref().ok_or_else(|| {
-                anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
-            })?;
-            let mut c: OpenRouterClient = (**arc).clone();
-            if let Some(ref m) = custom_model {
-                c.primary_model = m.clone();
-            }
-            if req.temperature.is_some()
-                || req.top_p.is_some()
-                || req.top_k.is_some()
-                || req.min_p.is_some()
-            {
-                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
-            }
-            if let Some(ctx) = req.num_ctx {
-                c.set_num_ctx(Some(ctx));
-            }
-            openrouter_client = Some(c);
-        } else {
-            // Local-mode model swap. Build a new client when the chat model
-            // differs from the configured one (mirrors the agentic pattern).
-            if let Some(ref m) = custom_model
-                && m != &self.ollama.primary_model
-            {
-                ollama_client = OllamaClient::new(
-                    self.ollama.primary_url.clone(),
-                    self.ollama.fallback_url.clone(),
-                    m.clone(),
-                    Some(m.clone()),
-                );
-            }
-            if req.temperature.is_some()
-                || req.top_p.is_some()
-                || req.top_k.is_some()
-                || req.min_p.is_some()
-            {
-                ollama_client.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
-            }
-            if let Some(ctx) = req.num_ctx {
-                ollama_client.set_num_ctx(Some(ctx));
-            }
-        }
-
-        let chat_backend: &dyn LlmClient = if let Some(ref c) = openrouter_client {
-            c
-        } else {
-            &ollama_client
+        let overrides = SamplingOverrides {
+            model: req
+                .model
+                .clone()
+                .or_else(|| Some(stored_model.clone()))
+                .filter(|m| !m.is_empty()),
+            num_ctx: req.num_ctx,
+            temperature: req.temperature,
+            top_p: req.top_p,
+            top_k: req.top_k,
+            min_p: req.min_p,
        };
-        let model_used = chat_backend.primary_model().to_string();
+        let backend = self.generator.resolve_backend(kind, &overrides).await?;
+        let model_used = backend.model().to_string();
        span.set_attribute(KeyValue::new("model", model_used.clone()));

-        // 5. Decide vision + tool set. In hybrid we always omit
-        //    `describe_photo` (matches the original generation flow). In
-        //    local we trust the stored history's first-user shape: if it
-        //    carries `images`, the original model was vision-capable, and
-        //    we keep `describe_photo` available.
+        // 5. Decide vision + tool set. In hybrid (describe-then-inline) mode
+        //    we omit `describe_photo`. Otherwise trust the stored history:
+        //    if the first user message carries images, describe_photo stays.
        let local_first_user_has_image = messages
            .iter()
            .find(|m| m.role == "user")
            .and_then(|m| m.images.as_ref())
            .map(|imgs| !imgs.is_empty())
            .unwrap_or(false);
-        let offer_describe_tool = !is_hybrid && local_first_user_has_image;
-        // current_gate_opts(has_vision) sets gate_opts.has_vision = has_vision
-        // and probes the per-table presence flags. Pass `offer_describe_tool`
-        // directly — the `!is_hybrid && local_first_user_has_image` decision
-        // is the chat-path's vision predicate.
+        let offer_describe_tool = backend.images_inline && local_first_user_has_image;
        let gate_opts = self.generator.current_gate_opts_for_persona(
            offer_describe_tool,
            Some((req.user_id, &active_persona)),
        );
        let tools = InsightGenerator::build_tool_definitions(gate_opts);

-        // Image base64 only needed when describe_photo is on the menu. Load
-        // lazily to avoid disk IO when the loop never invokes it.
        let image_base64: Option<String> = if offer_describe_tool {
            self.generator.load_image_as_base64(&normalized).ok()
        } else {
@@ -461,13 +387,13 @@ impl InsightChatService {
            iterations_used = iteration + 1;
            log::info!("Chat iteration {}/{}", iterations_used, max_iterations);

-            let (response, prompt_tokens, eval_tokens) = chat_backend
+            let (response, prompt_tokens, eval_tokens) = backend
+                .chat()
                .chat_with_tools(messages.clone(), tools.clone())
                .await?;
            last_prompt_eval_count = prompt_tokens;
            last_eval_count = eval_tokens;

-            // Ollama rejects non-object tool-call arguments on replay.
            let mut response = response;
            if let Some(ref mut tcs) = response.tool_calls {
                for tc in tcs.iter_mut() {
@@ -495,13 +421,11 @@ impl InsightChatService {
                        .execute_tool(
                            &tool_call.function.name,
                            &tool_call.function.arguments,
-                            &ollama_client,
+                            &backend,
                            &image_base64,
                            &normalized,
                            req.user_id,
                            &active_persona,
-                            &model_used,
-                            &effective_backend,
                            &loop_cx,
                        )
                        .await;
@@ -515,8 +439,6 @@ impl InsightChatService {
        }

        if final_content.is_empty() {
-            // The model never produced a final answer; ask once more without
-            // tools to force a textual reply.
            log::info!(
                "Chat loop exhausted after {} iterations, requesting final answer",
                iterations_used
@@ -524,7 +446,8 @@ impl InsightChatService {
            messages.push(ChatMessage::user(
                "Please write your final answer now without calling any more tools.",
            ));
-            let (final_response, prompt_tokens, eval_tokens) = chat_backend
+            let (final_response, prompt_tokens, eval_tokens) = backend
+                .chat()
                .chat_with_tools(messages.clone(), vec![])
                .await?;
            last_prompt_eval_count = prompt_tokens;
@@ -560,7 +483,8 @@ impl InsightChatService {
                 Capture the key moment or theme. Return ONLY the title, nothing else.",
                final_content
            );
-            let title_raw = chat_backend
+            let title_raw = backend
+                .chat()
                .generate(
                    &title_prompt,
                    Some(
@@ -585,7 +509,7 @@ impl InsightChatService {
                model_version: model_used.clone(),
                is_current: true,
                training_messages: Some(json),
-                backend: effective_backend.clone(),
+                backend: kind.as_str().to_string(),
                fewshot_source_ids: None,
                content_hash: None,
            };
@@ -610,7 +534,7 @@ impl InsightChatService {
            prompt_eval_count: last_prompt_eval_count,
            eval_count: last_eval_count,
            amended_insight_id,
-            backend_used: effective_backend,
+            backend_used: kind.as_str().to_string(),
            model_used,
        })
    }
@@ -799,19 +723,8 @@ impl InsightChatService {
            .map(|s| s.trim().to_lowercase())
            .filter(|s| !s.is_empty())
            .unwrap_or_else(|| stored_backend.clone());
-        if !matches!(effective_backend.as_str(), "local" | "hybrid") {
-            bail!(
-                "unknown backend '{}'; expected 'local' or 'hybrid'",
-                effective_backend
-            );
-        }
-        if stored_backend == "local" && effective_backend == "hybrid" {
-            bail!(
-                "switching from local to hybrid mid-chat isn't supported yet; \
-                 regenerate the insight in hybrid mode if you want OpenRouter chat"
-            );
-        }
-        let is_hybrid = effective_backend == "hybrid";
+        let kind = BackendKind::parse(&effective_backend)?;
+        validate_cross_replay(&stored_backend, kind.as_str())?;

        let max_iterations = req
            .max_iterations
@@ -819,27 +732,31 @@ impl InsightChatService {
            .clamp(1, env_max_iterations());

        let stored_model = insight.model_version.clone();
-        let custom_model = req
-            .model
-            .clone()
-            .or_else(|| Some(stored_model.clone()))
-            .filter(|m| !m.is_empty());
+        let overrides = SamplingOverrides {
+            model: req
+                .model
+                .clone()
+                .or_else(|| Some(stored_model.clone()))
+                .filter(|m| !m.is_empty()),
+            num_ctx: req.num_ctx,
+            temperature: req.temperature,
+            top_p: req.top_p,
+            top_k: req.top_k,
+            min_p: req.min_p,
+        };
+        let backend = self.generator.resolve_backend(kind, &overrides).await?;
+        let model_used = backend.model().to_string();

-        let (chat_backend_holder, ollama_client) =
-            self.build_chat_clients(is_hybrid, custom_model.as_deref(), &req)?;
-        let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
-        let model_used = chat_backend.primary_model().to_string();
-
-        // Tool set — local mode + first user turn carries an image →
-        // offer describe_photo. Hybrid: visual description was inlined
-        // when the insight was bootstrapped, no describe tool needed.
+        // Tool set — images_inline mode + first user turn carries an image →
+        // offer describe_photo. Describe-then-inline mode (hybrid only):
+        // visual description was inlined at bootstrap, no describe tool needed.
        let local_first_user_has_image = messages
            .iter()
            .find(|m| m.role == "user")
            .and_then(|m| m.images.as_ref())
            .map(|imgs| !imgs.is_empty())
            .unwrap_or(false);
-        let offer_describe_tool = !is_hybrid && local_first_user_has_image;
+        let offer_describe_tool = backend.images_inline && local_first_user_has_image;
        let gate_opts = self.generator.current_gate_opts_for_persona(
            offer_describe_tool,
            Some((req.user_id, &active_persona)),
@@ -870,16 +787,13 @@ impl InsightChatService {

        let outcome = self
            .run_streaming_agentic_loop(
-                chat_backend,
-                &ollama_client,
+                &backend,
                &mut messages,
                tools,
                &image_base64,
                &normalized,
                req.user_id,
                &active_persona,
-                &model_used,
-                &effective_backend,
                max_iterations,
                &tx,
            )
@@ -907,7 +821,8 @@ impl InsightChatService {

        let mut amended_insight_id: Option<i32> = None;
        if req.amend {
-            let title = self.generate_title(chat_backend, &final_content).await?;
+            let (title, body) = crate::ai::insight_generator::parse_title_body(&final_content);
+            let final_content = body;

            // Amended rows intentionally do not inherit the parent's
            // `fewshot_source_ids`. The parent's few-shot influence is still
@@ -923,7 +838,7 @@ impl InsightChatService {
                model_version: model_used.clone(),
                is_current: true,
                training_messages: Some(json),
-                backend: effective_backend.clone(),
+                backend: kind.as_str().to_string(),
                fewshot_source_ids: None,
                content_hash: None,
            };
@@ -949,7 +864,7 @@ impl InsightChatService {
                eval_tokens: last_eval_count,
                num_ctx: req.num_ctx,
                amended_insight_id,
-                backend_used: effective_backend,
+                backend_used: kind.as_str().to_string(),
                model_used,
            })
            .await;
@@ -975,18 +890,23 @@ impl InsightChatService {
            .filter(|s| !s.trim().is_empty())
            .unwrap_or_else(|| "default".to_string());
        let effective_backend = resolve_bootstrap_backend(req.backend.as_deref())?;
-        let is_hybrid = effective_backend == "hybrid";
+        let kind = BackendKind::parse(&effective_backend)?;

        let max_iterations = req
            .max_iterations
            .unwrap_or(DEFAULT_MAX_ITERATIONS)
            .clamp(1, env_max_iterations());

-        let custom_model = req.model.clone().filter(|m| !m.is_empty());
-        let (chat_backend_holder, ollama_client) =
-            self.build_chat_clients(is_hybrid, custom_model.as_deref(), &req)?;
-        let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
-        let model_used = chat_backend.primary_model().to_string();
+        let overrides = SamplingOverrides {
+            model: req.model.clone().filter(|m| !m.is_empty()),
+            num_ctx: req.num_ctx,
+            temperature: req.temperature,
+            top_p: req.top_p,
+            top_k: req.top_k,
+            min_p: req.min_p,
+        };
+        let backend = self.generator.resolve_backend(kind, &overrides).await?;
+        let model_used = backend.model().to_string();

        // Load image bytes once. RAW preview fallback is handled inside
        // load_image_as_base64. Errors degrade silently — a chat that
@@ -1007,18 +927,17 @@ impl InsightChatService {
                _ => None,
            });

-        // Hybrid backend: pre-describe the image via local Ollama vision
-        // so OpenRouter chat models (which can't see images directly) get
-        // the visual description as text. Mirrors the same pre-describe
-        // pass that `generate_agentic_insight_for_photo` does for hybrid.
-        let visual_block = if is_hybrid {
+        // Describe-then-inline (hybrid only): pre-describe the image so a
+        // text-only chat model gets the visual description inline.
+        // images_inline backends send images directly to the chat model.
+        let visual_block = if !backend.images_inline {
            match image_base64.as_deref() {
-                Some(b64) => match self.ollama.describe_image(b64).await {
+                Some(b64) => match backend.local().describe_image(b64).await {
                    Ok(desc) => {
                        format!("Visual description (from local vision model):\n{}\n", desc)
                    }
                    Err(e) => {
-                        log::warn!("hybrid bootstrap: local describe_image failed: {}", e);
+                        log::warn!("{} bootstrap: describe_image failed: {}", kind.as_str(), e);
                        String::new()
                    }
                },
@@ -1028,10 +947,10 @@ impl InsightChatService {
            String::new()
        };

-        // Tool gates. Local + image present → expose describe_photo so
-        // the chat model can re-look at the photo on demand. Hybrid:
+        // Tool gates. images_inline + image present → expose describe_photo so
+        // the chat model can re-look at the photo on demand. Otherwise the description is
        // already inlined, no tool needed.
-        let offer_describe_tool = !is_hybrid && image_base64.is_some();
+        let offer_describe_tool = backend.images_inline && image_base64.is_some();
        let gate_opts = self.generator.current_gate_opts_for_persona(
            offer_describe_tool,
            Some((req.user_id, &active_persona)),
@@ -1057,23 +976,22 @@ impl InsightChatService {
        );
        let system_msg = ChatMessage::system(system_content);
        let mut user_msg = ChatMessage::user(req.user_message.clone());
-        if !is_hybrid && let Some(ref img) = image_base64 {
-            user_msg.images = Some(vec![img.clone()]);
+        if backend.images_inline {
+            if let Some(ref img) = image_base64 {
+                user_msg.images = Some(vec![img.clone()]);
+            }
        }
        let mut messages = vec![system_msg, user_msg];

        let outcome = self
            .run_streaming_agentic_loop(
-                chat_backend,
-                &ollama_client,
+                &backend,
                &mut messages,
                tools,
                &image_base64,
                &normalized,
                req.user_id,
                &active_persona,
-                &model_used,
-                &effective_backend,
                max_iterations,
                &tx,
            )
@@ -1086,7 +1004,7 @@ impl InsightChatService {
            final_content,
        } = outcome;

-        let title = self.generate_title(chat_backend, &final_content).await?;
+        let (title, body) = crate::ai::insight_generator::parse_title_body(&final_content);

        let json = serde_json::to_string(&messages)
            .map_err(|e| anyhow!("failed to serialize chat history: {}", e))?;
@@ -1094,12 +1012,12 @@ impl InsightChatService {
            library_id: req.library_id,
            file_path: normalized.clone(),
            title,
-            summary: final_content,
+            summary: body,
            generated_at: Utc::now().timestamp(),
            model_version: model_used.clone(),
            is_current: true,
            training_messages: Some(json),
-            backend: effective_backend.clone(),
+            backend: kind.as_str().to_string(),
            fewshot_source_ids: None,
            content_hash: None,
        };
@@ -1122,7 +1040,7 @@ impl InsightChatService {
                eval_tokens: last_eval_count,
                num_ctx: req.num_ctx,
                amended_insight_id: Some(stored.id),
-                backend_used: effective_backend,
+                backend_used: kind.as_str().to_string(),
                model_used,
            })
            .await;
@@ -1130,105 +1048,19 @@ impl InsightChatService {
        Ok(())
    }

-    /// Set up chat clients (Ollama + optional OpenRouter) shared by
-    /// bootstrap and continuation. Returns the chat-side backend client
-    /// (boxed because hybrid and local return different concrete types)
-    /// and the Ollama client used for describe-image / local tool calls.
-    fn build_chat_clients(
-        &self,
-        is_hybrid: bool,
-        custom_model: Option<&str>,
-        req: &ChatTurnRequest,
-    ) -> Result<(Box<dyn LlmClient>, OllamaClient)> {
-        let mut ollama_client = self.ollama.clone();
-
-        if is_hybrid {
-            let arc = self.openrouter.as_ref().ok_or_else(|| {
-                anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
-            })?;
-            let mut c: OpenRouterClient = (**arc).clone();
-            if let Some(m) = custom_model {
-                c.primary_model = m.to_string();
-            }
-            if req.temperature.is_some()
-                || req.top_p.is_some()
-                || req.top_k.is_some()
-                || req.min_p.is_some()
-            {
-                c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
-            }
-            if let Some(ctx) = req.num_ctx {
-                c.set_num_ctx(Some(ctx));
-            }
-            return Ok((Box::new(c), ollama_client));
-        }
-
-        if let Some(m) = custom_model
-            && m != self.ollama.primary_model
-        {
-            ollama_client = OllamaClient::new(
-                self.ollama.primary_url.clone(),
-                self.ollama.fallback_url.clone(),
-                m.to_string(),
-                Some(m.to_string()),
-            );
-        }
-        if req.temperature.is_some()
-            || req.top_p.is_some()
-            || req.top_k.is_some()
-            || req.min_p.is_some()
-        {
-            ollama_client.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p);
-        }
-        if let Some(ctx) = req.num_ctx {
-            ollama_client.set_num_ctx(Some(ctx));
-        }
-        Ok((Box::new(ollama_client.clone()), ollama_client))
-    }
-
-    /// Generate a short title via the same chat backend so voice stays
-    /// consistent with the body. Mirrors generate_agentic_insight_for_photo's
-    /// titling pass.
-    async fn generate_title(
-        &self,
-        chat_backend: &dyn LlmClient,
-        final_content: &str,
-    ) -> Result<String> {
-        let title_prompt = format!(
-            "Create a short title (maximum 8 words) for the following journal entry:\n\n{}\n\n\
-             Capture the key moment or theme. Return ONLY the title, nothing else.",
-            final_content
-        );
-        let title_raw = chat_backend
-            .generate(
-                &title_prompt,
-                Some(
-                    "You are my long term memory assistant. Use only the information provided. Do not invent details.",
-                ),
-                None,
-            )
-            .await?;
-        Ok(title_raw.trim().trim_matches('"').to_string())
-    }
-
    /// Drive the agentic loop with streaming SSE events. Shared between
    /// bootstrap and continuation. Mutates `messages` in place (response
    /// turns + tool results are appended) and returns counters + the
    /// final assistant content.
    async fn run_streaming_agentic_loop(
        &self,
-        chat_backend: &dyn LlmClient,
-        ollama_client: &OllamaClient,
+        backend: &ResolvedBackend,
        messages: &mut Vec<ChatMessage>,
        tools: Vec<Tool>,
        image_base64: &Option<String>,
        normalized: &str,
        user_id: i32,
        active_persona: &str,
-        // Provenance — stamped onto any store_fact tool call made
-        // during this loop. Mirrors the non-streaming chat path.
-        model_used: &str,
-        effective_backend: &str,
        max_iterations: usize,
        tx: &tokio::sync::mpsc::Sender<ChatStreamEvent>,
    ) -> Result<AgenticLoopOutcome> {
@@ -1247,7 +1079,8 @@ impl InsightChatService {
                })
                .await;

-            let mut stream = chat_backend
+            let mut stream = backend
+                .chat()
                .chat_with_tools_stream(messages.clone(), tools.clone())
                .await?;

@@ -1304,13 +1137,11 @@ impl InsightChatService {
                        .execute_tool(
                            &tool_call.function.name,
                            &tool_call.function.arguments,
-                            ollama_client,
+                            backend,
                            image_base64,
                            normalized,
                            user_id,
                            active_persona,
-                            model_used,
-                            effective_backend,
                            &cx,
                        )
                        .await;
@@ -1345,7 +1176,8 @@ impl InsightChatService {
            messages.push(ChatMessage::user(
                "Please write your final answer now without calling any more tools.",
            ));
-            let mut stream = chat_backend
+            let mut stream = backend
+                .chat()
                .chat_with_tools_stream(messages.clone(), vec![])
                .await?;
            let mut final_message: Option<ChatMessage> = None;
@@ -1459,6 +1291,34 @@ fn resolve_date_taken_for_context(
        .map(|dt| dt.format("%Y-%m-%d").to_string())
 }

+/// Validate a stored→effective backend transition for a chat continuation.
+/// Continuation runs against a transcript that was generated with a specific
+/// backend; the only blocked transition is `local → hybrid`, because the
+/// stored transcript has images embedded in the first user message and the
+/// hybrid path (OpenRouter chat with describe-then-inline) can't replay
+/// raw image bytes through OpenRouter consistently across providers.
+/// `hybrid → local` is allowed (the inlined description replays verbatim
+/// as text).
+///
+/// Whether "local" routes through Ollama or llama-swap is decided at
+/// startup by `LLM_BACKEND`; both share the same transcript shape from
+/// the chat-replay perspective.
+fn validate_cross_replay(stored: &str, effective: &str) -> Result<()> {
+    if !matches!(effective, "local" | "hybrid") {
+        bail!(
+            "unknown backend '{}'; expected 'local' or 'hybrid'",
+            effective
+        );
+    }
+    if stored == "local" && effective == "hybrid" {
+        bail!(
+            "switching from local to hybrid mid-chat isn't supported; \
+             regenerate the insight in hybrid mode if you want OpenRouter chat"
+        );
+    }
+    Ok(())
+}
+
 /// Pick the backend label for bootstrap. Bootstrap has no stored insight
 /// to defer to (that's continuation's behaviour), so the default is
 /// `"local"`. Returns an error if the supplied label is non-empty but
@@ -2082,10 +1942,40 @@ mod tests {

    #[test]
    fn bootstrap_backend_rejects_unknown_label() {
-        let err = resolve_bootstrap_backend(Some("openrouter")).unwrap_err();
-        let msg = format!("{}", err);
-        assert!(msg.contains("unknown backend"));
-        assert!(msg.contains("openrouter"));
+        // `llamacpp` is no longer a per-request backend value — it's chosen
+        // at deploy time via `LLM_BACKEND`.
+        for label in &["openrouter", "llamacpp", "ollama"] {
+            let err = resolve_bootstrap_backend(Some(label)).unwrap_err();
+            let msg = format!("{}", err);
+            assert!(msg.contains("unknown backend"), "label={}", label);
+        }
+    }
+
+    #[test]
+    fn cross_replay_rejects_local_to_hybrid() {
+        let err = validate_cross_replay("local", "hybrid").unwrap_err();
+        assert!(format!("{}", err).contains("local to hybrid"));
+    }
+
+    #[test]
+    fn cross_replay_allows_supported_transitions() {
+        assert!(validate_cross_replay("local", "local").is_ok());
+        assert!(validate_cross_replay("hybrid", "hybrid").is_ok());
+        // Hybrid → local replays the inlined description as plain text.
+        assert!(validate_cross_replay("hybrid", "local").is_ok());
+    }
+
+    #[test]
+    fn cross_replay_rejects_unknown_effective() {
+        // Both "openrouter" and the former "llamacpp" value are unknown now.
+        for label in &["openrouter", "llamacpp"] {
+            let err = validate_cross_replay("local", label).unwrap_err();
+            assert!(
+                format!("{}", err).contains("unknown backend"),
+                "label={}",
+                label
+            );
+        }
    }

    #[test]
--- a/src/ai/insight_generator.rs
+++ b/src/ai/insight_generator.rs
--- a/src/ai/llamacpp.rs
+++ b/src/ai/llamacpp.rs
--- a/src/ai/mod.rs
+++ b/src/ai/mod.rs
@@ -1,10 +1,12 @@
 pub mod apollo_client;
+pub mod backend;
 pub mod clip_client;
 pub mod daily_summary_job;
 pub mod face_client;
 pub mod handlers;
 pub mod insight_chat;
 pub mod insight_generator;
+pub mod llamacpp;
 pub mod llm_client;
 pub mod ollama;
 pub mod openrouter;
@@ -23,6 +25,7 @@ pub use handlers::{
    get_insight_handler, get_openrouter_models_handler, rate_insight_handler,
 };
 pub use insight_generator::InsightGenerator;
+pub use llamacpp::LlamaCppClient;
 #[allow(unused_imports)]
 pub use llm_client::{
    ChatMessage, LlmClient, ModelCapabilities, Tool, ToolCall, ToolCallFunction, ToolFunction,
@@ -38,3 +41,87 @@ pub use sms_client::{SmsApiClient, SmsMessage};
 pub fn user_display_name() -> String {
    std::env::var("USER_NAME").unwrap_or_else(|_| "Me".to_string())
 }
+
+/// One switch for the "local" LLM stack: when `LLM_BACKEND=llamacpp` is
+/// set, chat / vision describe / embeddings all route through llama-swap
+/// instead of Ollama. Any other value (including unset, the default) is
+/// Ollama. This is intentionally global — embeddings must be drawn from
+/// a single source or similarity search across the index breaks (mixed
+/// vector spaces, possibly mixed dims). The `backend=hybrid` per-request
+/// override remains orthogonal: it always sends chat to OpenRouter, and
+/// uses `LLM_BACKEND` for the describe-then-inline vision pass.
+pub fn local_backend_is_llamacpp() -> bool {
+    matches!(
+        std::env::var("LLM_BACKEND")
+            .ok()
+            .as_deref()
+            .map(|s| s.trim().to_lowercase())
+            .as_deref(),
+        Some("llamacpp")
+    )
+}
+
+/// Embed one string via the configured local backend. Routes through
+/// llama-swap when `LLM_BACKEND=llamacpp` (and a client is configured),
+/// else Ollama. Returns the single embedding vector. See
+/// [`local_backend_is_llamacpp`] for the rationale on consistency.
+pub async fn embed_one(
+    ollama: &OllamaClient,
+    llamacpp: Option<&LlamaCppClient>,
+    text: &str,
+) -> anyhow::Result<Vec<f32>> {
+    if local_backend_is_llamacpp() {
+        if let Some(lc) = llamacpp {
+            let mut vecs = <LlamaCppClient as LlmClient>::generate_embeddings(lc, &[text]).await?;
+            return vecs
+                .pop()
+                .ok_or_else(|| anyhow::anyhow!("llama-swap returned no embeddings"));
+        }
+        log::warn!(
+            "LLM_BACKEND=llamacpp but LlamaCppClient is unconfigured; falling back to Ollama embeddings"
+        );
+    }
+    ollama.generate_embedding(text).await
+}
+
+#[cfg(test)]
+mod env_dispatch_tests {
+    use super::*;
+
+    fn with_env<F: FnOnce()>(key: &str, val: Option<&str>, f: F) {
+        let prev = std::env::var(key).ok();
+        match val {
+            Some(v) => unsafe { std::env::set_var(key, v) },
+            None => unsafe { std::env::remove_var(key) },
+        }
+        f();
+        match prev {
+            Some(v) => unsafe { std::env::set_var(key, v) },
+            None => unsafe { std::env::remove_var(key) },
+        }
+    }
+
+    #[test]
+    fn llm_backend_defaults_to_ollama() {
+        with_env("LLM_BACKEND", None, || {
+            assert!(!local_backend_is_llamacpp());
+        });
+    }
+
+    #[test]
+    fn llm_backend_llamacpp_case_insensitive() {
+        with_env("LLM_BACKEND", Some("LlamaCpp"), || {
+            assert!(local_backend_is_llamacpp());
+        });
+        with_env("LLM_BACKEND", Some("  llamacpp "), || {
+            assert!(local_backend_is_llamacpp());
+        });
+    }
+
+    #[test]
+    fn llm_backend_unknown_value_is_ollama() {
+        with_env("LLM_BACKEND", Some("vllm"), || {
+            assert!(!local_backend_is_llamacpp());
+        });
+    }
+}
--- a/src/ai/sms_client.rs
+++ b/src/ai/sms_client.rs
@@ -281,6 +281,9 @@ impl SmsApiClient {
        if let Some(cid) = params.contact_id {
            url.push_str(&format!("&contact_id={}", cid));
        }
+        if let Some(ref c) = params.contact {
+            url.push_str(&format!("&contact={}", urlencoding::encode(c)));
+        }
        if let Some(off) = params.offset {
            url.push_str(&format!("&offset={}", off));
        }
@@ -413,6 +416,9 @@ pub struct SmsSearchParams<'a> {
    pub mode: &'a str,
    pub limit: usize,
    pub contact_id: Option<i64>,
+    /// Contact name (case-insensitive). Resolved to a numeric ID by the
+    /// SMS-API server when `contact_id` is not set.
+    pub contact: Option<String>,
    /// Unix-seconds inclusive lower bound on `date`.
    pub date_from: Option<i64>,
    /// Unix-seconds inclusive upper bound on `date`.
--- a/src/bin/populate_knowledge.rs
+++ b/src/bin/populate_knowledge.rs
@@ -195,6 +195,7 @@ async fn main() -> anyhow::Result<()> {
    let generator = InsightGenerator::new(
        ollama,
        None,
+        None,
        sms_client,
        apollo_client,
        insight_dao.clone(),
--- a/src/content_hash.rs
+++ b/src/content_hash.rs
@@ -62,6 +62,15 @@ pub fn large_preview_path(thumbs_dir: &Path, hash: &str) -> PathBuf {
        .join(format!("{}.jpg", hash))
 }

+/// Hash-keyed xlarge-preview path: `<thumbs_dir>/_xlarge/<hash[..2]>/<hash>.jpg`.
+pub fn xlarge_preview_path(thumbs_dir: &Path, hash: &str) -> PathBuf {
+    let shard = shard_prefix(hash);
+    thumbs_dir
+        .join("_xlarge")
+        .join(shard)
+        .join(format!("{}.jpg", hash))
+}
+
 /// Hash-keyed HLS output directory: `<video_dir>/<hash[..2]>/<hash>/`.
 /// The playlist lives at `playlist.m3u8` inside this directory and its
 /// segments are co-located so HLS relative references Just Work. See
--- a/src/data/mod.rs
+++ b/src/data/mod.rs
@@ -194,6 +194,7 @@ pub enum MediaType {
 #[serde(rename_all = "lowercase")]
 pub enum PhotoSize {
    Full,
+    XLarge,
    Large,
    Thumb,
 }
--- a/src/handlers/image.rs
+++ b/src/handlers/image.rs
@@ -83,12 +83,14 @@ pub async fn get_image(
    if let Some((library, path)) = resolved {
        let image_size = req.size.unwrap_or(PhotoSize::Full);

-        // `size=large` is only meaningful for stills — there's no useful
-        // "2048px video preview" tier. Videos fall back to the existing
-        // thumb pipeline (which already handles gif/static selection).
-        // `mut` so the Large branch can downgrade itself to `Full` after a
-        // generation failure (RAW-preview branch below keys off `Full`).
-        let mut image_size = if image_size == PhotoSize::Large && file_types::is_video_file(&path) {
+        // `size=large|xlarge` is only meaningful for stills — there's no
+        // useful "resized video preview" tier. Videos fall back to the
+        // existing thumb pipeline (which already handles gif/static
+        // selection). `mut` so preview branches can downgrade to `Full`
+        // after a generation failure.
+        let mut image_size = if (image_size == PhotoSize::Large || image_size == PhotoSize::XLarge)
+            && file_types::is_video_file(&path)
+        {
            PhotoSize::Thumb
        } else {
            image_size
@@ -196,6 +198,93 @@ pub async fn get_image(
            image_size = PhotoSize::Full;
        }

+        if image_size == PhotoSize::XLarge {
+            let relative_path = path
+                .strip_prefix(&library.root_path)
+                .expect("Error stripping library root prefix from xlarge preview");
+            let relative_path_str = relative_path.to_string_lossy().replace('\\', "/");
+            let thumbs = Path::new(&app_state.thumbnail_path);
+            let xlarge_dir = thumbs.join("_xlarge");
+
+            let hash_xlarge_path: Option<PathBuf> = {
+                let mut dao = exif_dao.lock().expect("Unable to lock ExifDao");
+                match dao.get_exif(&context, &relative_path_str) {
+                    Ok(Some(row)) => row
+                        .content_hash
+                        .as_deref()
+                        .map(|h| content_hash::xlarge_preview_path(thumbs, h)),
+                    _ => None,
+                }
+            };
+            let scoped_legacy_xlarge_path =
+                content_hash::library_scoped_legacy_path(&xlarge_dir, library.id, relative_path);
+
+            let existing = hash_xlarge_path
+                .as_ref()
+                .filter(|p| p.exists())
+                .cloned()
+                .or_else(|| {
+                    if scoped_legacy_xlarge_path.exists() {
+                        Some(scoped_legacy_xlarge_path.clone())
+                    } else {
+                        None
+                    }
+                });
+
+            if let Some(found) = existing {
+                if let Ok(file) = NamedFile::open(&found) {
+                    span.set_status(Status::Ok);
+                    return file
+                        .use_etag(true)
+                        .use_last_modified(true)
+                        .prefer_utf8(true)
+                        .into_response(&request);
+                }
+            }
+
+            let dest = hash_xlarge_path
+                .clone()
+                .unwrap_or_else(|| scoped_legacy_xlarge_path.clone());
+            let src = path.clone();
+            let dest_for_block = dest.clone();
+            let generated = web::block(move || {
+                if let Some(parent) = dest_for_block.parent() {
+                    std::fs::create_dir_all(parent)?;
+                }
+                let tmp = dest_for_block.with_extension("jpg.tmp");
+                crate::thumbnails::generate_xlarge_preview(&src, &tmp)?;
+                std::fs::rename(&tmp, &dest_for_block)?;
+                Ok::<(), std::io::Error>(())
+            })
+            .await;
+
+            match generated {
+                Ok(Ok(())) => {
+                    if let Ok(file) = NamedFile::open(&dest) {
+                        span.set_status(Status::Ok);
+                        return file
+                            .use_etag(true)
+                            .use_last_modified(true)
+                            .prefer_utf8(true)
+                            .into_response(&request);
+                    }
+                }
+                Ok(Err(e)) => {
+                    warn!(
+                        "XLarge preview generation failed for {:?}: {} — falling back to original",
+                        path, e
+                    );
+                }
+                Err(e) => {
+                    warn!(
+                        "XLarge preview blocking-pool error for {:?}: {} — falling back to original",
+                        path, e
+                    );
+                }
+            }
+            image_size = PhotoSize::Full;
+        }
+
        if image_size == PhotoSize::Thumb {
            let relative_path = path
                .strip_prefix(&library.root_path)
--- a/src/state.rs
+++ b/src/state.rs
@@ -2,6 +2,7 @@ use crate::ai::apollo_client::ApolloClient;
 use crate::ai::clip_client::ClipClient;
 use crate::ai::face_client::FaceClient;
 use crate::ai::insight_chat::{ChatLockMap, InsightChatService};
+use crate::ai::llamacpp::LlamaCppClient;
 use crate::ai::openrouter::OpenRouterClient;
 use crate::ai::{InsightGenerator, OllamaClient, SmsApiClient};
 use crate::database::{
@@ -62,6 +63,16 @@ pub struct AppState {
    /// Curated list of OpenRouter model ids exposed to clients. Sourced from
    /// `OPENROUTER_ALLOWED_MODELS` (comma-separated). Empty when unset.
    pub openrouter_allowed_models: Vec<String>,
+    /// `None` when `LLAMA_SWAP_URL` is not configured. Not a per-request
+    /// backend value: used only when `LLM_BACKEND=llamacpp` is set at deploy
+    /// time. Same shape as the `openrouter` slot — present here so handlers
+    /// can route to it without threading through the generator.
+    #[allow(dead_code)]
+    pub llamacpp: Option<Arc<LlamaCppClient>>,
+    /// Curated list of llama-swap model ids exposed to clients. Sourced from
+    /// `LLAMA_SWAP_ALLOWED_MODELS` (comma-separated). Empty when unset; the
+    /// server then falls back to `LLAMA_SWAP_PRIMARY_MODEL`.
+    pub llamacpp_allowed_models: Vec<String>,
    pub sms_client: SmsApiClient,
    pub insight_generator: InsightGenerator,
    /// Chat continuation service. Hold an Arc so handlers can clone cheaply.
@@ -105,6 +116,8 @@ impl AppState {
        ollama: OllamaClient,
        openrouter: Option<Arc<OpenRouterClient>>,
        openrouter_allowed_models: Vec<String>,
+        llamacpp: Option<Arc<LlamaCppClient>>,
+        llamacpp_allowed_models: Vec<String>,
        sms_client: SmsApiClient,
        insight_generator: InsightGenerator,
        insight_chat: Arc<InsightChatService>,
@@ -145,6 +158,8 @@ impl AppState {
            ollama,
            openrouter,
            openrouter_allowed_models,
+            llamacpp,
+            llamacpp_allowed_models,
            sms_client,
            insight_generator,
            insight_chat,
@@ -186,6 +201,9 @@ impl Default for AppState {
        let openrouter = build_openrouter_from_env();
        let openrouter_allowed_models = parse_openrouter_allowed_models();

+        let llamacpp = build_llamacpp_from_env();
+        let llamacpp_allowed_models = parse_llamacpp_allowed_models();
+
        let sms_api_url =
            env::var("SMS_API_URL").unwrap_or_else(|_| "http://localhost:8000".to_string());
        let sms_api_token = env::var("SMS_API_TOKEN").ok();
@@ -250,6 +268,7 @@ impl Default for AppState {
        let insight_generator = InsightGenerator::new(
            ollama.clone(),
            openrouter.clone(),
+            llamacpp.clone(),
            sms_client.clone(),
            apollo_client.clone(),
            insight_dao.clone(),
@@ -271,8 +290,6 @@ impl Default for AppState {
            Arc::new(tokio::sync::Mutex::new(std::collections::HashMap::new()));
        let insight_chat = Arc::new(InsightChatService::new(
            Arc::new(insight_generator.clone()),
-            ollama.clone(),
-            openrouter.clone(),
            insight_dao.clone(),
            chat_locks,
        ));
@@ -294,6 +311,8 @@ impl Default for AppState {
            ollama,
            openrouter,
            openrouter_allowed_models,
+            llamacpp,
+            llamacpp_allowed_models,
            sms_client,
            insight_generator,
            insight_chat,
@@ -335,6 +354,37 @@ fn parse_openrouter_allowed_models() -> Vec<String> {
        .collect()
 }

+/// Build a `LlamaCppClient` from environment variables. Returns `None` when
+/// `LLAMA_SWAP_URL` is unset. The client is constructed unconditionally
+/// when the URL is set (so it's available even under `LLM_BACKEND=ollama`
+/// for ad-hoc tooling), but the agentic / chat paths only route through it
+/// when `LLM_BACKEND=llamacpp`. Slot ids default to the names the bundled
+/// `llama-swap/config.yaml` uses — `chat` / `vision` / `embed`.
+fn build_llamacpp_from_env() -> Option<Arc<LlamaCppClient>> {
+    let base_url = env::var("LLAMA_SWAP_URL").ok()?;
+    let primary_model = env::var("LLAMA_SWAP_PRIMARY_MODEL").ok();
+    let mut client = LlamaCppClient::new(Some(base_url), primary_model);
+    if let Ok(model) = env::var("LLAMA_SWAP_EMBEDDING_MODEL") {
+        client.set_embedding_model(model);
+    }
+    if let Ok(model) = env::var("LLAMA_SWAP_VISION_MODEL") {
+        client.set_vision_model(model);
+    }
+    Some(Arc::new(client))
+}
+
+/// Parse `LLAMA_SWAP_ALLOWED_MODELS` (comma-separated) into a vec. Used to
+/// populate the model picker when `LLM_BACKEND=llamacpp` — `/insights/models`
+/// surfaces these slots with capabilities. Empty when unset.
+fn parse_llamacpp_allowed_models() -> Vec<String> {
+    env::var("LLAMA_SWAP_ALLOWED_MODELS")
+        .unwrap_or_default()
+        .split(',')
+        .map(|s| s.trim().to_string())
+        .filter(|s| !s.is_empty())
+        .collect()
+}
+
 #[cfg(test)]
 impl AppState {
    /// Creates an AppState instance for testing with temporary directories
@@ -397,6 +447,7 @@ impl AppState {
        let insight_generator = InsightGenerator::new(
            ollama.clone(),
            None,
+            None,
            sms_client.clone(),
            apollo_client.clone(),
            insight_dao.clone(),
@@ -416,8 +467,6 @@ impl AppState {
            Arc::new(tokio::sync::Mutex::new(std::collections::HashMap::new()));
        let insight_chat = Arc::new(InsightChatService::new(
            Arc::new(insight_generator.clone()),
-            ollama.clone(),
-            None,
            insight_dao.clone(),
            chat_locks,
        ));
@@ -445,6 +494,8 @@ impl AppState {
            ollama,
            None,
            Vec::new(),
+            None,
+            Vec::new(),
            sms_client,
            insight_generator,
            insight_chat,
--- a/src/thumbnails.rs
+++ b/src/thumbnails.rs
@@ -36,12 +36,19 @@ use crate::video::actors::{generate_image_thumbnail_ffmpeg, generate_video_thumb
 /// `size=full` and the handler streams the original bytes.
 pub const LARGE_PREVIEW_MAX_DIM: u32 = 2048;

-/// JPEG quality for the large preview tier. 85 is the conventional
-/// "indistinguishable from source at viewing size" point — well above the
-/// `image` crate's default ~75, but well below quality-90+ territory where
-/// file size doubles for no perceptible win.
+/// JPEG quality for the large and xlarge preview tiers. 85 is the
+/// conventional "indistinguishable from source at viewing size" point —
+/// well above the `image` crate's default ~75, but well below quality-90+
+/// territory where file size doubles for no perceptible win.
 const LARGE_PREVIEW_JPEG_QUALITY: u8 = 85;

+/// Maximum long-edge size (px) for the xlarge preview tier. Bridges the
+/// gap between `large` (2048px, ~16MB decoded) and the original bytes
+/// (potentially 48+ MP / ~192MB decoded). At 4096px the decoded bitmap is
+/// ~64MB — enough for 2-3× pinch-zoom on any phone before the viewer
+/// needs to stream the true original.
+pub const XLARGE_PREVIEW_MAX_DIM: u32 = 4096;
+
 lazy_static! {
    pub static ref IMAGE_GAUGE: IntGauge = IntGauge::new(
        "imageserver_image_total",
@@ -205,6 +212,86 @@ fn generate_large_preview_ffmpeg(src: &Path, dest: &Path) -> std::io::Result<()>
    Ok(())
 }

+/// Build the on-demand xlarge preview for `src` and write it to `dest` as
+/// a JPEG whose long edge is capped at [`XLARGE_PREVIEW_MAX_DIM`].
+///
+/// Same waterfall as [`generate_large_preview`]: an embedded JPEG preview
+/// is used when present, ffmpeg handles sources flagged by
+/// `file_types::needs_ffmpeg_thumbnail`, and anything else is decoded
+/// directly. Images already within the cap keep their native size.
+///
+/// # Errors
+/// Returns `InvalidData` when the chosen image source cannot be decoded,
+/// and forwards failures from the ffmpeg and JPEG-encoding helpers.
+pub fn generate_xlarge_preview(src: &Path, dest: &Path) -> std::io::Result<()> {
+    let invalid = |msg: String| std::io::Error::new(std::io::ErrorKind::InvalidData, msg);
+    // An unreadable orientation is treated as 1.
+    let orientation = exif::read_orientation(src).unwrap_or(1);
+
+    let decoded = match exif::extract_embedded_jpeg_preview(src) {
+        // An embedded preview that fails to decode is an error; there is
+        // no fall-through to the later stages.
+        Some(preview) => image::load_from_memory(&preview)
+            .map_err(|e| invalid(format!("decode embedded preview {:?}: {}", src, e)))?,
+        // ffmpeg output is written as-is; orientation is not applied here.
+        None if file_types::needs_ffmpeg_thumbnail(src) => {
+            return generate_xlarge_preview_ffmpeg(src, dest);
+        }
+        None => image::open(src).map_err(|e| invalid(format!("{:?}: {}", src, e)))?,
+    };
+    encode_xlarge_jpeg(exif::apply_orientation(decoded, orientation), dest)
+}
+
+/// Writes `img` to `dest` as a JPEG, shrinking it to the xlarge cap if needed.
+fn encode_xlarge_jpeg(img: image::DynamicImage, dest: &Path) -> std::io::Result<()> {
+    let (w, h) = img.dimensions();
+    let scaled = if w.max(h) > XLARGE_PREVIEW_MAX_DIM {
+        img.thumbnail(XLARGE_PREVIEW_MAX_DIM, XLARGE_PREVIEW_MAX_DIM)
+    } else {
+        img
+    };
+    let file = std::fs::File::create(dest)
+        .map_err(|e| std::io::Error::other(format!("create {:?}: {}", dest, e)))?;
+    let mut writer = std::io::BufWriter::new(file);
+    JpegEncoder::new_with_quality(&mut writer, LARGE_PREVIEW_JPEG_QUALITY)
+        .encode_image(&scaled)
+        .map_err(|e| std::io::Error::other(format!("encode {:?}: {}", dest, e)))?;
+    // BufWriter's Drop swallows flush errors, so surface them (e.g. disk full).
+    std::io::Write::flush(&mut writer)
+}
+
+/// ffmpeg fallback for the xlarge tier: emits a single frame of `src` to
+/// `dest` as MJPEG, capping the long edge at
+/// [`XLARGE_PREVIEW_MAX_DIM`] without upscaling.
+///
+/// # Errors
+/// Fails if `ffmpeg` cannot be spawned or exits unsuccessfully; the
+/// latter error carries the exit status and trimmed stderr.
+fn generate_xlarge_preview_ffmpeg(src: &Path, dest: &Path) -> std::io::Result<()> {
+    // Only the long edge is clamped (`min` keeps small sources at native
+    // size); `-1` lets ffmpeg derive the other edge from the aspect ratio.
+    let scale_filter = format!(
+        "scale='if(gt(iw,ih),min(iw,{cap}),-1)':'if(gt(iw,ih),-1,min(ih,{cap}))'",
+        cap = XLARGE_PREVIEW_MAX_DIM
+    );
+
+    let mut ffmpeg = Command::new("ffmpeg");
+    // `-y` lets ffmpeg overwrite an existing `dest` without prompting.
+    ffmpeg.arg("-y").arg("-i").arg(src);
+    ffmpeg.args(["-vframes", "1", "-vf", scale_filter.as_str()]);
+    ffmpeg.args(["-q:v", "5", "-f", "image2", "-c:v", "mjpeg"]);
+    ffmpeg.arg(dest);
+
+    let run = ffmpeg.output()?;
+    if run.status.success() {
+        return Ok(());
+    }
+    Err(std::io::Error::other(format!(
+        "ffmpeg failed ({}): {}",
+        run.status,
+        String::from_utf8_lossy(&run.stderr).trim()
+    )))
+}
+
 pub fn create_thumbnails(libs: &[libraries::Library], excluded_dirs: &[String]) {
    let tracer = global_tracer();
    let span = tracer.start("creating thumbnails");