From 0073409b3d0c4a54f4f81e9a06947504f676d0e5 Mon Sep 17 00:00:00 2001 From: Cameron Date: Mon, 20 Apr 2026 22:11:05 -0400 Subject: [PATCH 01/24] refactor: introduce LlmClient trait (no-op) Preparation for a second LLM backend (OpenRouter) and hybrid vision-local / chat-remote mode. Shared wire types (ChatMessage, Tool, ToolCall, etc.) move into a new src/ai/llm_client.rs and are re-exported from ollama.rs so existing imports keep working. OllamaClient now implements LlmClient. No behavior change; callers still hold the concrete OllamaClient. Caller migration to Arc is deferred to the PR that wires hybrid backend routing. Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 1 + Cargo.toml | 1 + src/ai/llm_client.rs | 140 +++++++++++++++++++++++++++++++++++++++++++ src/ai/mod.rs | 7 ++- src/ai/ollama.rs | 140 +++++++++++++++---------------------------- 5 files changed, 197 insertions(+), 92 deletions(-) create mode 100644 src/ai/llm_client.rs diff --git a/Cargo.lock b/Cargo.lock index 4f04521..2e210ea 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1843,6 +1843,7 @@ dependencies = [ "actix-web", "actix-web-prom", "anyhow", + "async-trait", "base64", "bcrypt", "blake3", diff --git a/Cargo.toml b/Cargo.toml index 1e606b0..be60128 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -56,3 +56,4 @@ ical = "0.11" scraper = "0.20" base64 = "0.22" blake3 = "1.5" +async-trait = "0.1" diff --git a/src/ai/llm_client.rs b/src/ai/llm_client.rs new file mode 100644 index 0000000..c1f1bca --- /dev/null +++ b/src/ai/llm_client.rs @@ -0,0 +1,140 @@ +use anyhow::Result; +use async_trait::async_trait; +use serde::{Deserialize, Serialize}; + +/// Provider-agnostic surface for LLM backends (Ollama, OpenRouter, …). 
+/// +/// Impls translate these canonical shapes at the wire boundary: tool-call +/// arguments stay as `serde_json::Value` in memory and are stringified only +/// when a provider requires it (OpenAI-compatible APIs do), and `images` +/// stays as base64 strings here and is rewritten into content-parts where +/// needed. +// First consumer lands in a later PR (OpenRouter impl + hybrid mode routing). +#[allow(dead_code)] +#[async_trait] +pub trait LlmClient: Send + Sync { + /// Single-shot text generation. Optional system prompt and optional + /// base64 images (ignored by providers without vision support). + async fn generate( + &self, + prompt: &str, + system: Option<&str>, + images: Option>, + ) -> Result; + + /// Multi-turn chat with tool definitions. Returns the assistant message + /// (which may contain tool_calls) plus optional prompt/eval token counts. + async fn chat_with_tools( + &self, + messages: Vec, + tools: Vec, + ) -> Result<(ChatMessage, Option, Option)>; + + /// Batch embedding generation. Dimensionality is provider/model specific. + async fn generate_embeddings(&self, texts: &[&str]) -> Result>>; + + /// One-shot vision description of an image. Used to convert images into + /// plain text for the hybrid-mode conversation flow. + async fn describe_image(&self, image_base64: &str) -> Result; + + /// Enumerate available models with their capabilities. + async fn list_models(&self) -> Result>; + + /// Look up capabilities for a single model. + async fn model_capabilities(&self, model: &str) -> Result; + + /// Primary model identifier this client was constructed with. + fn primary_model(&self) -> &str; +} + +/// Tool definition sent to the model (OpenAI-compatible function schema). 
+#[derive(Serialize, Clone, Debug)] +pub struct Tool { + #[serde(rename = "type")] + pub tool_type: String, // always "function" + pub function: ToolFunction, +} + +#[derive(Serialize, Clone, Debug)] +pub struct ToolFunction { + pub name: String, + pub description: String, + pub parameters: serde_json::Value, +} + +impl Tool { + pub fn function(name: &str, description: &str, parameters: serde_json::Value) -> Self { + Self { + tool_type: "function".to_string(), + function: ToolFunction { + name: name.to_string(), + description: description.to_string(), + parameters, + }, + } + } +} + +/// A message in the chat conversation history. +#[derive(Serialize, Deserialize, Clone, Debug)] +pub struct ChatMessage { + pub role: String, // "system" | "user" | "assistant" | "tool" + /// Empty string (not null) when tool_calls is present — Ollama quirk. + #[serde(default)] + pub content: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub tool_calls: Option>, + /// Base64 images — only on user messages to vision-capable models. + #[serde(skip_serializing_if = "Option::is_none")] + pub images: Option>, +} + +impl ChatMessage { + pub fn system(content: impl Into) -> Self { + Self { + role: "system".to_string(), + content: content.into(), + tool_calls: None, + images: None, + } + } + pub fn user(content: impl Into) -> Self { + Self { + role: "user".to_string(), + content: content.into(), + tool_calls: None, + images: None, + } + } + pub fn tool_result(content: impl Into) -> Self { + Self { + role: "tool".to_string(), + content: content.into(), + tool_calls: None, + images: None, + } + } +} + +/// Tool call returned by the model in an assistant message. +#[derive(Serialize, Deserialize, Clone, Debug)] +pub struct ToolCall { + pub function: ToolCallFunction, + #[serde(skip_serializing_if = "Option::is_none")] + pub id: Option, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +pub struct ToolCallFunction { + pub name: String, + /// Canonical shape: native JSON. 
Providers that use JSON-encoded-string + /// arguments (OpenAI-compatible) translate at their wire boundary. + pub arguments: serde_json::Value, +} + +#[derive(Serialize, Deserialize, Clone, Debug)] +pub struct ModelCapabilities { + pub name: String, + pub has_vision: bool, + pub has_tool_calling: bool, +} diff --git a/src/ai/mod.rs b/src/ai/mod.rs index 4e682fb..f60e553 100644 --- a/src/ai/mod.rs +++ b/src/ai/mod.rs @@ -1,6 +1,7 @@ pub mod daily_summary_job; pub mod handlers; pub mod insight_generator; +pub mod llm_client; pub mod ollama; pub mod sms_client; @@ -13,5 +14,9 @@ pub use handlers::{ get_insight_handler, rate_insight_handler, }; pub use insight_generator::InsightGenerator; -pub use ollama::{ModelCapabilities, OllamaClient}; +#[allow(unused_imports)] +pub use llm_client::{ + ChatMessage, LlmClient, ModelCapabilities, Tool, ToolCall, ToolCallFunction, ToolFunction, +}; +pub use ollama::OllamaClient; pub use sms_client::{SmsApiClient, SmsMessage}; diff --git a/src/ai/ollama.rs b/src/ai/ollama.rs index 184bc61..2cc2cfa 100644 --- a/src/ai/ollama.rs +++ b/src/ai/ollama.rs @@ -1,4 +1,5 @@ use anyhow::{Context, Result}; +use async_trait::async_trait; use chrono::NaiveDate; use reqwest::Client; use serde::{Deserialize, Serialize}; @@ -6,6 +7,14 @@ use std::collections::HashMap; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; +use crate::ai::llm_client::LlmClient; + +// Re-export shared types so existing `crate::ai::ollama::{...}` imports +// continue to resolve. 
+pub use crate::ai::llm_client::{ChatMessage, ModelCapabilities, Tool}; +#[allow(unused_imports)] +pub use crate::ai::llm_client::{ToolCall, ToolCallFunction, ToolFunction}; + // Cache duration: 15 minutes const CACHE_DURATION_SECS: u64 = 15 * 60; @@ -818,6 +827,46 @@ Analyze the image and use specific details from both the visual content and the } } +#[async_trait] +impl LlmClient for OllamaClient { + async fn generate( + &self, + prompt: &str, + system: Option<&str>, + images: Option>, + ) -> Result { + self.generate_with_images(prompt, system, images).await + } + + async fn chat_with_tools( + &self, + messages: Vec, + tools: Vec, + ) -> Result<(ChatMessage, Option, Option)> { + OllamaClient::chat_with_tools(self, messages, tools).await + } + + async fn generate_embeddings(&self, texts: &[&str]) -> Result>> { + OllamaClient::generate_embeddings(self, texts).await + } + + async fn describe_image(&self, image_base64: &str) -> Result { + self.generate_photo_description(image_base64).await + } + + async fn list_models(&self) -> Result> { + Self::list_models_with_capabilities(&self.primary_url).await + } + + async fn model_capabilities(&self, model: &str) -> Result { + Self::check_model_capabilities(&self.primary_url, model).await + } + + fn primary_model(&self) -> &str { + &self.primary_model + } +} + #[derive(Serialize)] struct OllamaRequest { model: String, @@ -845,90 +894,6 @@ struct OllamaOptions { min_p: Option, } -/// Tool definition sent in /api/chat requests (OpenAI-compatible format) -#[derive(Serialize, Clone, Debug)] -pub struct Tool { - #[serde(rename = "type")] - pub tool_type: String, // always "function" - pub function: ToolFunction, -} - -#[derive(Serialize, Clone, Debug)] -pub struct ToolFunction { - pub name: String, - pub description: String, - pub parameters: serde_json::Value, -} - -impl Tool { - pub fn function(name: &str, description: &str, parameters: serde_json::Value) -> Self { - Self { - tool_type: "function".to_string(), - function: 
ToolFunction { - name: name.to_string(), - description: description.to_string(), - parameters, - }, - } - } -} - -/// A message in the chat conversation history -#[derive(Serialize, Deserialize, Clone, Debug)] -pub struct ChatMessage { - pub role: String, // "system" | "user" | "assistant" | "tool" - /// Empty string (not null) when tool_calls is present — Ollama quirk - #[serde(default)] - pub content: String, - #[serde(skip_serializing_if = "Option::is_none")] - pub tool_calls: Option>, - /// Base64 images — only on user messages to vision-capable models - #[serde(skip_serializing_if = "Option::is_none")] - pub images: Option>, -} - -impl ChatMessage { - pub fn system(content: impl Into) -> Self { - Self { - role: "system".to_string(), - content: content.into(), - tool_calls: None, - images: None, - } - } - pub fn user(content: impl Into) -> Self { - Self { - role: "user".to_string(), - content: content.into(), - tool_calls: None, - images: None, - } - } - pub fn tool_result(content: impl Into) -> Self { - Self { - role: "tool".to_string(), - content: content.into(), - tool_calls: None, - images: None, - } - } -} - -/// Tool call returned by the model in an assistant message -#[derive(Serialize, Deserialize, Clone, Debug)] -pub struct ToolCall { - pub function: ToolCallFunction, - #[serde(skip_serializing_if = "Option::is_none")] - pub id: Option, -} - -#[derive(Serialize, Deserialize, Clone, Debug)] -pub struct ToolCallFunction { - pub name: String, - /// Native JSON object (NOT a JSON-encoded string like OpenAI) - pub arguments: serde_json::Value, -} - #[derive(Serialize)] struct OllamaChatRequest<'a> { model: &'a str, @@ -975,13 +940,6 @@ struct OllamaShowResponse { capabilities: Vec, } -#[derive(Serialize, Deserialize, Clone, Debug)] -pub struct ModelCapabilities { - pub name: String, - pub has_vision: bool, - pub has_tool_calling: bool, -} - #[derive(Serialize)] struct OllamaBatchEmbedRequest { model: String, -- 2.49.1 From 
e799ba716c500dcd1ca0f5179a6e5b18a6a7bd85 Mon Sep 17 00:00:00 2001 From: Cameron Date: Mon, 20 Apr 2026 22:18:29 -0400 Subject: [PATCH 02/24] feat(ai): add OpenRouterClient implementing LlmClient MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit OpenAI-compatible client for OpenRouter. Translates canonical wire shapes at the boundary: tool-call arguments stringify on send / parse on receive (accepting both string and native-object forms); images rewritten from the base64 images field into content-parts with image_url entries; role=tool messages inherit tool_call_id from the preceding assistant's tool calls. /models parsed into ModelCapabilities via supported_parameters (tool use) and architecture.input_modalities (vision). 15-minute capabilities cache. Bearer auth; HTTP-Referer / X-Title attribution headers optional. Not wired into request routing yet — first consumer arrives with hybrid backend mode. 11 unit tests cover the translation helpers. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ai/mod.rs | 1 + src/ai/openrouter.rs | 727 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 728 insertions(+) create mode 100644 src/ai/openrouter.rs diff --git a/src/ai/mod.rs b/src/ai/mod.rs index f60e553..5414e69 100644 --- a/src/ai/mod.rs +++ b/src/ai/mod.rs @@ -3,6 +3,7 @@ pub mod handlers; pub mod insight_generator; pub mod llm_client; pub mod ollama; +pub mod openrouter; pub mod sms_client; // strip_summary_boilerplate is used by binaries (test_daily_summary), not the library diff --git a/src/ai/openrouter.rs b/src/ai/openrouter.rs new file mode 100644 index 0000000..2c46852 --- /dev/null +++ b/src/ai/openrouter.rs @@ -0,0 +1,727 @@ +// First consumer lands in a later PR (hybrid backend routing). Tests exercise +// the translation helpers directly. 
+#![allow(dead_code)] + +use anyhow::{Context, Result, anyhow, bail}; +use async_trait::async_trait; +use reqwest::Client; +use serde::Deserialize; +use serde_json::{Value, json}; +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; +use std::time::{Duration, Instant}; + +use crate::ai::llm_client::{ + ChatMessage, LlmClient, ModelCapabilities, Tool, ToolCall, ToolCallFunction, +}; + +const DEFAULT_BASE_URL: &str = "https://openrouter.ai/api/v1"; +const DEFAULT_EMBEDDING_MODEL: &str = "openai/text-embedding-3-small"; +const CACHE_DURATION_SECS: u64 = 15 * 60; + +#[derive(Clone)] +struct CachedEntry { + data: T, + cached_at: Instant, +} + +impl CachedEntry { + fn new(data: T) -> Self { + Self { + data, + cached_at: Instant::now(), + } + } + + fn is_expired(&self) -> bool { + self.cached_at.elapsed().as_secs() > CACHE_DURATION_SECS + } +} + +lazy_static::lazy_static! { + static ref MODEL_CAPABILITIES_CACHE: Arc>>>> = + Arc::new(Mutex::new(HashMap::new())); +} + +/// OpenAI-compatible client for OpenRouter (https://openrouter.ai). +/// +/// Translates canonical `ChatMessage` / `Tool` shapes to OpenAI wire format: +/// - Tool-call `arguments` serialized as JSON-encoded strings (vs Ollama's +/// native JSON). +/// - Image content rewritten into content-parts array with `image_url` entries. +/// - `role=tool` messages attach a `tool_call_id` inferred from the preceding +/// assistant turn's tool call. +#[derive(Clone)] +pub struct OpenRouterClient { + client: Client, + pub api_key: String, + pub base_url: String, + pub primary_model: String, + pub embedding_model: String, + num_ctx: Option, + temperature: Option, + top_p: Option, + top_k: Option, + min_p: Option, + /// Optional `HTTP-Referer` header OpenRouter uses for attribution. + pub referer: Option, + /// Optional `X-Title` header OpenRouter uses for attribution. 
+ pub app_title: Option, +} + +impl OpenRouterClient { + pub fn new(api_key: String, base_url: Option, primary_model: String) -> Self { + Self { + client: Client::builder() + .connect_timeout(Duration::from_secs(10)) + .timeout(Duration::from_secs(180)) + .build() + .unwrap_or_else(|_| Client::new()), + api_key, + base_url: base_url.unwrap_or_else(|| DEFAULT_BASE_URL.to_string()), + primary_model, + embedding_model: DEFAULT_EMBEDDING_MODEL.to_string(), + num_ctx: None, + temperature: None, + top_p: None, + top_k: None, + min_p: None, + referer: None, + app_title: None, + } + } + + pub fn set_embedding_model(&mut self, model: String) { + self.embedding_model = model; + } + + #[allow(dead_code)] + pub fn set_num_ctx(&mut self, num_ctx: Option) { + self.num_ctx = num_ctx; + } + + #[allow(dead_code)] + pub fn set_sampling_params( + &mut self, + temperature: Option, + top_p: Option, + top_k: Option, + min_p: Option, + ) { + self.temperature = temperature; + self.top_p = top_p; + self.top_k = top_k; + self.min_p = min_p; + } + + pub fn set_attribution(&mut self, referer: Option, app_title: Option) { + self.referer = referer; + self.app_title = app_title; + } + + fn authed(&self, builder: reqwest::RequestBuilder) -> reqwest::RequestBuilder { + let mut b = builder.bearer_auth(&self.api_key); + if let Some(r) = &self.referer { + b = b.header("HTTP-Referer", r); + } + if let Some(t) = &self.app_title { + b = b.header("X-Title", t); + } + b + } + + /// Translate canonical messages to the OpenAI-compatible wire shape. + /// + /// Walks in order so it can attach `tool_call_id` to `role=tool` messages + /// based on the most recent assistant turn's tool call. 
+ fn messages_to_openai(messages: &[ChatMessage]) -> Vec { + let mut out = Vec::with_capacity(messages.len()); + let mut last_tool_call_ids: Vec = Vec::new(); + let mut next_tool_result_idx: usize = 0; + + for msg in messages { + let mut obj = serde_json::Map::new(); + obj.insert("role".into(), Value::String(msg.role.clone())); + + // Content: string OR content-parts array (when images present). + match &msg.images { + Some(images) if !images.is_empty() => { + let mut parts: Vec = Vec::new(); + if !msg.content.is_empty() { + parts.push(json!({"type": "text", "text": msg.content})); + } + for img in images { + let url = image_to_data_url(img); + parts.push(json!({ + "type": "image_url", + "image_url": { "url": url } + })); + } + obj.insert("content".into(), Value::Array(parts)); + } + _ => { + obj.insert("content".into(), Value::String(msg.content.clone())); + } + } + + // Assistant message with tool_calls: stringify arguments, remember + // the ids so the subsequent tool messages can reference them. + if let Some(tcs) = &msg.tool_calls + && msg.role == "assistant" + { + let converted: Vec = tcs + .iter() + .enumerate() + .map(|(i, call)| { + let id = call.id.clone().unwrap_or_else(|| format!("call_{}", i)); + let args_str = serde_json::to_string(&call.function.arguments) + .unwrap_or_else(|_| "{}".to_string()); + json!({ + "id": id, + "type": "function", + "function": { + "name": call.function.name, + "arguments": args_str, + } + }) + }) + .collect(); + last_tool_call_ids = converted + .iter() + .filter_map(|v| v.get("id").and_then(|x| x.as_str()).map(String::from)) + .collect(); + next_tool_result_idx = 0; + obj.insert("tool_calls".into(), Value::Array(converted)); + } + + // Tool result messages: attach tool_call_id from the last assistant turn. 
+ if msg.role == "tool" { + let id = last_tool_call_ids + .get(next_tool_result_idx) + .cloned() + .unwrap_or_else(|| "call_0".to_string()); + obj.insert("tool_call_id".into(), Value::String(id)); + next_tool_result_idx += 1; + } + + out.push(Value::Object(obj)); + } + + out + } + + /// Parse an OpenAI-compatible assistant message back into canonical shape. + fn openai_message_to_chat(msg: &Value) -> Result { + let obj = msg + .as_object() + .ok_or_else(|| anyhow!("response message is not an object"))?; + let role = obj + .get("role") + .and_then(|v| v.as_str()) + .unwrap_or("assistant") + .to_string(); + let content = obj + .get("content") + .and_then(|v| v.as_str()) + .unwrap_or("") + .to_string(); + + let tool_calls = if let Some(tcs) = obj.get("tool_calls").and_then(|v| v.as_array()) { + let mut parsed = Vec::with_capacity(tcs.len()); + for tc in tcs { + let id = tc.get("id").and_then(|v| v.as_str()).map(String::from); + let function = tc + .get("function") + .ok_or_else(|| anyhow!("tool_call missing function field"))?; + let name = function + .get("name") + .and_then(|v| v.as_str()) + .unwrap_or_default() + .to_string(); + let args_value = match function.get("arguments") { + // OpenAI-compat: stringified JSON. + Some(Value::String(s)) => { + serde_json::from_str::(s).unwrap_or_else(|_| json!({})) + } + // Some providers emit arguments as an object directly — accept both. 
+ Some(v @ Value::Object(_)) => v.clone(), + _ => json!({}), + }; + parsed.push(ToolCall { + id, + function: ToolCallFunction { + name, + arguments: args_value, + }, + }); + } + Some(parsed) + } else { + None + }; + + Ok(ChatMessage { + role, + content, + tool_calls, + images: None, + }) + } + + fn build_options(&self) -> Vec<(&'static str, Value)> { + let mut v = Vec::new(); + if let Some(t) = self.temperature { + v.push(("temperature", json!(t))); + } + if let Some(p) = self.top_p { + v.push(("top_p", json!(p))); + } + if let Some(k) = self.top_k { + v.push(("top_k", json!(k))); + } + if let Some(m) = self.min_p { + v.push(("min_p", json!(m))); + } + if let Some(c) = self.num_ctx { + // OpenAI uses max_tokens for generation bound; num_ctx isn't + // directly transferable. Skip rather than silently mis-map. + let _ = c; + } + v + } +} + +#[async_trait] +impl LlmClient for OpenRouterClient { + async fn generate( + &self, + prompt: &str, + system: Option<&str>, + images: Option>, + ) -> Result { + let mut messages: Vec = Vec::new(); + if let Some(sys) = system { + messages.push(ChatMessage::system(sys)); + } + let mut user = ChatMessage::user(prompt); + user.images = images; + messages.push(user); + + let (reply, _, _) = self.chat_with_tools(messages, Vec::new()).await?; + Ok(reply.content) + } + + async fn chat_with_tools( + &self, + messages: Vec, + tools: Vec, + ) -> Result<(ChatMessage, Option, Option)> { + let url = format!("{}/chat/completions", self.base_url); + let mut body = serde_json::Map::new(); + body.insert("model".into(), Value::String(self.primary_model.clone())); + body.insert( + "messages".into(), + Value::Array(Self::messages_to_openai(&messages)), + ); + body.insert("stream".into(), Value::Bool(false)); + if !tools.is_empty() { + body.insert( + "tools".into(), + serde_json::to_value(&tools).context("serializing tools")?, + ); + } + for (k, v) in self.build_options() { + body.insert(k.into(), v); + } + + log::info!( + "OpenRouter chat_with_tools: 
model={} messages={} tools={}", + self.primary_model, + messages.len(), + tools.len() + ); + + let resp = self + .authed(self.client.post(&url)) + .json(&Value::Object(body)) + .send() + .await + .with_context(|| format!("POST {} failed", url))?; + + if !resp.status().is_success() { + let status = resp.status(); + let body = resp.text().await.unwrap_or_default(); + bail!("OpenRouter chat request failed: {} — {}", status, body); + } + + let parsed: Value = resp.json().await.context("parsing chat response")?; + let choice = parsed + .get("choices") + .and_then(|v| v.as_array()) + .and_then(|a| a.first()) + .ok_or_else(|| anyhow!("response missing choices[0]"))?; + let msg = choice + .get("message") + .ok_or_else(|| anyhow!("choices[0] missing message"))?; + let chat_msg = Self::openai_message_to_chat(msg)?; + + let usage = parsed.get("usage"); + let prompt_tokens = usage + .and_then(|u| u.get("prompt_tokens")) + .and_then(|v| v.as_i64()) + .map(|n| n as i32); + let completion_tokens = usage + .and_then(|u| u.get("completion_tokens")) + .and_then(|v| v.as_i64()) + .map(|n| n as i32); + + Ok((chat_msg, prompt_tokens, completion_tokens)) + } + + async fn generate_embeddings(&self, texts: &[&str]) -> Result>> { + let url = format!("{}/embeddings", self.base_url); + let body = json!({ + "model": self.embedding_model, + "input": texts, + }); + + let resp = self + .authed(self.client.post(&url)) + .json(&body) + .send() + .await + .with_context(|| format!("POST {} failed", url))?; + + if !resp.status().is_success() { + let status = resp.status(); + let body = resp.text().await.unwrap_or_default(); + bail!("OpenRouter embedding request failed: {} — {}", status, body); + } + + #[derive(Deserialize)] + struct EmbedResponse { + data: Vec, + } + #[derive(Deserialize)] + struct EmbedItem { + embedding: Vec, + } + + let parsed: EmbedResponse = resp.json().await.context("parsing embed response")?; + Ok(parsed.data.into_iter().map(|i| i.embedding).collect()) + } + + async fn 
describe_image(&self, image_base64: &str) -> Result { + let prompt = "Briefly describe what you see in this image in 1-2 sentences. \ + Focus on the people, location, and activity."; + self.generate( + prompt, + Some("You are a scene description assistant. Be concise and factual."), + Some(vec![image_base64.to_string()]), + ) + .await + } + + async fn list_models(&self) -> Result> { + { + let cache = MODEL_CAPABILITIES_CACHE.lock().unwrap(); + if let Some(entry) = cache.get(&self.base_url) + && !entry.is_expired() + { + return Ok(entry.data.clone()); + } + } + + let url = format!("{}/models", self.base_url); + let resp = self + .authed(self.client.get(&url)) + .send() + .await + .with_context(|| format!("GET {} failed", url))?; + + if !resp.status().is_success() { + let status = resp.status(); + let body = resp.text().await.unwrap_or_default(); + bail!("OpenRouter list_models failed: {} — {}", status, body); + } + + let parsed: Value = resp.json().await.context("parsing models response")?; + let data = parsed + .get("data") + .and_then(|v| v.as_array()) + .ok_or_else(|| anyhow!("models response missing data[]"))?; + + let caps: Vec = data.iter().map(parse_model_capabilities).collect(); + + { + let mut cache = MODEL_CAPABILITIES_CACHE.lock().unwrap(); + cache.insert(self.base_url.clone(), CachedEntry::new(caps.clone())); + } + + Ok(caps) + } + + async fn model_capabilities(&self, model: &str) -> Result { + let all = self.list_models().await?; + all.into_iter() + .find(|m| m.name == model) + .ok_or_else(|| anyhow!("model '{}' not found on OpenRouter", model)) + } + + fn primary_model(&self) -> &str { + &self.primary_model + } +} + +/// Build a `data:` URL if the provided string is raw base64, otherwise pass it through. 
+fn image_to_data_url(img: &str) -> String { + if img.starts_with("data:") { + img.to_string() + } else { + format!("data:image/jpeg;base64,{}", img) + } +} + +fn parse_model_capabilities(m: &Value) -> ModelCapabilities { + let name = m + .get("id") + .and_then(|v| v.as_str()) + .unwrap_or_default() + .to_string(); + let has_tool_calling = m + .get("supported_parameters") + .and_then(|v| v.as_array()) + .map(|arr| arr.iter().any(|x| x.as_str() == Some("tools"))) + .unwrap_or(false); + let has_vision = m + .get("architecture") + .and_then(|v| v.get("input_modalities")) + .and_then(|v| v.as_array()) + .map(|arr| arr.iter().any(|x| x.as_str() == Some("image"))) + .unwrap_or(false); + ModelCapabilities { + name, + has_vision, + has_tool_calling, + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn tool_call_arguments_stringified_on_send() { + let mut msg = ChatMessage { + role: "assistant".into(), + content: String::new(), + tool_calls: Some(vec![ToolCall { + id: Some("call_abc".into()), + function: ToolCallFunction { + name: "search_sms".into(), + arguments: json!({"query": "hello", "limit": 5}), + }, + }]), + images: None, + }; + msg.tool_calls.as_mut().unwrap()[0].function.arguments = + json!({"query": "hello", "limit": 5}); + + let wire = OpenRouterClient::messages_to_openai(&[msg]); + let tcs = wire[0] + .get("tool_calls") + .and_then(|v| v.as_array()) + .expect("tool_calls present"); + let args = tcs[0] + .get("function") + .and_then(|f| f.get("arguments")) + .and_then(|a| a.as_str()) + .expect("arguments stringified"); + let parsed: Value = serde_json::from_str(args).unwrap(); + assert_eq!(parsed["query"], "hello"); + assert_eq!(parsed["limit"], 5); + } + + #[test] + fn tool_call_arguments_parsed_on_receive() { + let response_msg = json!({ + "role": "assistant", + "content": "", + "tool_calls": [{ + "id": "call_xyz", + "type": "function", + "function": { + "name": "get_weather", + "arguments": "{\"city\":\"Boston\",\"units\":\"celsius\"}" + } + 
}] + }); + + let parsed = OpenRouterClient::openai_message_to_chat(&response_msg).unwrap(); + let tcs = parsed.tool_calls.unwrap(); + assert_eq!(tcs.len(), 1); + assert_eq!(tcs[0].function.name, "get_weather"); + assert_eq!(tcs[0].function.arguments["city"], "Boston"); + assert_eq!(tcs[0].function.arguments["units"], "celsius"); + assert_eq!(tcs[0].id.as_deref(), Some("call_xyz")); + } + + #[test] + fn tool_call_arguments_accept_native_json_on_receive() { + // Some providers return arguments as an object directly; accept both. + let response_msg = json!({ + "role": "assistant", + "content": "", + "tool_calls": [{ + "id": "call_1", + "type": "function", + "function": { + "name": "foo", + "arguments": {"nested": {"k": 1}} + } + }] + }); + let parsed = OpenRouterClient::openai_message_to_chat(&response_msg).unwrap(); + let tc = &parsed.tool_calls.unwrap()[0]; + assert_eq!(tc.function.arguments["nested"]["k"], 1); + } + + #[test] + fn images_become_content_parts() { + let mut msg = ChatMessage::user("What is in this photo?"); + msg.images = Some(vec!["BASE64DATA".into()]); + + let wire = OpenRouterClient::messages_to_openai(&[msg]); + let content = wire[0].get("content").and_then(|v| v.as_array()).unwrap(); + assert_eq!(content.len(), 2); + assert_eq!(content[0]["type"], "text"); + assert_eq!(content[0]["text"], "What is in this photo?"); + assert_eq!(content[1]["type"], "image_url"); + assert_eq!( + content[1]["image_url"]["url"], + "data:image/jpeg;base64,BASE64DATA" + ); + } + + #[test] + fn data_url_images_pass_through_unchanged() { + let mut msg = ChatMessage::user(""); + msg.images = Some(vec!["data:image/png;base64,ABCDEF".into()]); + let wire = OpenRouterClient::messages_to_openai(&[msg]); + let content = wire[0].get("content").and_then(|v| v.as_array()).unwrap(); + // No text part when content is empty. 
+ assert_eq!(content.len(), 1); + assert_eq!( + content[0]["image_url"]["url"], + "data:image/png;base64,ABCDEF" + ); + } + + #[test] + fn text_only_message_stays_string() { + let msg = ChatMessage::user("hello"); + let wire = OpenRouterClient::messages_to_openai(&[msg]); + assert_eq!(wire[0]["content"], "hello"); + assert!(wire[0]["content"].as_str().is_some()); + } + + #[test] + fn tool_result_inherits_tool_call_id_from_prior_assistant() { + let assistant = ChatMessage { + role: "assistant".into(), + content: String::new(), + tool_calls: Some(vec![ToolCall { + id: Some("call_42".into()), + function: ToolCallFunction { + name: "lookup".into(), + arguments: json!({}), + }, + }]), + images: None, + }; + let tool_result = ChatMessage::tool_result("found it"); + + let wire = OpenRouterClient::messages_to_openai(&[assistant, tool_result]); + assert_eq!(wire[1]["role"], "tool"); + assert_eq!(wire[1]["tool_call_id"], "call_42"); + } + + #[test] + fn multiple_tool_results_map_to_sequential_call_ids() { + let assistant = ChatMessage { + role: "assistant".into(), + content: String::new(), + tool_calls: Some(vec![ + ToolCall { + id: Some("call_A".into()), + function: ToolCallFunction { + name: "a".into(), + arguments: json!({}), + }, + }, + ToolCall { + id: Some("call_B".into()), + function: ToolCallFunction { + name: "b".into(), + arguments: json!({}), + }, + }, + ]), + images: None, + }; + let r1 = ChatMessage::tool_result("a result"); + let r2 = ChatMessage::tool_result("b result"); + + let wire = OpenRouterClient::messages_to_openai(&[assistant, r1, r2]); + assert_eq!(wire[1]["tool_call_id"], "call_A"); + assert_eq!(wire[2]["tool_call_id"], "call_B"); + } + + #[test] + fn missing_tool_call_id_gets_synthetic_fallback() { + let assistant = ChatMessage { + role: "assistant".into(), + content: String::new(), + tool_calls: Some(vec![ToolCall { + id: None, + function: ToolCallFunction { + name: "noid".into(), + arguments: json!({}), + }, + }]), + images: None, + }; + let wire 
= OpenRouterClient::messages_to_openai(&[assistant]); + let tcs = wire[0] + .get("tool_calls") + .and_then(|v| v.as_array()) + .unwrap(); + assert_eq!(tcs[0]["id"], "call_0"); + } + + #[test] + fn parse_model_capabilities_extracts_tools_and_vision() { + let m = json!({ + "id": "anthropic/claude-sonnet-4", + "supported_parameters": ["temperature", "top_p", "tools", "max_tokens"], + "architecture": { + "input_modalities": ["text", "image"] + } + }); + let caps = parse_model_capabilities(&m); + assert_eq!(caps.name, "anthropic/claude-sonnet-4"); + assert!(caps.has_tool_calling); + assert!(caps.has_vision); + } + + #[test] + fn parse_model_capabilities_handles_missing_fields() { + let m = json!({ + "id": "some/text-only-model" + }); + let caps = parse_model_capabilities(&m); + assert_eq!(caps.name, "some/text-only-model"); + assert!(!caps.has_tool_calling); + assert!(!caps.has_vision); + } +} -- 2.49.1 From 3ac0cd62ebe01e98d24eb21074207a30f4e7076e Mon Sep 17 00:00:00 2001 From: Cameron Date: Mon, 20 Apr 2026 22:30:40 -0400 Subject: [PATCH 03/24] feat(ai): hybrid backend mode for agentic insights MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a `backend` column to photo_insights (default 'local', migration 2026-04-20-000000) and a corresponding optional `backend` field on the agentic request. When a request sets backend=hybrid: - The local Ollama vision model is called once via describe_image to produce a text description. - The description is inlined into the first user message as text — no base64 image is ever sent to the chat model. - The agentic tool-calling loop and title generation route through an OpenRouterClient (dispatched via &dyn LlmClient), letting the user pick any tool-capable model from OpenRouter per request. - describe_photo is removed from the offered tools since the description is already present. Embeddings and vision stay on local Ollama regardless of backend. 
Hybrid mode requires OPENROUTER_API_KEY; handlers return a clear error when hybrid is requested without it, and also when the selected OpenRouter model lacks tool-calling support. AppState gains an optional openrouter client built from OPENROUTER_API_KEY / OPENROUTER_BASE_URL / OPENROUTER_DEFAULT_MODEL / OPENROUTER_EMBEDDING_MODEL / attribution headers. Default model is anthropic/claude-sonnet-4. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../down.sql | 23 ++ .../up.sql | 1 + src/ai/handlers.rs | 13 + src/ai/insight_generator.rs | 363 +++++++++++++----- src/bin/populate_knowledge.rs | 2 + src/database/models.rs | 4 + src/database/schema.rs | 1 + src/state.rs | 35 ++ 8 files changed, 342 insertions(+), 100 deletions(-) create mode 100644 migrations/2026-04-20-000000_add_backend_to_insights/down.sql create mode 100644 migrations/2026-04-20-000000_add_backend_to_insights/up.sql diff --git a/migrations/2026-04-20-000000_add_backend_to_insights/down.sql b/migrations/2026-04-20-000000_add_backend_to_insights/down.sql new file mode 100644 index 0000000..cb8864d --- /dev/null +++ b/migrations/2026-04-20-000000_add_backend_to_insights/down.sql @@ -0,0 +1,23 @@ +-- SQLite can't DROP COLUMN cleanly on older versions; rebuild the table. 
+CREATE TABLE photo_insights_backup AS + SELECT id, library_id, rel_path, title, summary, generated_at, model_version, + is_current, training_messages, approved + FROM photo_insights; +DROP TABLE photo_insights; +CREATE TABLE photo_insights ( + id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT, + library_id INTEGER NOT NULL REFERENCES libraries(id), + rel_path TEXT NOT NULL, + title TEXT NOT NULL, + summary TEXT NOT NULL, + generated_at BIGINT NOT NULL, + model_version TEXT NOT NULL, + is_current BOOLEAN NOT NULL DEFAULT TRUE, + training_messages TEXT, + approved BOOLEAN +); +INSERT INTO photo_insights + SELECT id, library_id, rel_path, title, summary, generated_at, model_version, + is_current, training_messages, approved + FROM photo_insights_backup; +DROP TABLE photo_insights_backup; diff --git a/migrations/2026-04-20-000000_add_backend_to_insights/up.sql b/migrations/2026-04-20-000000_add_backend_to_insights/up.sql new file mode 100644 index 0000000..520c209 --- /dev/null +++ b/migrations/2026-04-20-000000_add_backend_to_insights/up.sql @@ -0,0 +1 @@ +ALTER TABLE photo_insights ADD COLUMN backend TEXT NOT NULL DEFAULT 'local'; diff --git a/src/ai/handlers.rs b/src/ai/handlers.rs index abf2369..038470d 100644 --- a/src/ai/handlers.rs +++ b/src/ai/handlers.rs @@ -28,6 +28,10 @@ pub struct GeneratePhotoInsightRequest { pub top_k: Option, #[serde(default)] pub min_p: Option, + /// `"local"` (default, Ollama with images) | `"hybrid"` (local vision + + /// OpenRouter chat). Only respected by the agentic endpoint. 
+ #[serde(default)] + pub backend: Option, } #[derive(Debug, Deserialize)] @@ -65,6 +69,7 @@ pub struct PhotoInsightResponse { pub eval_count: Option, #[serde(skip_serializing_if = "Option::is_none")] pub approved: Option, + pub backend: String, } #[derive(Debug, Serialize)] @@ -187,6 +192,7 @@ pub async fn get_insight_handler( prompt_eval_count: None, eval_count: None, approved: insight.approved, + backend: insight.backend, }; HttpResponse::Ok().json(response) } @@ -254,6 +260,7 @@ pub async fn get_all_insights_handler( prompt_eval_count: None, eval_count: None, approved: insight.approved, + backend: insight.backend, }) .collect(); @@ -309,6 +316,10 @@ pub async fn generate_agentic_insight_handler( max_iterations ); + if let Some(ref b) = request.backend { + span.set_attribute(KeyValue::new("backend", b.clone())); + } + let result = insight_generator .generate_agentic_insight_for_photo( &normalized_path, @@ -320,6 +331,7 @@ pub async fn generate_agentic_insight_handler( request.top_k, request.min_p, max_iterations, + request.backend.clone(), ) .await; @@ -341,6 +353,7 @@ pub async fn generate_agentic_insight_handler( prompt_eval_count, eval_count, approved: insight.approved, + backend: insight.backend, }; HttpResponse::Ok().json(response) } diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index 18e50c7..292324c 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -9,7 +9,9 @@ use std::fs::File; use std::io::Cursor; use std::sync::{Arc, Mutex}; +use crate::ai::llm_client::LlmClient; use crate::ai::ollama::{ChatMessage, OllamaClient, Tool}; +use crate::ai::openrouter::OpenRouterClient; use crate::ai::sms_client::SmsApiClient; use crate::database::models::InsertPhotoInsight; use crate::database::{ @@ -39,6 +41,9 @@ struct NominatimAddress { #[derive(Clone)] pub struct InsightGenerator { ollama: OllamaClient, + /// Optional OpenRouter client, used when `backend=hybrid` is requested. 
+ /// `None` when `OPENROUTER_API_KEY` is not configured. + openrouter: Option>, sms_client: SmsApiClient, insight_dao: Arc>>, exif_dao: Arc>>, @@ -59,6 +64,7 @@ pub struct InsightGenerator { impl InsightGenerator { pub fn new( ollama: OllamaClient, + openrouter: Option>, sms_client: SmsApiClient, insight_dao: Arc>>, exif_dao: Arc>>, @@ -72,6 +78,7 @@ impl InsightGenerator { ) -> Self { Self { ollama, + openrouter, sms_client, insight_dao, exif_dao, @@ -1218,6 +1225,7 @@ impl InsightGenerator { model_version: ollama_client.primary_model.clone(), is_current: true, training_messages: None, + backend: "local".to_string(), }; let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao"); @@ -2376,6 +2384,14 @@ Return ONLY the summary, nothing else."#, /// Generate an AI insight for a photo using an agentic tool-calling loop. /// The model decides which tools to call to gather context before writing the final insight. + /// + /// `backend` selects the chat provider: `"local"` (default) routes the + /// agentic loop through the configured Ollama server with the image + /// attached to the first user message; `"hybrid"` asks the local Ollama + /// vision model to describe the image once, inlines the description as + /// text, and runs the loop through OpenRouter (chat only — embeddings + /// and describe calls stay local in either mode). + #[allow(clippy::too_many_arguments)] pub async fn generate_agentic_insight_for_photo( &self, file_path: &str, @@ -2387,6 +2403,7 @@ Return ONLY the summary, nothing else."#, top_k: Option, min_p: Option, max_iterations: usize, + backend: Option, ) -> Result<(Option, Option)> { let tracer = global_tracer(); let current_cx = opentelemetry::Context::current(); @@ -2398,8 +2415,30 @@ Return ONLY the summary, nothing else."#, span.set_attribute(KeyValue::new("file_path", file_path.clone())); span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64)); - // 1. 
Create OllamaClient - let mut ollama_client = if let Some(ref model) = custom_model { + // 1a. Resolve backend label (defaults to "local"). + let backend_label = backend + .as_deref() + .map(|s| s.trim().to_lowercase()) + .filter(|s| !s.is_empty()) + .unwrap_or_else(|| "local".to_string()); + if !matches!(backend_label.as_str(), "local" | "hybrid") { + return Err(anyhow::anyhow!( + "unknown backend '{}'; expected 'local' or 'hybrid'", + backend_label + )); + } + span.set_attribute(KeyValue::new("backend", backend_label.clone())); + let is_hybrid = backend_label == "hybrid"; + + // 1b. Always build an Ollama client. In local mode it owns the chat + // loop; in hybrid mode it still handles describe_image + any + // tool-local calls (e.g. if a future tool needs embeddings). + // Sampling overrides only apply in local mode — in hybrid the + // user's params belong to the OpenRouter chat client. + let apply_sampling_to_ollama = !is_hybrid; + let mut ollama_client = if let Some(ref model) = custom_model + && !is_hybrid + { log::info!("Using custom model for agentic: {}", model); span.set_attribute(KeyValue::new("custom_model", model.clone())); OllamaClient::new( @@ -2409,108 +2448,179 @@ Return ONLY the summary, nothing else."#, Some(model.clone()), ) } else { - span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone())); + if !is_hybrid { + span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone())); + } self.ollama.clone() }; - if let Some(ctx) = num_ctx { - log::info!("Using custom context size: {}", ctx); - span.set_attribute(KeyValue::new("num_ctx", ctx as i64)); - ollama_client.set_num_ctx(Some(ctx)); + if apply_sampling_to_ollama { + if let Some(ctx) = num_ctx { + log::info!("Using custom context size: {}", ctx); + span.set_attribute(KeyValue::new("num_ctx", ctx as i64)); + ollama_client.set_num_ctx(Some(ctx)); + } + + if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() { + log::info!( + "Using 
sampling params — temperature: {:?}, top_p: {:?}, top_k: {:?}, min_p: {:?}", + temperature, + top_p, + top_k, + min_p + ); + if let Some(t) = temperature { + span.set_attribute(KeyValue::new("temperature", t as f64)); + } + if let Some(p) = top_p { + span.set_attribute(KeyValue::new("top_p", p as f64)); + } + if let Some(k) = top_k { + span.set_attribute(KeyValue::new("top_k", k as i64)); + } + if let Some(m) = min_p { + span.set_attribute(KeyValue::new("min_p", m as f64)); + } + ollama_client.set_sampling_params(temperature, top_p, top_k, min_p); + } } - if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() { - log::info!( - "Using sampling params — temperature: {:?}, top_p: {:?}, top_k: {:?}, min_p: {:?}", - temperature, - top_p, - top_k, - min_p - ); - if let Some(t) = temperature { - span.set_attribute(KeyValue::new("temperature", t as f64)); + // 1c. In hybrid mode, clone the configured OpenRouter client and + // apply per-request overrides. + let openrouter_client: Option = if is_hybrid { + let arc = self.openrouter.as_ref().ok_or_else(|| { + anyhow::anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured") + })?; + let mut c: OpenRouterClient = (**arc).clone(); + if let Some(ref m) = custom_model { + c.primary_model = m.clone(); + span.set_attribute(KeyValue::new("custom_model", m.clone())); } - if let Some(p) = top_p { - span.set_attribute(KeyValue::new("top_p", p as f64)); + span.set_attribute(KeyValue::new("openrouter_model", c.primary_model.clone())); + if temperature.is_some() || top_p.is_some() || top_k.is_some() || min_p.is_some() { + if let Some(t) = temperature { + span.set_attribute(KeyValue::new("temperature", t as f64)); + } + if let Some(p) = top_p { + span.set_attribute(KeyValue::new("top_p", p as f64)); + } + if let Some(k) = top_k { + span.set_attribute(KeyValue::new("top_k", k as i64)); + } + if let Some(m) = min_p { + span.set_attribute(KeyValue::new("min_p", m as f64)); + } + 
c.set_sampling_params(temperature, top_p, top_k, min_p); } - if let Some(k) = top_k { - span.set_attribute(KeyValue::new("top_k", k as i64)); + if let Some(ctx) = num_ctx { + span.set_attribute(KeyValue::new("num_ctx", ctx as i64)); + c.set_num_ctx(Some(ctx)); } - if let Some(m) = min_p { - span.set_attribute(KeyValue::new("min_p", m as f64)); - } - ollama_client.set_sampling_params(temperature, top_p, top_k, min_p); - } + Some(c) + } else { + None + }; let insight_cx = current_cx.with_span(span); - // 2a. Verify the model exists on at least one server before checking capabilities - if let Some(ref model_name) = custom_model { - let available_on_primary = - OllamaClient::is_model_available(&ollama_client.primary_url, model_name) - .await - .unwrap_or(false); - - let available_on_fallback = if let Some(ref fallback_url) = ollama_client.fallback_url { - OllamaClient::is_model_available(fallback_url, model_name) - .await - .unwrap_or(false) - } else { - false - }; - - if !available_on_primary && !available_on_fallback { - anyhow::bail!( - "model not available: '{}' not found on any configured server", - model_name - ); - } - } - - // 2b. Check tool calling capability — try primary, fall back to fallback URL - let model_name_for_caps = &ollama_client.primary_model; - let capabilities = match OllamaClient::check_model_capabilities( - &ollama_client.primary_url, - model_name_for_caps, - ) - .await - { - Ok(caps) => caps, - Err(_) => { - // Model may only be on the fallback server - let fallback_url = ollama_client.fallback_url.as_deref().ok_or_else(|| { + // 2. Verify chat model supports tool calling. + // - local: existing Ollama model availability + capability check. + // - hybrid: query OpenRouter's /models for the chosen model. 
+ let has_vision = if is_hybrid { + let or_client = openrouter_client + .as_ref() + .expect("openrouter_client constructed when is_hybrid"); + let caps = or_client + .model_capabilities(&or_client.primary_model) + .await + .map_err(|e| { anyhow::anyhow!( - "Failed to check model capabilities for '{}': model not found on primary server and no fallback configured", - model_name_for_caps + "OpenRouter capability lookup failed for '{}': {}", + or_client.primary_model, + e ) })?; - OllamaClient::check_model_capabilities(fallback_url, model_name_for_caps) - .await - .map_err(|e| { - anyhow::anyhow!( - "Failed to check model capabilities for '{}': {}", - model_name_for_caps, - e - ) - })? + if !caps.has_tool_calling { + return Err(anyhow::anyhow!( + "tool calling not supported by OpenRouter model '{}'", + or_client.primary_model + )); } + insight_cx + .span() + .set_attribute(KeyValue::new("model_has_tool_calling", true)); + // In hybrid mode the chat model never sees images directly — we + // describe-then-inject, so `has_vision` drives only whether we + // bother loading the image to describe it, which we always do. 
+ true + } else { + if let Some(ref model_name) = custom_model { + let available_on_primary = + OllamaClient::is_model_available(&ollama_client.primary_url, model_name) + .await + .unwrap_or(false); + + let available_on_fallback = + if let Some(ref fallback_url) = ollama_client.fallback_url { + OllamaClient::is_model_available(fallback_url, model_name) + .await + .unwrap_or(false) + } else { + false + }; + + if !available_on_primary && !available_on_fallback { + anyhow::bail!( + "model not available: '{}' not found on any configured server", + model_name + ); + } + } + + let model_name_for_caps = &ollama_client.primary_model; + let capabilities = match OllamaClient::check_model_capabilities( + &ollama_client.primary_url, + model_name_for_caps, + ) + .await + { + Ok(caps) => caps, + Err(_) => { + let fallback_url = ollama_client.fallback_url.as_deref().ok_or_else(|| { + anyhow::anyhow!( + "Failed to check model capabilities for '{}': model not found on primary server and no fallback configured", + model_name_for_caps + ) + })?; + OllamaClient::check_model_capabilities(fallback_url, model_name_for_caps) + .await + .map_err(|e| { + anyhow::anyhow!( + "Failed to check model capabilities for '{}': {}", + model_name_for_caps, + e + ) + })? 
+ } + }; + + if !capabilities.has_tool_calling { + return Err(anyhow::anyhow!( + "tool calling not supported by model '{}'", + ollama_client.primary_model + )); + } + + insight_cx + .span() + .set_attribute(KeyValue::new("model_has_vision", capabilities.has_vision)); + insight_cx + .span() + .set_attribute(KeyValue::new("model_has_tool_calling", true)); + + capabilities.has_vision }; - if !capabilities.has_tool_calling { - return Err(anyhow::anyhow!( - "tool calling not supported by model '{}'", - ollama_client.primary_model - )); - } - - let has_vision = capabilities.has_vision; - insight_cx - .span() - .set_attribute(KeyValue::new("model_has_vision", has_vision)); - insight_cx - .span() - .set_attribute(KeyValue::new("model_has_tool_calling", true)); - // 3. Fetch EXIF let exif = { let mut exif_dao = self.exif_dao.lock().expect("Unable to lock ExifDao"); @@ -2603,7 +2713,10 @@ Return ONLY the summary, nothing else."#, } }; - // 7. Load image if vision capable + // 7. Load image if vision capable. + // In hybrid mode we ALSO describe it locally now so the + // description can be inlined as text — the OpenRouter chat model + // never receives the base64 image directly. let image_base64 = if has_vision { match self.load_image_as_base64(&file_path) { Ok(b64) => { @@ -2619,6 +2732,30 @@ Return ONLY the summary, nothing else."#, None }; + let hybrid_visual_description: Option = if is_hybrid { + match image_base64.as_deref() { + Some(b64) => match self.ollama.describe_image(b64).await { + Ok(desc) => { + log::info!( + "Hybrid: local vision describe succeeded ({} chars)", + desc.len() + ); + Some(desc) + } + Err(e) => { + log::warn!( + "Hybrid: local vision describe failed, continuing without: {}", + e + ); + None + } + }, + None => None, + } + } else { + None + }; + // 8. 
Build system message let cameron_id_note = match cameron_entity_id { Some(id) => format!( @@ -2672,8 +2809,13 @@ Return ONLY the summary, nothing else."#, .map(|c| format!("Contact/Person: {}", c)) .unwrap_or_else(|| "Contact/Person: unknown".to_string()); + let visual_block = hybrid_visual_description + .as_deref() + .map(|d| format!("Visual description (from local vision model):\n{}\n\n", d)) + .unwrap_or_default(); + let user_content = format!( - "Please analyze this photo and gather any relevant context from the surrounding weeks.\n\n\ + "{visual_block}Please analyze this photo and gather any relevant context from the surrounding weeks.\n\n\ Photo file path: {}\n\ Date taken: {}\n\ {}\n\ @@ -2686,21 +2828,32 @@ Return ONLY the summary, nothing else."#, contact_info, gps_info, tags_info, + visual_block = visual_block, ); - // 10. Define tools - let tools = Self::build_tool_definitions(has_vision); + // 10. Define tools. Hybrid mode omits `describe_photo` since the + // chat model receives the visual description inline. + let offer_describe_tool = has_vision && !is_hybrid; + let tools = Self::build_tool_definitions(offer_describe_tool); - // 11. Build initial messages + // 11. Build initial messages. In hybrid mode images are never + // attached to the wire message — the description is part of + // `user_content`. let system_msg = ChatMessage::system(system_content); let mut user_msg = ChatMessage::user(user_content); - if let Some(ref img) = image_base64 { + if !is_hybrid && let Some(ref img) = image_base64 { user_msg.images = Some(vec![img.clone()]); } let mut messages = vec![system_msg, user_msg]; - // 12. Agentic loop + // 12. Agentic loop — dispatch through the selected backend. 
+ let chat_backend: &dyn LlmClient = if let Some(ref or_c) = openrouter_client { + or_c + } else { + &ollama_client + }; + let loop_span = tracer.start_with_context("ai.agentic.loop", &insight_cx); let loop_cx = insight_cx.with_span(loop_span); @@ -2713,7 +2866,7 @@ Return ONLY the summary, nothing else."#, iterations_used = iteration + 1; log::info!("Agentic iteration {}/{}", iteration + 1, max_iterations); - let (response, prompt_tokens, eval_tokens) = ollama_client + let (response, prompt_tokens, eval_tokens) = chat_backend .chat_with_tools(messages.clone(), tools.clone()) .await?; @@ -2778,7 +2931,7 @@ Return ONLY the summary, nothing else."#, messages.push(ChatMessage::user( "Based on the context gathered, please write the final photo insight: a title and a detailed personal summary. Write in first person as Cameron.", )); - let (final_response, prompt_tokens, eval_tokens) = ollama_client + let (final_response, prompt_tokens, eval_tokens) = chat_backend .chat_with_tools(messages.clone(), vec![]) .await?; last_prompt_eval_count = prompt_tokens; @@ -2792,10 +2945,18 @@ Return ONLY the summary, nothing else."#, .set_attribute(KeyValue::new("iterations_used", iterations_used as i64)); loop_cx.span().set_status(Status::Ok); - // 13. Generate title - let title = ollama_client - .generate_photo_title(&final_content, custom_system_prompt.as_deref()) + // 13. Generate title via the same backend so voice stays consistent. + let title_prompt = format!( + "Create a short title (maximum 8 words) for the following journal entry:\n\n{}\n\nCapture the key moment or theme. Return ONLY the title, nothing else.", + final_content + ); + let title_system = custom_system_prompt.as_deref().unwrap_or( + "You are my long term memory assistant. Use only the information provided. 
Do not invent details.", + ); + let title_raw = chat_backend + .generate(&title_prompt, Some(title_system), None) .await?; + let title = title_raw.trim().trim_matches('"').to_string(); log::info!("Agentic generated title: {}", title); log::info!( @@ -2814,15 +2975,17 @@ Return ONLY the summary, nothing else."#, }; // 15. Store insight (returns the persisted row including its new id) + let model_version = chat_backend.primary_model().to_string(); let insight = InsertPhotoInsight { library_id: crate::libraries::PRIMARY_LIBRARY_ID, file_path: file_path.to_string(), title, summary: final_content, generated_at: Utc::now().timestamp(), - model_version: ollama_client.primary_model.clone(), + model_version, is_current: true, training_messages, + backend: backend_label.clone(), }; let stored = { diff --git a/src/bin/populate_knowledge.rs b/src/bin/populate_knowledge.rs index bc37960..f70c5a2 100644 --- a/src/bin/populate_knowledge.rs +++ b/src/bin/populate_knowledge.rs @@ -134,6 +134,7 @@ async fn main() -> anyhow::Result<()> { let generator = InsightGenerator::new( ollama, + None, sms_client, insight_dao.clone(), exif_dao, @@ -249,6 +250,7 @@ async fn main() -> anyhow::Result<()> { args.top_k, args.min_p, args.max_iterations, + None, ) .await { diff --git a/src/database/models.rs b/src/database/models.rs index d95876b..3d63f1a 100644 --- a/src/database/models.rs +++ b/src/database/models.rs @@ -100,6 +100,8 @@ pub struct InsertPhotoInsight { pub model_version: String, pub is_current: bool, pub training_messages: Option, + /// `"local"` (Ollama with images) | `"hybrid"` (local vision + OpenRouter chat). + pub backend: String, } #[derive(Serialize, Queryable, Clone, Debug)] @@ -115,6 +117,8 @@ pub struct PhotoInsight { pub is_current: bool, pub training_messages: Option, pub approved: Option, + /// `"local"` (Ollama with images) | `"hybrid"` (local vision + OpenRouter chat). 
+ pub backend: String, } // --- Libraries --- diff --git a/src/database/schema.rs b/src/database/schema.rs index 3352ca6..200cb15 100644 --- a/src/database/schema.rs +++ b/src/database/schema.rs @@ -142,6 +142,7 @@ diesel::table! { is_current -> Bool, training_messages -> Nullable, approved -> Nullable, + backend -> Text, } } diff --git a/src/state.rs b/src/state.rs index 78b98ad..72a509e 100644 --- a/src/state.rs +++ b/src/state.rs @@ -1,3 +1,4 @@ +use crate::ai::openrouter::OpenRouterClient; use crate::ai::{InsightGenerator, OllamaClient, SmsApiClient}; use crate::database::{ CalendarEventDao, DailySummaryDao, ExifDao, InsightDao, KnowledgeDao, LocationHistoryDao, @@ -31,6 +32,13 @@ pub struct AppState { pub preview_clips_path: String, pub excluded_dirs: Vec, pub ollama: OllamaClient, + /// `None` when `OPENROUTER_API_KEY` is not configured. Consulted only + /// when a request explicitly opts into `backend=hybrid`. Currently + /// reached via `insight_generator`; kept here so future handlers + /// (insight_chat) can route to it without threading it through the + /// generator. 
+ #[allow(dead_code)] + pub openrouter: Option>, pub sms_client: SmsApiClient, pub insight_generator: InsightGenerator, } @@ -61,6 +69,7 @@ impl AppState { preview_clips_path: String, excluded_dirs: Vec, ollama: OllamaClient, + openrouter: Option>, sms_client: SmsApiClient, insight_generator: InsightGenerator, preview_dao: Arc>>, @@ -92,6 +101,7 @@ impl AppState { preview_clips_path, excluded_dirs, ollama, + openrouter, sms_client, insight_generator, } @@ -127,6 +137,8 @@ impl Default for AppState { ollama_fallback_model, ); + let openrouter = build_openrouter_from_env(); + let sms_api_url = env::var("SMS_API_URL").unwrap_or_else(|_| "http://localhost:8000".to_string()); let sms_api_token = env::var("SMS_API_TOKEN").ok(); @@ -168,6 +180,7 @@ impl Default for AppState { // Initialize InsightGenerator with all data sources let insight_generator = InsightGenerator::new( ollama.clone(), + openrouter.clone(), sms_client.clone(), insight_dao.clone(), exif_dao.clone(), @@ -195,6 +208,7 @@ impl Default for AppState { preview_clips_path, Self::parse_excluded_dirs(), ollama, + openrouter, sms_client, insight_generator, preview_dao, @@ -202,6 +216,25 @@ impl Default for AppState { } } +/// Build an `OpenRouterClient` from environment variables. Returns `None` +/// when `OPENROUTER_API_KEY` is unset (the hybrid backend is then +/// unavailable and requests for it return a clear error). 
+fn build_openrouter_from_env() -> Option> { + let api_key = env::var("OPENROUTER_API_KEY").ok()?; + let base_url = env::var("OPENROUTER_BASE_URL").ok(); + let default_model = env::var("OPENROUTER_DEFAULT_MODEL") + .unwrap_or_else(|_| "anthropic/claude-sonnet-4".to_string()); + let mut client = OpenRouterClient::new(api_key, base_url, default_model); + client.set_attribution( + env::var("OPENROUTER_HTTP_REFERER").ok(), + env::var("OPENROUTER_APP_TITLE").ok(), + ); + if let Ok(model) = env::var("OPENROUTER_EMBEDDING_MODEL") { + client.set_embedding_model(model); + } + Some(Arc::new(client)) +} + #[cfg(test)] impl AppState { /// Creates an AppState instance for testing with temporary directories @@ -255,6 +288,7 @@ impl AppState { }; let insight_generator = InsightGenerator::new( ollama.clone(), + None, sms_client.clone(), insight_dao.clone(), exif_dao.clone(), @@ -286,6 +320,7 @@ impl AppState { preview_clips_path.to_string_lossy().to_string(), Vec::new(), // No excluded directories for test state ollama, + None, sms_client, insight_generator, preview_dao, -- 2.49.1 From e2eefbd15652a238da9391835c5509dcf73ea0bc Mon Sep 17 00:00:00 2001 From: Cameron Date: Tue, 21 Apr 2026 10:36:19 -0400 Subject: [PATCH 04/24] feat(ai): curated OpenRouter model picker for hybrid backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add OPENROUTER_ALLOWED_MODELS env var and GET /insights/openrouter/models endpoint returning the curated list verbatim. Drop the live capability precheck in hybrid mode — trust the operator's allowlist; bad ids surface as a chat-call error. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 22 ++++++++++++++++++++++ README.md | 23 +++++++++++++++++++++++ src/ai/handlers.rs | 28 ++++++++++++++++++++++++++++ src/ai/insight_generator.rs | 28 ++++------------------------ src/ai/mod.rs | 2 +- src/main.rs | 1 + src/state.rs | 20 ++++++++++++++++++++ 7 files changed, 99 insertions(+), 25 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 5da2612..0849e41 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -258,6 +258,17 @@ OLLAMA_PRIMARY_MODEL=nemotron-3-nano:30b # Model for primary server (defau OLLAMA_FALLBACK_MODEL=llama3.2:3b # Model for fallback server (optional, uses primary if not set) SMS_API_URL=http://localhost:8000 # SMS message API endpoint (default: localhost:8000) SMS_API_TOKEN=your-api-token # SMS API authentication token (optional) + +# OpenRouter (Hybrid Backend) - keeps embeddings + vision local, routes chat to OpenRouter +OPENROUTER_API_KEY=sk-or-... # Required to enable hybrid backend +OPENROUTER_DEFAULT_MODEL=anthropic/claude-sonnet-4 # Used when client doesn't pick a model +OPENROUTER_ALLOWED_MODELS=openai/gpt-4o-mini,anthropic/claude-haiku-4-5,google/gemini-2.5-flash + # Curated allowlist exposed to clients via + # GET /insights/openrouter/models. Empty = no picker. +OPENROUTER_BASE_URL=https://openrouter.ai/api/v1 # Override base URL (optional) +OPENROUTER_EMBEDDING_MODEL=openai/text-embedding-3-small # Optional, embeddings stay local today +OPENROUTER_HTTP_REFERER=https://your-site.example # Optional attribution header +OPENROUTER_APP_TITLE=ImageApi # Optional attribution header ``` **AI Insights Fallback Behavior:** @@ -275,6 +286,17 @@ The `OllamaClient` provides methods to query available models: This allows runtime verification of model availability before generating insights. +**Hybrid Backend (OpenRouter):** +- Per-request opt-in via `backend=hybrid` on `POST /insights/generate/agentic`. 
+- Local Ollama still describes the image (vision); the description is inlined + into the chat prompt and the agentic loop runs on OpenRouter. +- `request.model` (if provided) overrides `OPENROUTER_DEFAULT_MODEL` for that + call. The mobile picker reads from `OPENROUTER_ALLOWED_MODELS`. +- No live capability precheck — the operator-curated allowlist is trusted. + A bad model id surfaces as a chat-call error. +- `GET /insights/openrouter/models` returns `{ models, default_model, configured }` + for client picker UIs. + ## Dependencies of Note - **actix-web**: HTTP framework diff --git a/README.md b/README.md index c8f1c69..1bd3c9a 100644 --- a/README.md +++ b/README.md @@ -50,6 +50,29 @@ The following environment variables configure AI-powered photo insights and dail - `OLLAMA_URL` - Used if `OLLAMA_PRIMARY_URL` not set - `OLLAMA_MODEL` - Used if `OLLAMA_PRIMARY_MODEL` not set +#### OpenRouter Configuration (Hybrid Backend) +The hybrid agentic backend keeps embeddings + vision local (Ollama) while routing +chat + tool-calling to OpenRouter. Enabled per-request when the client sends +`backend=hybrid`. + +- `OPENROUTER_API_KEY` - OpenRouter API key. Required to enable the hybrid backend. +- `OPENROUTER_DEFAULT_MODEL` - Model id used when the client doesn't specify one + [default: `anthropic/claude-sonnet-4`] + - Example: `openai/gpt-4o-mini`, `google/gemini-2.5-flash` +- `OPENROUTER_ALLOWED_MODELS` - Comma-separated curated allowlist exposed to + clients via `GET /insights/openrouter/models`. The mobile picker shows only + these. Empty/unset = no picker, server default is used. + - Example: `openai/gpt-4o-mini,anthropic/claude-haiku-4-5,google/gemini-2.5-flash` +- `OPENROUTER_BASE_URL` - Override base URL [default: `https://openrouter.ai/api/v1`] +- `OPENROUTER_EMBEDDING_MODEL` - Embedding model for OpenRouter + [default: `openai/text-embedding-3-small`]. Only used if/when embeddings are + routed through OpenRouter (currently embeddings stay local). 
+- `OPENROUTER_HTTP_REFERER` - Optional `HTTP-Referer` for OpenRouter attribution +- `OPENROUTER_APP_TITLE` - Optional `X-Title` for OpenRouter attribution + +Capability checks are skipped for the curated allowlist — bad model ids surface +as a 4xx from the chat call. Pick tool-capable models. + #### SMS API Configuration - `SMS_API_URL` - URL to SMS message API [default: `http://localhost:8000`] - Used to fetch conversation data for context in insights diff --git a/src/ai/handlers.rs b/src/ai/handlers.rs index 038470d..5c4036e 100644 --- a/src/ai/handlers.rs +++ b/src/ai/handlers.rs @@ -445,6 +445,34 @@ pub async fn get_available_models_handler( HttpResponse::Ok().json(response) } +#[derive(Debug, Serialize)] +pub struct OpenRouterModelsResponse { + pub models: Vec, + pub default_model: Option, + pub configured: bool, +} + +/// GET /insights/openrouter/models - Curated OpenRouter model ids exposed +/// to clients for the hybrid backend. Returned verbatim from +/// `OPENROUTER_ALLOWED_MODELS`; no live call to OpenRouter. +#[get("/insights/openrouter/models")] +pub async fn get_openrouter_models_handler( + _claims: Claims, + app_state: web::Data, +) -> impl Responder { + let configured = app_state.openrouter.is_some(); + let default_model = app_state + .openrouter + .as_ref() + .map(|c| c.primary_model.clone()); + let response = OpenRouterModelsResponse { + models: app_state.openrouter_allowed_models.clone(), + default_model, + configured, + }; + HttpResponse::Ok().json(response) +} + /// POST /insights/rate - Rate an insight (thumbs up/down for training data) #[post("/insights/rate")] pub async fn rate_insight_handler( diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index 292324c..fcf89ee 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -2525,30 +2525,10 @@ Return ONLY the summary, nothing else."#, // 2. Verify chat model supports tool calling. // - local: existing Ollama model availability + capability check. 
- // - hybrid: query OpenRouter's /models for the chosen model. + // - hybrid: trust the operator's curated allowlist + // (OPENROUTER_ALLOWED_MODELS) — no live precheck. A bad model id + // surfaces as a chat-call error on the next step. let has_vision = if is_hybrid { - let or_client = openrouter_client - .as_ref() - .expect("openrouter_client constructed when is_hybrid"); - let caps = or_client - .model_capabilities(&or_client.primary_model) - .await - .map_err(|e| { - anyhow::anyhow!( - "OpenRouter capability lookup failed for '{}': {}", - or_client.primary_model, - e - ) - })?; - if !caps.has_tool_calling { - return Err(anyhow::anyhow!( - "tool calling not supported by OpenRouter model '{}'", - or_client.primary_model - )); - } - insight_cx - .span() - .set_attribute(KeyValue::new("model_has_tool_calling", true)); // In hybrid mode the chat model never sees images directly — we // describe-then-inject, so `has_vision` drives only whether we // bother loading the image to describe it, which we always do. @@ -2776,7 +2756,7 @@ Return ONLY the summary, nothing else."#, 3. Use recall_facts_for_photo to load any previously stored knowledge about subjects in this photo.\n\ 4. Use recall_entities to look up known people, places, or things that appear in this photo.\n\ 5. When you identify people, places, events, or notable things in this photo: use store_entity to record them and store_fact to record key facts (relationships, roles, attributes). This builds a persistent memory for future insights.\n\ - 6. Only produce your final insight AFTER you have gathered context from at least 5-12 tool calls.\n\ + 6. Only produce your final insight AFTER you have gathered context from at least 5 tool calls.\n\ 7. 
If a tool returns no results, that is useful information — continue calling the remaining tools anyway.", cameron_id_note = cameron_id_note ); diff --git a/src/ai/mod.rs b/src/ai/mod.rs index 5414e69..60a3f43 100644 --- a/src/ai/mod.rs +++ b/src/ai/mod.rs @@ -12,7 +12,7 @@ pub use daily_summary_job::{generate_daily_summaries, strip_summary_boilerplate} pub use handlers::{ delete_insight_handler, export_training_data_handler, generate_agentic_insight_handler, generate_insight_handler, get_all_insights_handler, get_available_models_handler, - get_insight_handler, rate_insight_handler, + get_insight_handler, get_openrouter_models_handler, rate_insight_handler, }; pub use insight_generator::InsightGenerator; #[allow(unused_imports)] diff --git a/src/main.rs b/src/main.rs index 570cf58..2deee7e 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1355,6 +1355,7 @@ fn main() -> std::io::Result<()> { .service(ai::delete_insight_handler) .service(ai::get_all_insights_handler) .service(ai::get_available_models_handler) + .service(ai::get_openrouter_models_handler) .service(ai::rate_insight_handler) .service(ai::export_training_data_handler) .service(libraries::list_libraries) diff --git a/src/state.rs b/src/state.rs index 72a509e..dd8628a 100644 --- a/src/state.rs +++ b/src/state.rs @@ -39,6 +39,9 @@ pub struct AppState { /// generator. #[allow(dead_code)] pub openrouter: Option>, + /// Curated list of OpenRouter model ids exposed to clients. Sourced from + /// `OPENROUTER_ALLOWED_MODELS` (comma-separated). Empty when unset. 
+ pub openrouter_allowed_models: Vec, pub sms_client: SmsApiClient, pub insight_generator: InsightGenerator, } @@ -70,6 +73,7 @@ impl AppState { excluded_dirs: Vec, ollama: OllamaClient, openrouter: Option>, + openrouter_allowed_models: Vec, sms_client: SmsApiClient, insight_generator: InsightGenerator, preview_dao: Arc>>, @@ -102,6 +106,7 @@ impl AppState { excluded_dirs, ollama, openrouter, + openrouter_allowed_models, sms_client, insight_generator, } @@ -138,6 +143,7 @@ impl Default for AppState { ); let openrouter = build_openrouter_from_env(); + let openrouter_allowed_models = parse_openrouter_allowed_models(); let sms_api_url = env::var("SMS_API_URL").unwrap_or_else(|_| "http://localhost:8000".to_string()); @@ -209,6 +215,7 @@ impl Default for AppState { Self::parse_excluded_dirs(), ollama, openrouter, + openrouter_allowed_models, sms_client, insight_generator, preview_dao, @@ -235,6 +242,18 @@ fn build_openrouter_from_env() -> Option> { Some(Arc::new(client)) } +/// Parse `OPENROUTER_ALLOWED_MODELS` (comma-separated) into a vec. Returns +/// empty when unset, in which case `/insights/openrouter/models` reports no +/// curated picks and the server falls back to `OPENROUTER_DEFAULT_MODEL`. +fn parse_openrouter_allowed_models() -> Vec { + env::var("OPENROUTER_ALLOWED_MODELS") + .unwrap_or_default() + .split(',') + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) + .collect() +} + #[cfg(test)] impl AppState { /// Creates an AppState instance for testing with temporary directories @@ -321,6 +340,7 @@ impl AppState { Vec::new(), // No excluded directories for test state ollama, None, + Vec::new(), sms_client, insight_generator, preview_dao, -- 2.49.1 From 0b9528f61e71328a9e7efbff0dcf0c74a06cac59 Mon Sep 17 00:00:00 2001 From: Cameron Date: Tue, 21 Apr 2026 13:00:27 -0400 Subject: [PATCH 05/24] feat(ai): chat continuation for photo insights (server v1) Adds POST /insights/chat and GET /insights/chat/history. 
Replays the stored agentic conversation through the same backend the insight was generated with (or a per-turn override), runs a short tool-calling loop, and persists the extended history in append or amend mode. Backend switching: same-backend or hybrid->local replay verbatim; local->hybrid is rejected in v1 (would require on-the-fly vision description rewrite). Per-(library, file) async mutex serialises concurrent turns. Soft context budget drops oldest tool_call+result pairs when the serialized history exceeds num_ctx - 2048 tokens. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ai/handlers.rs | 190 +++++++++++ src/ai/insight_chat.rs | 640 +++++++++++++++++++++++++++++++++++ src/ai/insight_generator.rs | 8 +- src/ai/mod.rs | 8 +- src/database/insights_dao.rs | 37 ++ src/main.rs | 2 + src/state.rs | 29 ++ 7 files changed, 907 insertions(+), 7 deletions(-) create mode 100644 src/ai/insight_chat.rs diff --git a/src/ai/handlers.rs b/src/ai/handlers.rs index 5c4036e..b07b157 100644 --- a/src/ai/handlers.rs +++ b/src/ai/handlers.rs @@ -3,6 +3,7 @@ use opentelemetry::KeyValue; use opentelemetry::trace::{Span, Status, Tracer}; use serde::{Deserialize, Serialize}; +use crate::ai::insight_chat::ChatTurnRequest; use crate::ai::{InsightGenerator, ModelCapabilities, OllamaClient}; use crate::data::Claims; use crate::database::{ExifDao, InsightDao}; @@ -70,6 +71,9 @@ pub struct PhotoInsightResponse { #[serde(skip_serializing_if = "Option::is_none")] pub approved: Option, pub backend: String, + /// True when the insight was generated agentically and a chat + /// continuation can be started against it. Drives the mobile chat button. 
+ pub has_training_messages: bool, } #[derive(Debug, Serialize)] @@ -192,6 +196,7 @@ pub async fn get_insight_handler( prompt_eval_count: None, eval_count: None, approved: insight.approved, + has_training_messages: insight.training_messages.is_some(), backend: insight.backend, }; HttpResponse::Ok().json(response) @@ -260,6 +265,7 @@ pub async fn get_all_insights_handler( prompt_eval_count: None, eval_count: None, approved: insight.approved, + has_training_messages: insight.training_messages.is_some(), backend: insight.backend, }) .collect(); @@ -353,6 +359,7 @@ pub async fn generate_agentic_insight_handler( prompt_eval_count, eval_count, approved: insight.approved, + has_training_messages: insight.training_messages.is_some(), backend: insight.backend, }; HttpResponse::Ok().json(response) @@ -558,3 +565,186 @@ pub async fn export_training_data_handler( } } } + +#[derive(Debug, Deserialize)] +pub struct ChatTurnHttpRequest { + pub file_path: String, + #[serde(default)] + pub library: Option, + pub user_message: String, + #[serde(default)] + pub model: Option, + #[serde(default)] + pub backend: Option, + #[serde(default)] + pub num_ctx: Option, + #[serde(default)] + pub temperature: Option, + #[serde(default)] + pub top_p: Option, + #[serde(default)] + pub top_k: Option, + #[serde(default)] + pub min_p: Option, + #[serde(default)] + pub max_iterations: Option, + #[serde(default)] + pub amend: bool, +} + +#[derive(Debug, Serialize)] +pub struct ChatTurnHttpResponse { + pub assistant_message: String, + pub tool_calls_made: usize, + pub iterations_used: usize, + pub truncated: bool, + #[serde(skip_serializing_if = "Option::is_none")] + pub prompt_eval_count: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub eval_count: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub amended_insight_id: Option, + pub backend: String, + pub model: String, +} + +/// POST /insights/chat — submit a follow-up turn against an existing insight. 
+#[post("/insights/chat")] +pub async fn chat_turn_handler( + http_request: HttpRequest, + _claims: Claims, + request: web::Json, + app_state: web::Data, +) -> impl Responder { + let parent_context = extract_context_from_request(&http_request); + let tracer = global_tracer(); + let mut span = tracer.start_with_context("http.insights.chat", &parent_context); + span.set_attribute(KeyValue::new("file_path", request.file_path.clone())); + + let library = match libraries::resolve_library_param(&app_state, request.library.as_deref()) { + Ok(Some(lib)) => lib, + Ok(None) => app_state.primary_library(), + Err(e) => { + return HttpResponse::BadRequest().json(serde_json::json!({ + "error": format!("invalid library: {}", e) + })); + } + }; + + let chat_req = ChatTurnRequest { + library_id: library.id, + file_path: request.file_path.clone(), + user_message: request.user_message.clone(), + model: request.model.clone(), + backend: request.backend.clone(), + num_ctx: request.num_ctx, + temperature: request.temperature, + top_p: request.top_p, + top_k: request.top_k, + min_p: request.min_p, + max_iterations: request.max_iterations, + amend: request.amend, + }; + + match app_state.insight_chat.chat_turn(chat_req).await { + Ok(result) => { + span.set_status(Status::Ok); + HttpResponse::Ok().json(ChatTurnHttpResponse { + assistant_message: result.assistant_message, + tool_calls_made: result.tool_calls_made, + iterations_used: result.iterations_used, + truncated: result.truncated, + prompt_eval_count: result.prompt_eval_count, + eval_count: result.eval_count, + amended_insight_id: result.amended_insight_id, + backend: result.backend_used, + model: result.model_used, + }) + } + Err(e) => { + let msg = format!("{}", e); + log::error!("Chat turn failed: {}", msg); + span.set_status(Status::error(msg.clone())); + + // Map well-known errors to client-facing 4xx codes. 
+ if msg.contains("no insight found") { + HttpResponse::NotFound().json(serde_json::json!({ "error": msg })) + } else if msg.contains("no chat history") { + HttpResponse::Conflict().json(serde_json::json!({ "error": msg })) + } else if msg.contains("user_message") + || msg.contains("unknown backend") + || msg.contains("switching from local to hybrid") + || msg.contains("hybrid backend unavailable") + { + HttpResponse::BadRequest().json(serde_json::json!({ "error": msg })) + } else { + HttpResponse::InternalServerError().json(serde_json::json!({ "error": msg })) + } + } + } +} + +#[derive(Debug, Deserialize)] +pub struct ChatHistoryQuery { + pub path: String, + #[serde(default)] + pub library: Option, +} + +#[derive(Debug, Serialize)] +pub struct ChatHistoryHttpResponse { + pub messages: Vec, + pub turn_count: usize, + pub model_version: String, + pub backend: String, +} + +#[derive(Debug, Serialize)] +pub struct RenderedHistoryMessage { + pub role: String, + pub content: String, + pub is_initial: bool, +} + +/// GET /insights/chat/history — return the rendered transcript for a photo. +#[get("/insights/chat/history")] +pub async fn chat_history_handler( + _claims: Claims, + query: web::Query, + app_state: web::Data, +) -> impl Responder { + // library param parsed for parity with other insight endpoints, even + // though load_history currently keys on file_path alone (matches the + // existing get_insight DAO contract). 
+ let _library = libraries::resolve_library_param(&app_state, query.library.as_deref()) + .ok() + .flatten() + .unwrap_or_else(|| app_state.primary_library()); + + match app_state.insight_chat.load_history(&query.path) { + Ok(view) => HttpResponse::Ok().json(ChatHistoryHttpResponse { + messages: view + .messages + .into_iter() + .map(|m| RenderedHistoryMessage { + role: m.role, + content: m.content, + is_initial: m.is_initial, + }) + .collect(), + turn_count: view.turn_count, + model_version: view.model_version, + backend: view.backend, + }), + Err(e) => { + let msg = format!("{}", e); + if msg.contains("no insight found") { + HttpResponse::NotFound().json(serde_json::json!({ "error": msg })) + } else if msg.contains("no chat history") { + HttpResponse::Conflict().json(serde_json::json!({ "error": msg })) + } else { + HttpResponse::InternalServerError().json(serde_json::json!({ "error": msg })) + } + } + } +} diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs new file mode 100644 index 0000000..7e5422c --- /dev/null +++ b/src/ai/insight_chat.rs @@ -0,0 +1,640 @@ +use anyhow::{Result, anyhow, bail}; +use chrono::Utc; +use opentelemetry::KeyValue; +use opentelemetry::trace::{Span, Status, TraceContextExt, Tracer}; +use std::collections::HashMap; +use std::sync::{Arc, Mutex}; +use tokio::sync::Mutex as TokioMutex; + +use crate::ai::insight_generator::InsightGenerator; +use crate::ai::llm_client::{ChatMessage, LlmClient}; +use crate::ai::ollama::OllamaClient; +use crate::ai::openrouter::OpenRouterClient; +use crate::database::InsightDao; +use crate::database::models::InsertPhotoInsight; +use crate::otel::global_tracer; +use crate::utils::normalize_path; + +const DEFAULT_MAX_ITERATIONS: usize = 6; +const DEFAULT_NUM_CTX: i32 = 8192; +/// Headroom reserved for the model's response, deducted from the context +/// budget when deciding whether to truncate the replayed history. 
+const RESPONSE_HEADROOM_TOKENS: usize = 2048; +/// Cheap byte-to-token approximation used by the truncation pass. The real +/// tokenization is model-specific; this avoids carrying tiktoken just for a +/// soft bound. +const BYTES_PER_TOKEN: usize = 4; + +pub type ChatLockMap = Arc>>>>; + +#[derive(Debug)] +pub struct ChatTurnRequest { + pub library_id: i32, + pub file_path: String, + pub user_message: String, + /// Override the model id. Local mode: an Ollama model name. Hybrid: + /// an OpenRouter id. None defers to the stored insight's `model_version`. + pub model: Option, + /// Override the backend used for this turn. None defers to the stored + /// insight's `backend`. Switching `local -> hybrid` is rejected in v1. + pub backend: Option, + pub num_ctx: Option, + pub temperature: Option, + pub top_p: Option, + pub top_k: Option, + pub min_p: Option, + pub max_iterations: Option, + /// When true, write a new insight row (regenerating title) instead of + /// updating training_messages on the existing row. + pub amend: bool, +} + +#[derive(Debug)] +pub struct ChatTurnResult { + pub assistant_message: String, + pub tool_calls_made: usize, + pub iterations_used: usize, + pub truncated: bool, + pub prompt_eval_count: Option, + pub eval_count: Option, + /// Set when `amend=true` and the new insight row was inserted. + pub amended_insight_id: Option, + /// Backend used for this turn — useful when the client overrode the + /// stored value. + pub backend_used: String, + /// Model identifier the chat backend ran with. 
+ pub model_used: String, +} + +#[derive(Clone)] +pub struct InsightChatService { + generator: Arc, + ollama: OllamaClient, + openrouter: Option>, + insight_dao: Arc>>, + chat_locks: ChatLockMap, +} + +impl InsightChatService { + pub fn new( + generator: Arc, + ollama: OllamaClient, + openrouter: Option>, + insight_dao: Arc>>, + chat_locks: ChatLockMap, + ) -> Self { + Self { + generator, + ollama, + openrouter, + insight_dao, + chat_locks, + } + } + + /// Load the rendered transcript for chat-UI display. Filters internal + /// scaffolding (system message, tool turns, tool-dispatch-only assistant + /// messages) and drops base64 images from user turns to keep payloads + /// small. The first remaining user message is flagged `is_initial`. + pub fn load_history(&self, file_path: &str) -> Result { + let normalized = normalize_path(file_path); + let cx = opentelemetry::Context::new(); + let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao"); + let insight = dao + .get_insight(&cx, &normalized) + .map_err(|e| anyhow!("failed to load insight: {:?}", e))? 
+ .ok_or_else(|| anyhow!("no insight found for path"))?; + + let raw = insight + .training_messages + .as_ref() + .ok_or_else(|| anyhow!("insight has no chat history (pre-agentic insight)"))?; + let messages: Vec = serde_json::from_str(raw) + .map_err(|e| anyhow!("failed to deserialize chat history: {}", e))?; + + let mut rendered = Vec::new(); + let mut user_turns_seen = 0usize; + let mut assistant_turns_seen = 0usize; + for msg in &messages { + match msg.role.as_str() { + "system" => continue, + "tool" => continue, + "assistant" => { + let has_tool_calls = msg + .tool_calls + .as_ref() + .map(|c| !c.is_empty()) + .unwrap_or(false); + if has_tool_calls && msg.content.trim().is_empty() { + continue; + } + assistant_turns_seen += 1; + rendered.push(RenderedMessage { + role: "assistant".to_string(), + content: msg.content.clone(), + is_initial: false, + }); + } + "user" => { + let is_initial = user_turns_seen == 0; + user_turns_seen += 1; + rendered.push(RenderedMessage { + role: "user".to_string(), + content: msg.content.clone(), + is_initial, + }); + } + _ => continue, + } + } + + Ok(HistoryView { + messages: rendered, + turn_count: assistant_turns_seen, + model_version: insight.model_version, + backend: insight.backend, + }) + } + + pub async fn chat_turn(&self, req: ChatTurnRequest) -> Result { + let tracer = global_tracer(); + let parent_cx = opentelemetry::Context::new(); + let mut span = tracer.start_with_context("ai.insight.chat_turn", &parent_cx); + span.set_attribute(KeyValue::new("file_path", req.file_path.clone())); + span.set_attribute(KeyValue::new("library_id", req.library_id as i64)); + span.set_attribute(KeyValue::new("amend", req.amend)); + + if req.user_message.trim().is_empty() { + bail!("user_message must not be empty"); + } + if req.user_message.len() > 8192 { + bail!("user_message exceeds 8192 chars"); + } + + let normalized = normalize_path(&req.file_path); + + // 1. Acquire the per-(library, file) async mutex. 
Two concurrent + // chat turns on the same insight would race on the JSON blob — + // the lock serialises them. + let lock_key = (req.library_id, normalized.clone()); + let entry_lock = { + let mut locks = self.chat_locks.lock().await; + locks + .entry(lock_key.clone()) + .or_insert_with(|| Arc::new(TokioMutex::new(()))) + .clone() + }; + let _guard = entry_lock.lock().await; + + // 2. Load the current insight + history. + let insight = { + let cx = opentelemetry::Context::new(); + let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao"); + dao.get_insight(&cx, &normalized) + .map_err(|e| anyhow!("failed to load insight: {:?}", e))? + .ok_or_else(|| anyhow!("no insight found for path"))? + }; + let raw_history = insight + .training_messages + .as_ref() + .ok_or_else(|| { + anyhow!("insight has no chat history; regenerate this insight in agentic mode") + })? + .clone(); + let mut messages: Vec = serde_json::from_str(&raw_history) + .map_err(|e| anyhow!("failed to deserialize chat history: {}", e))?; + + // 3. Resolve effective backend. Reject the unsupported switch. + let stored_backend = insight.backend.clone(); + let effective_backend = req + .backend + .as_deref() + .map(|s| s.trim().to_lowercase()) + .filter(|s| !s.is_empty()) + .unwrap_or_else(|| stored_backend.clone()); + if !matches!(effective_backend.as_str(), "local" | "hybrid") { + bail!( + "unknown backend '{}'; expected 'local' or 'hybrid'", + effective_backend + ); + } + if stored_backend == "local" && effective_backend == "hybrid" { + bail!( + "switching from local to hybrid mid-chat isn't supported yet; \ + regenerate the insight in hybrid mode if you want OpenRouter chat" + ); + } + let is_hybrid = effective_backend == "hybrid"; + span.set_attribute(KeyValue::new("backend", effective_backend.clone())); + + // 4. Build the chat backend client. 
Ollama in local mode, a freshly + // cloned OpenRouter client in hybrid mode (clone so per-request + // sampling/model overrides don't leak into shared state). + let max_iterations = req + .max_iterations + .unwrap_or(DEFAULT_MAX_ITERATIONS) + .clamp(1, env_max_iterations()); + span.set_attribute(KeyValue::new("max_iterations", max_iterations as i64)); + + let stored_model = insight.model_version.clone(); + let custom_model = req + .model + .clone() + .or_else(|| Some(stored_model.clone())) + .filter(|m| !m.is_empty()); + + let mut ollama_client = self.ollama.clone(); + let mut openrouter_client: Option = None; + + if is_hybrid { + let arc = self.openrouter.as_ref().ok_or_else(|| { + anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured") + })?; + let mut c: OpenRouterClient = (**arc).clone(); + if let Some(ref m) = custom_model { + c.primary_model = m.clone(); + } + if req.temperature.is_some() + || req.top_p.is_some() + || req.top_k.is_some() + || req.min_p.is_some() + { + c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); + } + if let Some(ctx) = req.num_ctx { + c.set_num_ctx(Some(ctx)); + } + openrouter_client = Some(c); + } else { + // Local-mode model swap. Build a new client when the chat model + // differs from the configured one (mirrors the agentic pattern). 
+ if let Some(ref m) = custom_model + && m != &self.ollama.primary_model + { + ollama_client = OllamaClient::new( + self.ollama.primary_url.clone(), + self.ollama.fallback_url.clone(), + m.clone(), + Some(m.clone()), + ); + } + if req.temperature.is_some() + || req.top_p.is_some() + || req.top_k.is_some() + || req.min_p.is_some() + { + ollama_client.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); + } + if let Some(ctx) = req.num_ctx { + ollama_client.set_num_ctx(Some(ctx)); + } + } + + let chat_backend: &dyn LlmClient = if let Some(ref c) = openrouter_client { + c + } else { + &ollama_client + }; + let model_used = chat_backend.primary_model().to_string(); + span.set_attribute(KeyValue::new("model", model_used.clone())); + + // 5. Decide vision + tool set. In hybrid we always omit + // `describe_photo` (matches the original generation flow). In + // local we trust the stored history's first-user shape: if it + // carries `images`, the original model was vision-capable, and + // we keep `describe_photo` available. + let local_first_user_has_image = messages + .iter() + .find(|m| m.role == "user") + .and_then(|m| m.images.as_ref()) + .map(|imgs| !imgs.is_empty()) + .unwrap_or(false); + let offer_describe_tool = !is_hybrid && local_first_user_has_image; + let tools = InsightGenerator::build_tool_definitions(offer_describe_tool); + + // Image base64 only needed when describe_photo is on the menu. Load + // lazily to avoid disk IO when the loop never invokes it. + let image_base64: Option = if offer_describe_tool { + self.generator.load_image_as_base64(&normalized).ok() + } else { + None + }; + + // 6. Apply truncation budget. Drops oldest tool_call+tool pairs + // (preserves system + first user including any images). 
+ let budget_tokens = (req.num_ctx.unwrap_or(DEFAULT_NUM_CTX) as usize) + .saturating_sub(RESPONSE_HEADROOM_TOKENS); + let budget_bytes = budget_tokens.saturating_mul(BYTES_PER_TOKEN); + let truncated = apply_context_budget(&mut messages, budget_bytes); + if truncated { + span.set_attribute(KeyValue::new("history_truncated", true)); + } + + // 7. Append the new user turn. + messages.push(ChatMessage::user(req.user_message.clone())); + + let insight_cx = parent_cx.with_span(span); + + // 8. Agentic loop — same shape as insight_generator's, but capped + // tighter and dispatching tools through the shared executor. + let loop_span = tracer.start_with_context("ai.chat.loop", &insight_cx); + let loop_cx = insight_cx.with_span(loop_span); + let mut tool_calls_made = 0usize; + let mut iterations_used = 0usize; + let mut last_prompt_eval_count: Option = None; + let mut last_eval_count: Option = None; + let mut final_content = String::new(); + + for iteration in 0..max_iterations { + iterations_used = iteration + 1; + log::info!("Chat iteration {}/{}", iterations_used, max_iterations); + + let (response, prompt_tokens, eval_tokens) = chat_backend + .chat_with_tools(messages.clone(), tools.clone()) + .await?; + last_prompt_eval_count = prompt_tokens; + last_eval_count = eval_tokens; + + // Ollama rejects non-object tool-call arguments on replay. 
+ let mut response = response; + if let Some(ref mut tcs) = response.tool_calls { + for tc in tcs.iter_mut() { + if !tc.function.arguments.is_object() { + tc.function.arguments = serde_json::Value::Object(Default::default()); + } + } + } + + messages.push(response.clone()); + + if let Some(ref tool_calls) = response.tool_calls + && !tool_calls.is_empty() + { + for tool_call in tool_calls { + tool_calls_made += 1; + log::info!( + "Chat tool call [{}]: {} {:?}", + iteration, + tool_call.function.name, + tool_call.function.arguments + ); + let result = self + .generator + .execute_tool( + &tool_call.function.name, + &tool_call.function.arguments, + &ollama_client, + &image_base64, + &normalized, + &loop_cx, + ) + .await; + messages.push(ChatMessage::tool_result(result)); + } + continue; + } + + final_content = response.content; + break; + } + + if final_content.is_empty() { + // The model never produced a final answer; ask once more without + // tools to force a textual reply. + log::info!( + "Chat loop exhausted after {} iterations, requesting final answer", + iterations_used + ); + messages.push(ChatMessage::user( + "Please write your final answer now without calling any more tools.", + )); + let (final_response, prompt_tokens, eval_tokens) = chat_backend + .chat_with_tools(messages.clone(), vec![]) + .await?; + last_prompt_eval_count = prompt_tokens; + last_eval_count = eval_tokens; + final_content = final_response.content.clone(); + messages.push(final_response); + } + + loop_cx.span().set_status(Status::Ok); + + // 9. Persist. Append mode rewrites the JSON blob in place; amend + // mode regenerates the title and inserts a new insight row, + // relying on store_insight to flip prior rows' is_current=false. 
+ let json = serde_json::to_string(&messages) + .map_err(|e| anyhow!("failed to serialize chat history: {}", e))?; + + let mut amended_insight_id: Option = None; + if req.amend { + let title_prompt = format!( + "Create a short title (maximum 8 words) for the following journal entry:\n\n{}\n\n\ + Capture the key moment or theme. Return ONLY the title, nothing else.", + final_content + ); + let title_raw = chat_backend + .generate( + &title_prompt, + Some( + "You are my long term memory assistant. Use only the information provided. Do not invent details.", + ), + None, + ) + .await?; + let title = title_raw.trim().trim_matches('"').to_string(); + + let new_row = InsertPhotoInsight { + library_id: req.library_id, + file_path: normalized.clone(), + title, + summary: final_content.clone(), + generated_at: Utc::now().timestamp(), + model_version: model_used.clone(), + is_current: true, + training_messages: Some(json), + backend: effective_backend.clone(), + }; + let cx = opentelemetry::Context::new(); + let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao"); + let stored = dao + .store_insight(&cx, new_row) + .map_err(|e| anyhow!("failed to store amended insight: {:?}", e))?; + amended_insight_id = Some(stored.id); + } else { + let cx = opentelemetry::Context::new(); + let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao"); + dao.update_training_messages(&cx, req.library_id, &normalized, &json) + .map_err(|e| anyhow!("failed to persist chat history: {:?}", e))?; + } + + Ok(ChatTurnResult { + assistant_message: final_content, + tool_calls_made, + iterations_used, + truncated, + prompt_eval_count: last_prompt_eval_count, + eval_count: last_eval_count, + amended_insight_id, + backend_used: effective_backend, + model_used, + }) + } +} + +/// Read AGENTIC_CHAT_MAX_ITERATIONS once per call. 
Cheap; keeps the code +/// free of static globals and lets the operator change the cap by env without +/// a restart in test harnesses (the running server still caches via Default). +fn env_max_iterations() -> usize { + std::env::var("AGENTIC_CHAT_MAX_ITERATIONS") + .ok() + .and_then(|s| s.parse::().ok()) + .unwrap_or(DEFAULT_MAX_ITERATIONS) + .max(1) +} + +/// View returned to clients for chat-UI rendering. +#[derive(Debug)] +pub struct HistoryView { + pub messages: Vec, + pub turn_count: usize, + pub model_version: String, + pub backend: String, +} + +#[derive(Debug)] +pub struct RenderedMessage { + pub role: String, + pub content: String, + pub is_initial: bool, +} + +/// Trim history to fit within `budget_bytes` of serialized JSON. Preserves +/// the system message and the first user message (with its base64 images +/// intact, since dropping those would invalidate the model's prior visual +/// reasoning). Drops the oldest assistant-tool_call + corresponding +/// tool-result pair on each pass until the budget is met or only the +/// preserved prefix remains. +/// +/// Returns true when at least one message was dropped. +pub(crate) fn apply_context_budget(messages: &mut Vec, budget_bytes: usize) -> bool { + if budget_bytes == 0 { + return false; + } + if estimate_bytes(messages) <= budget_bytes { + return false; + } + + // Find the index past the protected prefix: system messages + the first + // user message. Everything after is droppable in pairs. + let first_user_idx = messages.iter().position(|m| m.role == "user"); + let preserve_through = match first_user_idx { + Some(i) => i, // keep [0..=i] + None => return false, + }; + + let mut dropped_any = false; + loop { + if estimate_bytes(messages) <= budget_bytes { + break; + } + // Find the oldest assistant-with-tool_calls strictly after the + // preserved prefix. Drop it together with the following tool turn(s) + // until we hit the next assistant or user turn. 
+ let drop_start = (preserve_through + 1..messages.len()).find(|&i| { + let m = &messages[i]; + m.role == "assistant" + && m.tool_calls + .as_ref() + .map(|c| !c.is_empty()) + .unwrap_or(false) + }); + let Some(start) = drop_start else { break }; + // Determine end: drop the assistant turn plus any contiguous tool + // result turns that follow. + let mut end = start + 1; + while end < messages.len() && messages[end].role == "tool" { + end += 1; + } + // Stop if dropping these would leave the just-appended user turn at + // the end alone with no preceding context — we still want it kept. + if end > messages.len() { + break; + } + messages.drain(start..end); + dropped_any = true; + } + + dropped_any +} + +fn estimate_bytes(messages: &[ChatMessage]) -> usize { + serde_json::to_string(messages) + .map(|s| s.len()) + .unwrap_or(0) +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::ai::llm_client::{ToolCall, ToolCallFunction}; + + fn assistant_with_tool_call(name: &str) -> ChatMessage { + ChatMessage { + role: "assistant".to_string(), + content: String::new(), + tool_calls: Some(vec![ToolCall { + id: None, + function: ToolCallFunction { + name: name.to_string(), + arguments: serde_json::Value::Object(Default::default()), + }, + }]), + images: None, + } + } + + fn assistant_text(text: &str) -> ChatMessage { + ChatMessage { + role: "assistant".to_string(), + content: text.to_string(), + tool_calls: None, + images: None, + } + } + + #[test] + fn truncation_preserves_system_and_first_user() { + let mut msgs = vec![ + ChatMessage::system("sys"), + ChatMessage::user("first user with lots of context".repeat(50)), + assistant_with_tool_call("get_x"), + ChatMessage::tool_result("x result ".repeat(200)), + assistant_with_tool_call("get_y"), + ChatMessage::tool_result("y result ".repeat(200)), + assistant_text("final answer"), + ]; + let original_len = msgs.len(); + let dropped = apply_context_budget(&mut msgs, 500); + assert!(dropped, "should drop something at this small 
budget"); + assert!(msgs.len() < original_len); + // First two messages preserved. + assert_eq!(msgs[0].role, "system"); + assert_eq!(msgs[1].role, "user"); + } + + #[test] + fn truncation_no_op_when_under_budget() { + let mut msgs = vec![ChatMessage::system("s"), ChatMessage::user("u")]; + let dropped = apply_context_budget(&mut msgs, 1_000_000); + assert!(!dropped); + assert_eq!(msgs.len(), 2); + } + + #[test] + fn truncation_returns_false_with_no_droppable_pairs() { + // Only system + user, no tool-call turns to drop. + let mut msgs = vec![ChatMessage::system("s"), ChatMessage::user("u")]; + let dropped = apply_context_budget(&mut msgs, 1); + assert!(!dropped); + } +} diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index fcf89ee..09fd585 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -96,7 +96,7 @@ impl InsightGenerator { /// first root under which the file exists. Insights may be generated /// for any library — the generator itself doesn't know which — so we /// probe each root rather than trust a single `base_path`. 
- fn resolve_full_path(&self, rel_path: &str) -> Option { + pub(crate) fn resolve_full_path(&self, rel_path: &str) -> Option { use std::path::Path; for lib in &self.libraries { let candidate = Path::new(&lib.root_path).join(rel_path); @@ -129,7 +129,7 @@ impl InsightGenerator { /// Load image file, resize it, and encode as base64 for vision models /// Resizes to max 1024px on longest edge to reduce context usage - fn load_image_as_base64(&self, file_path: &str) -> Result { + pub(crate) fn load_image_as_base64(&self, file_path: &str) -> Result { use image::imageops::FilterType; let full_path = self.resolve_full_path(file_path).ok_or_else(|| { @@ -1411,7 +1411,7 @@ Return ONLY the summary, nothing else."#, // ── Tool executors for agentic loop ──────────────────────────────── /// Dispatch a tool call to the appropriate executor - async fn execute_tool( + pub(crate) async fn execute_tool( &self, tool_name: &str, arguments: &serde_json::Value, @@ -2136,7 +2136,7 @@ Return ONLY the summary, nothing else."#, // ── Agentic insight generation ────────────────────────────────────── /// Build the list of tool definitions for the agentic loop - fn build_tool_definitions(has_vision: bool) -> Vec { + pub(crate) fn build_tool_definitions(has_vision: bool) -> Vec { let mut tools = vec![ Tool::function( "search_rag", diff --git a/src/ai/mod.rs b/src/ai/mod.rs index 60a3f43..93735c0 100644 --- a/src/ai/mod.rs +++ b/src/ai/mod.rs @@ -1,5 +1,6 @@ pub mod daily_summary_job; pub mod handlers; +pub mod insight_chat; pub mod insight_generator; pub mod llm_client; pub mod ollama; @@ -10,9 +11,10 @@ pub mod sms_client; #[allow(unused_imports)] pub use daily_summary_job::{generate_daily_summaries, strip_summary_boilerplate}; pub use handlers::{ - delete_insight_handler, export_training_data_handler, generate_agentic_insight_handler, - generate_insight_handler, get_all_insights_handler, get_available_models_handler, - get_insight_handler, get_openrouter_models_handler, rate_insight_handler, 
+ chat_history_handler, chat_turn_handler, delete_insight_handler, export_training_data_handler, + generate_agentic_insight_handler, generate_insight_handler, get_all_insights_handler, + get_available_models_handler, get_insight_handler, get_openrouter_models_handler, + rate_insight_handler, }; pub use insight_generator::InsightGenerator; #[allow(unused_imports)] diff --git a/src/database/insights_dao.rs b/src/database/insights_dao.rs index 553b579..34c1d12 100644 --- a/src/database/insights_dao.rs +++ b/src/database/insights_dao.rs @@ -60,6 +60,17 @@ pub trait InsightDao: Sync + Send { &mut self, context: &opentelemetry::Context, ) -> Result, DbError>; + + /// Replace the `training_messages` JSON blob on the current row for + /// `(library_id, rel_path)`. Used by chat-turn append mode to persist + /// the extended conversation without inserting a new insight version. + fn update_training_messages( + &mut self, + context: &opentelemetry::Context, + library_id: i32, + file_path: &str, + training_messages_json: &str, + ) -> Result<(), DbError>; } pub struct SqliteInsightDao { @@ -265,4 +276,30 @@ impl InsightDao for SqliteInsightDao { }) .map_err(|_| DbError::new(DbErrorKind::QueryError)) } + + fn update_training_messages( + &mut self, + context: &opentelemetry::Context, + lib_id: i32, + path: &str, + training_messages_json: &str, + ) -> Result<(), DbError> { + trace_db_call(context, "update", "update_training_messages", |_span| { + use schema::photo_insights::dsl::*; + + let mut connection = self.connection.lock().expect("Unable to get InsightDao"); + + diesel::update( + photo_insights + .filter(library_id.eq(lib_id)) + .filter(rel_path.eq(path)) + .filter(is_current.eq(true)), + ) + .set(training_messages.eq(Some(training_messages_json.to_string()))) + .execute(connection.deref_mut()) + .map(|_| ()) + .map_err(|_| anyhow::anyhow!("Update error")) + }) + .map_err(|_| DbError::new(DbErrorKind::UpdateError)) + } } diff --git a/src/main.rs b/src/main.rs index 
2deee7e..53ab607 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1356,6 +1356,8 @@ fn main() -> std::io::Result<()> { .service(ai::get_all_insights_handler) .service(ai::get_available_models_handler) .service(ai::get_openrouter_models_handler) + .service(ai::chat_turn_handler) + .service(ai::chat_history_handler) .service(ai::rate_insight_handler) .service(ai::export_training_data_handler) .service(libraries::list_libraries) diff --git a/src/state.rs b/src/state.rs index dd8628a..8e13d28 100644 --- a/src/state.rs +++ b/src/state.rs @@ -1,3 +1,4 @@ +use crate::ai::insight_chat::{ChatLockMap, InsightChatService}; use crate::ai::openrouter::OpenRouterClient; use crate::ai::{InsightGenerator, OllamaClient, SmsApiClient}; use crate::database::{ @@ -44,6 +45,8 @@ pub struct AppState { pub openrouter_allowed_models: Vec, pub sms_client: SmsApiClient, pub insight_generator: InsightGenerator, + /// Chat continuation service. Hold an Arc so handlers can clone cheaply. + pub insight_chat: Arc, } impl AppState { @@ -76,6 +79,7 @@ impl AppState { openrouter_allowed_models: Vec, sms_client: SmsApiClient, insight_generator: InsightGenerator, + insight_chat: Arc, preview_dao: Arc>>, ) -> Self { assert!( @@ -109,6 +113,7 @@ impl AppState { openrouter_allowed_models, sms_client, insight_generator, + insight_chat, } } @@ -199,6 +204,18 @@ impl Default for AppState { libraries_vec.clone(), ); + // Chat continuation reuses the generator for tool dispatch + image + // loading. The lock map starts empty and grows lazily per file. 
+ let chat_locks: ChatLockMap = + Arc::new(tokio::sync::Mutex::new(std::collections::HashMap::new())); + let insight_chat = Arc::new(InsightChatService::new( + Arc::new(insight_generator.clone()), + ollama.clone(), + openrouter.clone(), + insight_dao.clone(), + chat_locks, + )); + // Ensure preview clips directory exists let preview_clips_path = env::var("PREVIEW_CLIPS_DIRECTORY").unwrap_or_else(|_| "preview_clips".to_string()); @@ -218,6 +235,7 @@ impl Default for AppState { openrouter_allowed_models, sms_client, insight_generator, + insight_chat, preview_dao, ) } @@ -320,6 +338,16 @@ impl AppState { vec![test_lib], ); + let chat_locks: ChatLockMap = + Arc::new(tokio::sync::Mutex::new(std::collections::HashMap::new())); + let insight_chat = Arc::new(InsightChatService::new( + Arc::new(insight_generator.clone()), + ollama.clone(), + None, + insight_dao.clone(), + chat_locks, + )); + // Initialize test preview DAO let preview_dao: Arc>> = Arc::new(Mutex::new(Box::new(SqlitePreviewDao::new()))); @@ -343,6 +371,7 @@ impl AppState { Vec::new(), sms_client, insight_generator, + insight_chat, preview_dao, ) } -- 2.49.1 From 65ab10e9a867cf872cc6f485f2dc41a86f5f6e5c Mon Sep 17 00:00:00 2001 From: Cameron Date: Tue, 21 Apr 2026 15:16:32 -0400 Subject: [PATCH 06/24] feat(ai): chat rewind + ollama metrics logging Rewind: POST /insights/chat/rewind truncates training_messages at a given rendered index, dropping the target message plus any preceding tool-call scaffolding. The initial user prompt is protected. Metrics: log prompt_eval_count/duration and eval_count/duration from every Ollama chat response, rendered as tokens + ms + tok/s. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ai/handlers.rs | 57 ++++++++++++++++ src/ai/insight_chat.rs | 145 +++++++++++++++++++++++++++++++++++++++++ src/ai/mod.rs | 8 +-- src/ai/ollama.rs | 63 ++++++++++++++++++ src/main.rs | 1 + 5 files changed, 270 insertions(+), 4 deletions(-) diff --git a/src/ai/handlers.rs b/src/ai/handlers.rs index b07b157..9c664b6 100644 --- a/src/ai/handlers.rs +++ b/src/ai/handlers.rs @@ -706,6 +706,63 @@ pub struct RenderedHistoryMessage { pub is_initial: bool, } +#[derive(Debug, Deserialize)] +pub struct ChatRewindHttpRequest { + pub file_path: String, + #[serde(default)] + pub library: Option, + /// 0-based index into the rendered transcript. The message at this + /// index, and everything after it, is discarded. Must be > 0 — the + /// initial user message is protected. + pub discard_from_rendered_index: usize, +} + +/// POST /insights/chat/rewind — truncate the stored conversation so the +/// rendered message at `discard_from_rendered_index` (and everything after) +/// is removed. Use when a user wants to retry a turn with a different +/// prompt without prior replies poisoning context. 
+#[post("/insights/chat/rewind")] +pub async fn chat_rewind_handler( + _claims: Claims, + request: web::Json, + app_state: web::Data, +) -> impl Responder { + let library = match libraries::resolve_library_param(&app_state, request.library.as_deref()) { + Ok(Some(lib)) => lib, + Ok(None) => app_state.primary_library(), + Err(e) => { + return HttpResponse::BadRequest().json(serde_json::json!({ + "error": format!("invalid library: {}", e) + })); + } + }; + + match app_state + .insight_chat + .rewind_history( + library.id, + &request.file_path, + request.discard_from_rendered_index, + ) + .await + { + Ok(()) => HttpResponse::Ok().json(serde_json::json!({ "success": true })), + Err(e) => { + let msg = format!("{}", e); + log::error!("Chat rewind failed: {}", msg); + if msg.contains("no insight found") { + HttpResponse::NotFound().json(serde_json::json!({ "error": msg })) + } else if msg.contains("no chat history") { + HttpResponse::Conflict().json(serde_json::json!({ "error": msg })) + } else if msg.contains("cannot discard the initial") || msg.contains("out of range") { + HttpResponse::BadRequest().json(serde_json::json!({ "error": msg })) + } else { + HttpResponse::InternalServerError().json(serde_json::json!({ "error": msg })) + } + } + } +} + /// GET /insights/chat/history — return the rendered transcript for a photo. #[get("/insights/chat/history")] pub async fn chat_history_handler( diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index 7e5422c..41411d3 100644 --- a/src/ai/insight_chat.rs +++ b/src/ai/insight_chat.rs @@ -479,6 +479,109 @@ impl InsightChatService { model_used, }) } + + /// Truncate the stored conversation so the rendered message at + /// `discard_from_rendered_index` (and everything after it — including + /// the tool-call scaffolding that produced a discarded assistant reply) + /// is removed. The initial user turn cannot be discarded; attempting to + /// do so returns an error. 
+ /// + /// Holds the per-file chat mutex so it serialises with `chat_turn`. + pub async fn rewind_history( + &self, + library_id: i32, + file_path: &str, + discard_from_rendered_index: usize, + ) -> Result<()> { + if discard_from_rendered_index == 0 { + bail!("cannot discard the initial user message"); + } + let normalized = normalize_path(file_path); + + let lock_key = (library_id, normalized.clone()); + let entry_lock = { + let mut locks = self.chat_locks.lock().await; + locks + .entry(lock_key.clone()) + .or_insert_with(|| Arc::new(TokioMutex::new(()))) + .clone() + }; + let _guard = entry_lock.lock().await; + + let insight = { + let cx = opentelemetry::Context::new(); + let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao"); + dao.get_insight(&cx, &normalized) + .map_err(|e| anyhow!("failed to load insight: {:?}", e))? + .ok_or_else(|| anyhow!("no insight found for path"))? + }; + let raw_history = insight + .training_messages + .as_ref() + .ok_or_else(|| anyhow!("insight has no chat history"))?; + let messages: Vec = serde_json::from_str(raw_history) + .map_err(|e| anyhow!("failed to deserialize chat history: {}", e))?; + + let cut_at = find_raw_cut(&messages, discard_from_rendered_index) + .ok_or_else(|| anyhow!("discard_from_rendered_index out of range"))?; + + let truncated = &messages[..cut_at]; + let json = serde_json::to_string(truncated) + .map_err(|e| anyhow!("failed to serialize truncated history: {}", e))?; + + let cx = opentelemetry::Context::new(); + let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao"); + dao.update_training_messages(&cx, library_id, &normalized, &json) + .map_err(|e| anyhow!("failed to persist truncated history: {:?}", e))?; + Ok(()) + } +} + +/// Is this raw message visible in the rendered transcript? Must match +/// `load_history`'s filter exactly — `find_raw_cut` depends on it to map +/// rendered indices back to raw positions. 
+fn is_rendered(m: &ChatMessage) -> bool { + match m.role.as_str() { + "user" => true, + "assistant" => { + let has_tool_calls = m + .tool_calls + .as_ref() + .map(|c| !c.is_empty()) + .unwrap_or(false); + !(has_tool_calls && m.content.trim().is_empty()) + } + _ => false, + } +} + +/// Given a rendered index to start discarding from, find the raw index at +/// which to truncate. The cut position is the raw length after all prior +/// rendered messages — which also strips any tool-call scaffolding that +/// immediately precedes the discarded rendered message. Returns `None` if +/// `discard_from_rendered_index` is past the end of the rendered view. +pub(crate) fn find_raw_cut( + messages: &[ChatMessage], + discard_from_rendered_index: usize, +) -> Option { + let mut rendered_count = 0usize; + let mut last_kept_raw_end = 0usize; + for (i, m) in messages.iter().enumerate() { + if !is_rendered(m) { + continue; + } + if rendered_count == discard_from_rendered_index { + return Some(last_kept_raw_end); + } + rendered_count += 1; + last_kept_raw_end = i + 1; + } + if rendered_count == discard_from_rendered_index { + // Discarding past the last rendered message is a no-op, but we + // surface it as "nothing to cut" rather than silent success. + return None; + } + None } /// Read AGENTIC_CHAT_MAX_ITERATIONS once per call. Cheap; keeps the code @@ -637,4 +740,46 @@ mod tests { let dropped = apply_context_budget(&mut msgs, 1); assert!(!dropped); } + + #[test] + fn rewind_strips_assistant_and_tool_scaffolding() { + // Rendered: [user1, asst1, user2, asst2] → cut at rendered index 3 + // (the final asst2) should drop the tool-call scaffolding + asst2, + // leaving raw up through user2. 
+ let msgs = vec![ + ChatMessage::system("sys"), + ChatMessage::user("q1"), + assistant_text("a1"), + ChatMessage::user("q2"), + assistant_with_tool_call("lookup"), + ChatMessage::tool_result("data"), + assistant_text("a2 final"), + ]; + let cut = find_raw_cut(&msgs, 3).expect("cut found"); + // raw[0..cut] should end at user("q2") — indices 0..=3. + assert_eq!(cut, 4); + assert_eq!(msgs[cut - 1].role, "user"); + assert_eq!(msgs[cut - 1].content, "q2"); + } + + #[test] + fn rewind_at_second_rendered_cuts_after_first_user() { + // Rendered index 1 = the first assistant reply → dropping it should + // leave just the initial user message. + let msgs = vec![ + ChatMessage::system("s"), + ChatMessage::user("q1"), + assistant_with_tool_call("tool"), + ChatMessage::tool_result("r"), + assistant_text("a1"), + ]; + let cut = find_raw_cut(&msgs, 1).expect("cut found"); + assert_eq!(cut, 2); // sys + user("q1") + } + + #[test] + fn rewind_beyond_range_returns_none() { + let msgs = vec![ChatMessage::user("q1"), assistant_text("a1")]; + assert!(find_raw_cut(&msgs, 5).is_none()); + } } diff --git a/src/ai/mod.rs b/src/ai/mod.rs index 93735c0..3c58b2a 100644 --- a/src/ai/mod.rs +++ b/src/ai/mod.rs @@ -11,10 +11,10 @@ pub mod sms_client; #[allow(unused_imports)] pub use daily_summary_job::{generate_daily_summaries, strip_summary_boilerplate}; pub use handlers::{ - chat_history_handler, chat_turn_handler, delete_insight_handler, export_training_data_handler, - generate_agentic_insight_handler, generate_insight_handler, get_all_insights_handler, - get_available_models_handler, get_insight_handler, get_openrouter_models_handler, - rate_insight_handler, + chat_history_handler, chat_rewind_handler, chat_turn_handler, delete_insight_handler, + export_training_data_handler, generate_agentic_insight_handler, generate_insight_handler, + get_all_insights_handler, get_available_models_handler, get_insight_handler, + get_openrouter_models_handler, rate_insight_handler, }; pub use 
insight_generator::InsightGenerator; #[allow(unused_imports)] diff --git a/src/ai/ollama.rs b/src/ai/ollama.rs index 2cc2cfa..8c487c5 100644 --- a/src/ai/ollama.rs +++ b/src/ai/ollama.rs @@ -691,6 +691,17 @@ Analyze the image and use specific details from both the visual content and the .await .with_context(|| "Failed to parse Ollama chat response")?; + // Log performance counters returned by Ollama. Durations are + // reported in nanoseconds; we render ms + tokens/sec for skim-ability + // in the server log. Missing fields are left off the line rather + // than printed as `None`. + log_chat_metrics( + chat_response.prompt_eval_count, + chat_response.prompt_eval_duration, + chat_response.eval_count, + chat_response.eval_duration, + ); + Ok(( chat_response.message, chat_response.prompt_eval_count, @@ -915,8 +926,14 @@ struct OllamaChatResponse { done_reason: String, #[serde(default)] prompt_eval_count: Option, + /// Nanoseconds spent evaluating the prompt (context ingestion). + #[serde(default)] + prompt_eval_duration: Option, #[serde(default)] eval_count: Option, + /// Nanoseconds spent generating the response tokens. + #[serde(default)] + eval_duration: Option, } #[derive(Deserialize)] @@ -924,6 +941,52 @@ struct OllamaResponse { response: String, } +fn log_chat_metrics( + prompt_eval_count: Option, + prompt_eval_duration_ns: Option, + eval_count: Option, + eval_duration_ns: Option, +) { + // Compute tokens/sec when both count and duration are present. 
+ fn tokens_per_sec(count: Option, duration_ns: Option) -> Option { + match (count, duration_ns) { + (Some(c), Some(d)) if c > 0 && d > 0 => Some((c as f64) * 1_000_000_000.0 / (d as f64)), + _ => None, + } + } + let prompt_ms = prompt_eval_duration_ns.map(|ns| ns as f64 / 1_000_000.0); + let eval_ms = eval_duration_ns.map(|ns| ns as f64 / 1_000_000.0); + let prompt_tps = tokens_per_sec(prompt_eval_count, prompt_eval_duration_ns); + let eval_tps = tokens_per_sec(eval_count, eval_duration_ns); + + let mut parts: Vec = Vec::new(); + if let Some(c) = prompt_eval_count { + let mut s = format!("prompt={} tok", c); + if let Some(ms) = prompt_ms { + s.push_str(&format!(" ({:.0} ms", ms)); + if let Some(tps) = prompt_tps { + s.push_str(&format!(", {:.1} tok/s", tps)); + } + s.push(')'); + } + parts.push(s); + } + if let Some(c) = eval_count { + let mut s = format!("gen={} tok", c); + if let Some(ms) = eval_ms { + s.push_str(&format!(" ({:.0} ms", ms)); + if let Some(tps) = eval_tps { + s.push_str(&format!(", {:.1} tok/s", tps)); + } + s.push(')'); + } + parts.push(s); + } + if !parts.is_empty() { + log::info!("Ollama chat metrics — {}", parts.join(", ")); + } +} + #[derive(Deserialize)] struct OllamaTagsResponse { models: Vec, diff --git a/src/main.rs b/src/main.rs index 53ab607..8be397f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1358,6 +1358,7 @@ fn main() -> std::io::Result<()> { .service(ai::get_openrouter_models_handler) .service(ai::chat_turn_handler) .service(ai::chat_history_handler) + .service(ai::chat_rewind_handler) .service(ai::rate_insight_handler) .service(ai::export_training_data_handler) .service(libraries::list_libraries) -- 2.49.1 From c2bd3c08e197f2932476e8934fcba990b03fdd90 Mon Sep 17 00:00:00 2001 From: Cameron Date: Tue, 21 Apr 2026 16:03:53 -0400 Subject: [PATCH 07/24] feat(ai): surface tool invocations in chat history load_history now groups preceding tool_call + tool_result scaffolding under each assistant reply as `tools: [{name, arguments, 
result}]`. Result bodies over 2000 chars are truncated for payload size with a `result_truncated` flag; the full value remains in training_messages. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ai/handlers.rs | 21 +++++++++++ src/ai/insight_chat.rs | 82 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 102 insertions(+), 1 deletion(-) diff --git a/src/ai/handlers.rs b/src/ai/handlers.rs index 9c664b6..6bcb592 100644 --- a/src/ai/handlers.rs +++ b/src/ai/handlers.rs @@ -704,6 +704,17 @@ pub struct RenderedHistoryMessage { pub role: String, pub content: String, pub is_initial: bool, + #[serde(skip_serializing_if = "Vec::is_empty")] + pub tools: Vec, +} + +#[derive(Debug, Serialize)] +pub struct HistoryToolInvocation { + pub name: String, + pub arguments: serde_json::Value, + pub result: String, + #[serde(skip_serializing_if = "std::ops::Not::not")] + pub result_truncated: bool, } #[derive(Debug, Deserialize)] @@ -787,6 +798,16 @@ pub async fn chat_history_handler( role: m.role, content: m.content, is_initial: m.is_initial, + tools: m + .tools + .into_iter() + .map(|t| HistoryToolInvocation { + name: t.name, + arguments: t.arguments, + result: t.result, + result_truncated: t.result_truncated, + }) + .collect(), }) .collect(), turn_count: view.turn_count, diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index 41411d3..2cc1da7 100644 --- a/src/ai/insight_chat.rs +++ b/src/ai/insight_chat.rs @@ -115,10 +115,39 @@ impl InsightChatService { let mut rendered = Vec::new(); let mut user_turns_seen = 0usize; let mut assistant_turns_seen = 0usize; + + // Accumulate tool invocations seen since the last user turn. An + // invocation is: one assistant tool_call message (which may hold + // multiple calls) + the N following tool-role messages (one per call, + // in order). They attach to the next assistant-with-content, which + // is the "final" reply for the current turn. 
+ // + // Wire shape from the model: + // assistant { tool_calls: [A, B], content: "" } + // tool { content: "result of A" } + // tool { content: "result of B" } + // assistant { content: "here's the answer" } ← rendered as final + let mut pending_tools: Vec = Vec::new(); + // Queue of (name, arguments) awaiting a tool_result to pair with. + let mut pending_calls: std::collections::VecDeque<(String, serde_json::Value)> = + std::collections::VecDeque::new(); + for msg in &messages { match msg.role.as_str() { "system" => continue, - "tool" => continue, + "tool" => { + if let Some((name, arguments)) = pending_calls.pop_front() { + let (result, result_truncated) = truncate_tool_result(&msg.content); + pending_tools.push(ToolInvocation { + name, + arguments, + result, + result_truncated, + }); + } + // If there's no pending call, the tool message is an + // orphan (shouldn't happen in practice) — skip silently. + } "assistant" => { let has_tool_calls = msg .tool_calls @@ -126,22 +155,41 @@ impl InsightChatService { .map(|c| !c.is_empty()) .unwrap_or(false); if has_tool_calls && msg.content.trim().is_empty() { + // Tool-dispatch turn: enqueue calls, wait for tool + // results on subsequent messages. + if let Some(ref tcs) = msg.tool_calls { + for tc in tcs { + pending_calls.push_back(( + tc.function.name.clone(), + tc.function.arguments.clone(), + )); + } + } continue; } + // Final assistant reply for this turn — drain accumulated + // tools into it. assistant_turns_seen += 1; + let tools = std::mem::take(&mut pending_tools); + pending_calls.clear(); // any leftover unpaired calls are dropped rendered.push(RenderedMessage { role: "assistant".to_string(), content: msg.content.clone(), is_initial: false, + tools, }); } "user" => { let is_initial = user_turns_seen == 0; user_turns_seen += 1; + // New user turn resets any in-flight tool state. 
+ pending_tools.clear(); + pending_calls.clear(); rendered.push(RenderedMessage { role: "user".to_string(), content: msg.content.clone(), is_initial, + tools: Vec::new(), }); } _ => continue, @@ -609,6 +657,38 @@ pub struct RenderedMessage { pub role: String, pub content: String, pub is_initial: bool, + /// Tools invoked during this turn (only populated for assistant replies). + /// Empty for user messages and for assistant replies that didn't involve + /// tool calls. + pub tools: Vec, +} + +#[derive(Debug, Clone)] +pub struct ToolInvocation { + pub name: String, + pub arguments: serde_json::Value, + pub result: String, + /// True when `result` was trimmed for payload size. Full value remains + /// available in the raw training_messages blob. + pub result_truncated: bool, +} + +/// Soft cap for tool-result bodies returned via the history API. Keeps +/// payloads small for the mobile client — verbose SMS / geocoding responses +/// don't need to ship in full for inspection. +const TOOL_RESULT_PREVIEW_MAX: usize = 2000; + +fn truncate_tool_result(s: &str) -> (String, bool) { + if s.len() <= TOOL_RESULT_PREVIEW_MAX { + (s.to_string(), false) + } else { + // Cut on a char boundary. + let mut cut = TOOL_RESULT_PREVIEW_MAX; + while !s.is_char_boundary(cut) && cut > 0 { + cut -= 1; + } + (s[..cut].to_string(), true) + } } /// Trim history to fit within `budget_bytes` of serialized JSON. Preserves -- 2.49.1 From 079cd4c5b95f6e0ad422eeaee0bb0ee16ed289f0 Mon Sep 17 00:00:00 2001 From: Cameron Date: Tue, 21 Apr 2026 16:57:41 -0400 Subject: [PATCH 08/24] feat(ai): streaming chat endpoint with live tool events Add LlmClient::chat_with_tools_stream and SSE endpoint POST /insights/chat/stream that emits text deltas, tool_call / tool_result pairs, truncated notice, and a terminal done frame as the agentic loop runs. - Ollama: parses NDJSON from /api/chat stream, accumulates content deltas, emits Done with tool_calls from the final chunk. 
- OpenRouter: parses OpenAI-compatible SSE, reassembles tool_call argument deltas by index, asks for stream_options.include_usage. - InsightChatService spawns the loop on a tokio task, feeds events through an mpsc channel, persists training_messages at the end. Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 40 ++++ Cargo.toml | 5 +- src/ai/handlers.rs | 108 +++++++++- src/ai/insight_chat.rs | 439 ++++++++++++++++++++++++++++++++++++++++- src/ai/llm_client.rs | 32 +++ src/ai/mod.rs | 8 +- src/ai/ollama.rs | 208 ++++++++++++++++++- src/ai/openrouter.rs | 239 +++++++++++++++++++++- src/main.rs | 1 + 9 files changed, 1071 insertions(+), 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 2e210ea..a404abc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -486,6 +486,28 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7c02d123df017efcdfbd739ef81735b36c5ba83ec3c59c80a9d7ecc718f92e50" +[[package]] +name = "async-stream" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b5a71a6f37880a80d1d7f19efd781e4b5de42c88f0722cc13bcb6cc2cfe8476" +dependencies = [ + "async-stream-impl", + "futures-core", + "pin-project-lite", +] + +[[package]] +name = "async-stream-impl" +version = "0.3.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7c24de15d275a1ecfd47a380fb4d5ec9bfe0933f309ed5e705b775596a3574d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "async-trait" version = "0.1.89" @@ -1843,10 +1865,12 @@ dependencies = [ "actix-web", "actix-web-prom", "anyhow", + "async-stream", "async-trait", "base64", "bcrypt", "blake3", + "bytes", "chrono", "clap", "diesel", @@ -1878,6 +1902,7 @@ dependencies = [ "serde_json", "tempfile", "tokio", + "tokio-util", "urlencoding", "walkdir", "zerocopy", @@ -3125,12 +3150,14 @@ dependencies = [ "sync_wrapper", "tokio", "tokio-native-tls", + "tokio-util", "tower", "tower-http", 
"tower-service", "url", "wasm-bindgen", "wasm-bindgen-futures", + "wasm-streams", "web-sys", ] @@ -4219,6 +4246,19 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + [[package]] name = "web-sys" version = "0.3.77" diff --git a/Cargo.toml b/Cargo.toml index be60128..2b966b7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -49,7 +49,10 @@ opentelemetry-appender-log = "0.31.0" tempfile = "3.20.0" regex = "1.11.1" exif = { package = "kamadak-exif", version = "0.6.1" } -reqwest = { version = "0.12", features = ["json"] } +reqwest = { version = "0.12", features = ["json", "stream"] } +async-stream = "0.3" +tokio-util = { version = "0.7", features = ["io"] } +bytes = "1" urlencoding = "2.1" zerocopy = "0.8" ical = "0.11" diff --git a/src/ai/handlers.rs b/src/ai/handlers.rs index 6bcb592..d24927f 100644 --- a/src/ai/handlers.rs +++ b/src/ai/handlers.rs @@ -3,7 +3,7 @@ use opentelemetry::KeyValue; use opentelemetry::trace::{Span, Status, Tracer}; use serde::{Deserialize, Serialize}; -use crate::ai::insight_chat::ChatTurnRequest; +use crate::ai::insight_chat::{ChatStreamEvent, ChatTurnRequest}; use crate::ai::{InsightGenerator, ModelCapabilities, OllamaClient}; use crate::data::Claims; use crate::database::{ExifDao, InsightDao}; @@ -826,3 +826,109 @@ pub async fn chat_history_handler( } } } + +/// POST /insights/chat/stream — streaming variant of /insights/chat. +/// Returns `text/event-stream` with one event per chat stream event. 
+#[post("/insights/chat/stream")] +pub async fn chat_stream_handler( + _claims: Claims, + request: web::Json, + app_state: web::Data, +) -> HttpResponse { + let library = match libraries::resolve_library_param(&app_state, request.library.as_deref()) { + Ok(Some(lib)) => lib, + Ok(None) => app_state.primary_library(), + Err(e) => { + return HttpResponse::BadRequest().json(serde_json::json!({ + "error": format!("invalid library: {}", e) + })); + } + }; + + let chat_req = ChatTurnRequest { + library_id: library.id, + file_path: request.file_path.clone(), + user_message: request.user_message.clone(), + model: request.model.clone(), + backend: request.backend.clone(), + num_ctx: request.num_ctx, + temperature: request.temperature, + top_p: request.top_p, + top_k: request.top_k, + min_p: request.min_p, + max_iterations: request.max_iterations, + amend: request.amend, + }; + + let service = app_state.insight_chat.clone(); + let events = service.chat_turn_stream(chat_req); + + // Map ChatStreamEvent → SSE frame bytes. 
+ let sse_stream = futures::stream::StreamExt::map(events, |ev| { + let frame = render_sse_frame(&ev); + Ok::<_, actix_web::Error>(actix_web::web::Bytes::from(frame)) + }); + + HttpResponse::Ok() + .content_type("text/event-stream") + .insert_header(("Cache-Control", "no-cache")) + .insert_header(("X-Accel-Buffering", "no")) // nginx: disable response buffering + .streaming(sse_stream) +} + +fn render_sse_frame(ev: &ChatStreamEvent) -> String { + let (event_name, payload) = match ev { + ChatStreamEvent::IterationStart { n, max } => { + ("iteration_start", serde_json::json!({ "n": n, "max": max })) + } + ChatStreamEvent::Truncated => ("truncated", serde_json::json!({})), + ChatStreamEvent::TextDelta(delta) => ("text", serde_json::json!({ "delta": delta })), + ChatStreamEvent::ToolCall { + index, + name, + arguments, + } => ( + "tool_call", + serde_json::json!({ "index": index, "name": name, "arguments": arguments }), + ), + ChatStreamEvent::ToolResult { + index, + name, + result, + result_truncated, + } => ( + "tool_result", + serde_json::json!({ + "index": index, + "name": name, + "result": result, + "result_truncated": result_truncated, + }), + ), + ChatStreamEvent::Done { + tool_calls_made, + iterations_used, + truncated, + prompt_eval_count, + eval_count, + amended_insight_id, + backend_used, + model_used, + } => ( + "done", + serde_json::json!({ + "tool_calls_made": tool_calls_made, + "iterations_used": iterations_used, + "truncated": truncated, + "prompt_eval_count": prompt_eval_count, + "eval_count": eval_count, + "amended_insight_id": amended_insight_id, + "backend": backend_used, + "model": model_used, + }), + ), + ChatStreamEvent::Error(msg) => ("error", serde_json::json!({ "message": msg })), + }; + let data = serde_json::to_string(&payload).unwrap_or_else(|_| "{}".to_string()); + format!("event: {}\ndata: {}\n\n", event_name, data) +} diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index 2cc1da7..a4d21ea 100644 --- a/src/ai/insight_chat.rs 
+++ b/src/ai/insight_chat.rs @@ -7,13 +7,14 @@ use std::sync::{Arc, Mutex}; use tokio::sync::Mutex as TokioMutex; use crate::ai::insight_generator::InsightGenerator; -use crate::ai::llm_client::{ChatMessage, LlmClient}; +use crate::ai::llm_client::{ChatMessage, LlmClient, LlmStreamEvent}; use crate::ai::ollama::OllamaClient; use crate::ai::openrouter::OpenRouterClient; use crate::database::InsightDao; use crate::database::models::InsertPhotoInsight; use crate::otel::global_tracer; use crate::utils::normalize_path; +use futures::stream::{BoxStream, StreamExt}; const DEFAULT_MAX_ITERATIONS: usize = 6; const DEFAULT_NUM_CTX: i32 = 8192; @@ -583,6 +584,442 @@ impl InsightChatService { .map_err(|e| anyhow!("failed to persist truncated history: {:?}", e))?; Ok(()) } + + /// Streaming variant of `chat_turn`. Emits user-facing events as the + /// conversation progresses: iteration starts, tool dispatch + result, + /// text deltas from the final assistant reply, and a terminal `Done` + /// frame. Persistence happens inside the stream after the loop ends. + /// + /// The stream takes ownership of the service via `Arc` (passed by + /// the caller) so it can live past the handler's await boundary. + pub fn chat_turn_stream( + self: Arc, + req: ChatTurnRequest, + ) -> BoxStream<'static, ChatStreamEvent> { + let svc = self; + let s = async_stream::stream! { + match svc.chat_turn_stream_inner(req, |ev| Ok(ev)).await { + Ok(mut rx) => { + while let Some(ev) = rx.recv().await { + yield ev; + } + } + Err(e) => { + yield ChatStreamEvent::Error(format!("{}", e)); + } + } + }; + Box::pin(s) + } + + /// Internal: drives the streaming loop on a background task, returning + /// a receiver the caller drains. Keeping the work on a spawned task + /// decouples the HTTP request lifetime from the chat execution, which + /// matters because the chat may run longer than any single network hop + /// and we want clean cancellation semantics via the channel close. 
+ async fn chat_turn_stream_inner( + self: Arc, + req: ChatTurnRequest, + _ev_mapper: F, + ) -> Result> + where + F: Fn(ChatStreamEvent) -> Result + Send + 'static, + { + let (tx, rx) = tokio::sync::mpsc::channel::(64); + let svc = self.clone(); + tokio::spawn(async move { + let result = svc.run_streaming_turn(req, tx.clone()).await; + if let Err(e) = result { + let _ = tx.send(ChatStreamEvent::Error(format!("{}", e))).await; + } + }); + Ok(rx) + } + + async fn run_streaming_turn( + self: Arc, + req: ChatTurnRequest, + tx: tokio::sync::mpsc::Sender, + ) -> Result<()> { + if req.user_message.trim().is_empty() { + bail!("user_message must not be empty"); + } + if req.user_message.len() > 8192 { + bail!("user_message exceeds 8192 chars"); + } + let normalized = normalize_path(&req.file_path); + + let lock_key = (req.library_id, normalized.clone()); + let entry_lock = { + let mut locks = self.chat_locks.lock().await; + locks + .entry(lock_key.clone()) + .or_insert_with(|| Arc::new(TokioMutex::new(()))) + .clone() + }; + let _guard = entry_lock.lock().await; + + let insight = { + let cx = opentelemetry::Context::new(); + let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao"); + dao.get_insight(&cx, &normalized) + .map_err(|e| anyhow!("failed to load insight: {:?}", e))? + .ok_or_else(|| anyhow!("no insight found for path"))? + }; + let raw_history = insight + .training_messages + .as_ref() + .ok_or_else(|| { + anyhow!("insight has no chat history; regenerate this insight in agentic mode") + })? + .clone(); + let mut messages: Vec = serde_json::from_str(&raw_history) + .map_err(|e| anyhow!("failed to deserialize chat history: {}", e))?; + + // Backend selection — same rules as non-streaming chat_turn. 
+ let stored_backend = insight.backend.clone(); + let effective_backend = req + .backend + .as_deref() + .map(|s| s.trim().to_lowercase()) + .filter(|s| !s.is_empty()) + .unwrap_or_else(|| stored_backend.clone()); + if !matches!(effective_backend.as_str(), "local" | "hybrid") { + bail!( + "unknown backend '{}'; expected 'local' or 'hybrid'", + effective_backend + ); + } + if stored_backend == "local" && effective_backend == "hybrid" { + bail!( + "switching from local to hybrid mid-chat isn't supported yet; \ + regenerate the insight in hybrid mode if you want OpenRouter chat" + ); + } + let is_hybrid = effective_backend == "hybrid"; + + let max_iterations = req + .max_iterations + .unwrap_or(DEFAULT_MAX_ITERATIONS) + .clamp(1, env_max_iterations()); + + let stored_model = insight.model_version.clone(); + let custom_model = req + .model + .clone() + .or_else(|| Some(stored_model.clone())) + .filter(|m| !m.is_empty()); + + let mut ollama_client = self.ollama.clone(); + let mut openrouter_client: Option = None; + + if is_hybrid { + let arc = self.openrouter.as_ref().ok_or_else(|| { + anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured") + })?; + let mut c: OpenRouterClient = (**arc).clone(); + if let Some(ref m) = custom_model { + c.primary_model = m.clone(); + } + if req.temperature.is_some() + || req.top_p.is_some() + || req.top_k.is_some() + || req.min_p.is_some() + { + c.set_sampling_params(req.temperature, req.top_p, req.top_k, req.min_p); + } + if let Some(ctx) = req.num_ctx { + c.set_num_ctx(Some(ctx)); + } + openrouter_client = Some(c); + } else { + if let Some(ref m) = custom_model + && m != &self.ollama.primary_model + { + ollama_client = OllamaClient::new( + self.ollama.primary_url.clone(), + self.ollama.fallback_url.clone(), + m.clone(), + Some(m.clone()), + ); + } + if req.temperature.is_some() + || req.top_p.is_some() + || req.top_k.is_some() + || req.min_p.is_some() + { + ollama_client.set_sampling_params(req.temperature, req.top_p, 
req.top_k, req.min_p); + } + if let Some(ctx) = req.num_ctx { + ollama_client.set_num_ctx(Some(ctx)); + } + } + + let chat_backend: &dyn LlmClient = if let Some(ref c) = openrouter_client { + c + } else { + &ollama_client + }; + let model_used = chat_backend.primary_model().to_string(); + + // Tool set. + let local_first_user_has_image = messages + .iter() + .find(|m| m.role == "user") + .and_then(|m| m.images.as_ref()) + .map(|imgs| !imgs.is_empty()) + .unwrap_or(false); + let offer_describe_tool = !is_hybrid && local_first_user_has_image; + let tools = InsightGenerator::build_tool_definitions(offer_describe_tool); + + let image_base64: Option = if offer_describe_tool { + self.generator.load_image_as_base64(&normalized).ok() + } else { + None + }; + + // Truncate before appending the new user turn. + let budget_tokens = (req.num_ctx.unwrap_or(DEFAULT_NUM_CTX) as usize) + .saturating_sub(RESPONSE_HEADROOM_TOKENS); + let budget_bytes = budget_tokens.saturating_mul(BYTES_PER_TOKEN); + let truncated = apply_context_budget(&mut messages, budget_bytes); + if truncated { + let _ = tx.send(ChatStreamEvent::Truncated).await; + } + + messages.push(ChatMessage::user(req.user_message.clone())); + + let mut tool_calls_made = 0usize; + let mut iterations_used = 0usize; + let mut last_prompt_eval_count: Option = None; + let mut last_eval_count: Option = None; + let mut final_content = String::new(); + + for iteration in 0..max_iterations { + iterations_used = iteration + 1; + let _ = tx + .send(ChatStreamEvent::IterationStart { + n: iterations_used, + max: max_iterations, + }) + .await; + + let mut stream = chat_backend + .chat_with_tools_stream(messages.clone(), tools.clone()) + .await?; + + let mut final_message: Option = None; + while let Some(ev) = stream.next().await { + let ev = ev?; + match ev { + LlmStreamEvent::TextDelta(delta) => { + let _ = tx.send(ChatStreamEvent::TextDelta(delta)).await; + } + LlmStreamEvent::Done { + message, + prompt_eval_count, + eval_count, + } 
=> { + last_prompt_eval_count = prompt_eval_count; + last_eval_count = eval_count; + final_message = Some(message); + break; + } + } + } + let mut response = + final_message.ok_or_else(|| anyhow!("stream ended without a Done event"))?; + + // Normalize non-object tool arguments (same as non-streaming path). + if let Some(ref mut tcs) = response.tool_calls { + for tc in tcs.iter_mut() { + if !tc.function.arguments.is_object() { + tc.function.arguments = serde_json::Value::Object(Default::default()); + } + } + } + + messages.push(response.clone()); + + if let Some(ref tool_calls) = response.tool_calls + && !tool_calls.is_empty() + { + for (i, tool_call) in tool_calls.iter().enumerate() { + tool_calls_made += 1; + let call_index = tool_calls_made - 1; + let _ = tx + .send(ChatStreamEvent::ToolCall { + index: call_index, + name: tool_call.function.name.clone(), + arguments: tool_call.function.arguments.clone(), + }) + .await; + let cx = opentelemetry::Context::new(); + let result = self + .generator + .execute_tool( + &tool_call.function.name, + &tool_call.function.arguments, + &ollama_client, + &image_base64, + &normalized, + &cx, + ) + .await; + let (result_preview, result_truncated) = truncate_tool_result(&result); + let _ = tx + .send(ChatStreamEvent::ToolResult { + index: call_index, + name: tool_call.function.name.clone(), + result: result_preview, + result_truncated, + }) + .await; + messages.push(ChatMessage::tool_result(result)); + let _ = i; // reserved for per-call ordering if needed + } + continue; + } + + final_content = response.content; + break; + } + + if final_content.is_empty() { + messages.push(ChatMessage::user( + "Please write your final answer now without calling any more tools.", + )); + let mut stream = chat_backend + .chat_with_tools_stream(messages.clone(), vec![]) + .await?; + let mut final_message: Option = None; + while let Some(ev) = stream.next().await { + let ev = ev?; + match ev { + LlmStreamEvent::TextDelta(delta) => { + let _ = 
tx.send(ChatStreamEvent::TextDelta(delta)).await; + } + LlmStreamEvent::Done { + message, + prompt_eval_count, + eval_count, + } => { + last_prompt_eval_count = prompt_eval_count; + last_eval_count = eval_count; + final_message = Some(message); + break; + } + } + } + let final_response = + final_message.ok_or_else(|| anyhow!("final stream ended without a Done event"))?; + final_content = final_response.content.clone(); + messages.push(final_response); + } + + // Persist. + let json = serde_json::to_string(&messages) + .map_err(|e| anyhow!("failed to serialize chat history: {}", e))?; + + let mut amended_insight_id: Option = None; + if req.amend { + let title_prompt = format!( + "Create a short title (maximum 8 words) for the following journal entry:\n\n{}\n\n\ + Capture the key moment or theme. Return ONLY the title, nothing else.", + final_content + ); + let title_raw = chat_backend + .generate( + &title_prompt, + Some( + "You are my long term memory assistant. Use only the information provided. 
Do not invent details.", + ), + None, + ) + .await?; + let title = title_raw.trim().trim_matches('"').to_string(); + + let new_row = InsertPhotoInsight { + library_id: req.library_id, + file_path: normalized.clone(), + title, + summary: final_content.clone(), + generated_at: Utc::now().timestamp(), + model_version: model_used.clone(), + is_current: true, + training_messages: Some(json), + backend: effective_backend.clone(), + }; + let cx = opentelemetry::Context::new(); + let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao"); + let stored = dao + .store_insight(&cx, new_row) + .map_err(|e| anyhow!("failed to store amended insight: {:?}", e))?; + amended_insight_id = Some(stored.id); + } else { + let cx = opentelemetry::Context::new(); + let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao"); + dao.update_training_messages(&cx, req.library_id, &normalized, &json) + .map_err(|e| anyhow!("failed to persist chat history: {:?}", e))?; + } + + let _ = tx + .send(ChatStreamEvent::Done { + tool_calls_made, + iterations_used, + truncated, + prompt_eval_count: last_prompt_eval_count, + eval_count: last_eval_count, + amended_insight_id, + backend_used: effective_backend, + model_used, + }) + .await; + + Ok(()) + } +} + +/// Events emitted by `chat_turn_stream`. One stream per turn; ends after +/// `Done` or `Error`. +#[derive(Debug, Clone)] +pub enum ChatStreamEvent { + /// Starting iteration `n` of up to `max` (1-based). + IterationStart { n: usize, max: usize }, + /// History was trimmed to fit the context budget before the turn ran. + /// Emitted at most once, before any tool or text events. + Truncated, + /// Incremental content from the final assistant reply. Concatenate to + /// reconstruct the reply body. Tool-dispatch turns don't produce these. + TextDelta(String), + /// The model requested this tool call. Emitted just before execution. 
+ /// `index` is a monotonically-increasing counter across the turn so the + /// client can pair `ToolCall` with its matching `ToolResult`. + ToolCall { + index: usize, + name: String, + arguments: serde_json::Value, + }, + /// The tool finished; `result` is the (possibly truncated) output. + ToolResult { + index: usize, + name: String, + result: String, + result_truncated: bool, + }, + /// Terminal success event with counters + persistence result. + Done { + tool_calls_made: usize, + iterations_used: usize, + truncated: bool, + prompt_eval_count: Option, + eval_count: Option, + amended_insight_id: Option, + backend_used: String, + model_used: String, + }, + /// Terminal failure event. No further events follow. + Error(String), } /// Is this raw message visible in the rendered transcript? Must match diff --git a/src/ai/llm_client.rs b/src/ai/llm_client.rs index c1f1bca..8d68978 100644 --- a/src/ai/llm_client.rs +++ b/src/ai/llm_client.rs @@ -1,5 +1,6 @@ use anyhow::Result; use async_trait::async_trait; +use futures::stream::BoxStream; use serde::{Deserialize, Serialize}; /// Provider-agnostic surface for LLM backends (Ollama, OpenRouter, …). @@ -30,6 +31,18 @@ pub trait LlmClient: Send + Sync { tools: Vec, ) -> Result<(ChatMessage, Option, Option)>; + /// Streaming variant of `chat_with_tools`. The returned stream yields + /// `TextDelta` items as content is produced, then a single terminal + /// `Done` carrying the complete assembled message (with tool_calls, if + /// any) plus token usage counts. Implementations that can't stream may + /// fall back to calling `chat_with_tools` and emitting the full reply + /// as one `Done` event. + async fn chat_with_tools_stream( + &self, + messages: Vec, + tools: Vec, + ) -> Result>>; + /// Batch embedding generation. Dimensionality is provider/model specific. 
async fn generate_embeddings(&self, texts: &[&str]) -> Result>>; @@ -47,6 +60,25 @@ pub trait LlmClient: Send + Sync { fn primary_model(&self) -> &str; } +/// Events emitted by streaming `chat_with_tools_stream`. A stream is a +/// sequence of zero or more `TextDelta` events followed by exactly one +/// `Done`. Callers should treat `Done` as terminal — further items (if any +/// slip through due to upstream misbehavior) are safe to ignore. +#[derive(Debug, Clone)] +pub enum LlmStreamEvent { + /// Incremental content token(s) from the model. Concatenate in order to + /// reconstruct the assistant's final text. + TextDelta(String), + /// Terminal event with the full assembled message (content + any + /// tool_calls). `message.content` equals the concatenation of every + /// preceding `TextDelta.0`. + Done { + message: ChatMessage, + prompt_eval_count: Option, + eval_count: Option, + }, +} + /// Tool definition sent to the model (OpenAI-compatible function schema). #[derive(Serialize, Clone, Debug)] pub struct Tool { diff --git a/src/ai/mod.rs b/src/ai/mod.rs index 3c58b2a..8e38930 100644 --- a/src/ai/mod.rs +++ b/src/ai/mod.rs @@ -11,10 +11,10 @@ pub mod sms_client; #[allow(unused_imports)] pub use daily_summary_job::{generate_daily_summaries, strip_summary_boilerplate}; pub use handlers::{ - chat_history_handler, chat_rewind_handler, chat_turn_handler, delete_insight_handler, - export_training_data_handler, generate_agentic_insight_handler, generate_insight_handler, - get_all_insights_handler, get_available_models_handler, get_insight_handler, - get_openrouter_models_handler, rate_insight_handler, + chat_history_handler, chat_rewind_handler, chat_stream_handler, chat_turn_handler, + delete_insight_handler, export_training_data_handler, generate_agentic_insight_handler, + generate_insight_handler, get_all_insights_handler, get_available_models_handler, + get_insight_handler, get_openrouter_models_handler, rate_insight_handler, }; pub use 
insight_generator::InsightGenerator; #[allow(unused_imports)] diff --git a/src/ai/ollama.rs b/src/ai/ollama.rs index 8c487c5..1dc67f8 100644 --- a/src/ai/ollama.rs +++ b/src/ai/ollama.rs @@ -7,7 +7,8 @@ use std::collections::HashMap; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; -use crate::ai::llm_client::LlmClient; +use crate::ai::llm_client::{LlmClient, LlmStreamEvent}; +use futures::stream::{BoxStream, StreamExt}; // Re-export shared types so existing `crate::ai::ollama::{...}` imports // continue to resolve. @@ -634,6 +635,174 @@ Analyze the image and use specific details from both the visual content and the } } + /// Streaming variant of `chat_with_tools`. Tries primary, then falls + /// back if the initial connection fails; once the stream has begun + /// emitting, mid-stream errors propagate to the caller. Emits + /// `TextDelta` events as content tokens arrive and a single terminal + /// `Done` event when the model marks the turn complete (tool_calls, if + /// any, live on the final message). + pub async fn chat_with_tools_stream( + &self, + messages: Vec, + tools: Vec, + ) -> Result>> { + // Attempt primary. If it can't be opened at all, try fallback. 
+ match self + .try_chat_with_tools_stream(&self.primary_url, messages.clone(), tools.clone()) + .await + { + Ok(s) => Ok(s), + Err(e) => { + if let Some(fallback_url) = self.fallback_url.clone() { + log::warn!( + "Streaming chat primary failed ({}); trying fallback {}", + e, + fallback_url + ); + self.try_chat_with_tools_stream(&fallback_url, messages, tools) + .await + } else { + Err(e) + } + } + } + } + + async fn try_chat_with_tools_stream( + &self, + base_url: &str, + messages: Vec, + tools: Vec, + ) -> Result>> { + let url = format!("{}/api/chat", base_url); + let model = if base_url == self.primary_url { + &self.primary_model + } else { + self.fallback_model + .as_deref() + .unwrap_or(&self.primary_model) + }; + let options = self.build_options(); + + let request_body = OllamaChatRequest { + model, + messages: &messages, + stream: true, + tools, + options, + }; + + let response = self + .client + .post(&url) + .json(&request_body) + .send() + .await + .with_context(|| format!("Failed to connect to Ollama at {}", url))?; + + if !response.status().is_success() { + let status = response.status(); + let body = response.text().await.unwrap_or_default(); + anyhow::bail!( + "Ollama stream request failed with status {}: {}", + status, + body + ); + } + + // Ollama streams NDJSON: each line is a full `OllamaStreamChunk`. + // We buffer partial lines across chunks from the byte stream. + let byte_stream = response.bytes_stream(); + let stream = async_stream::stream! 
{ + let mut buf: Vec = Vec::new(); + let mut accumulated = String::new(); + let mut tool_calls: Option> = None; + let mut role = "assistant".to_string(); + let mut prompt_eval_count: Option = None; + let mut eval_count: Option = None; + let mut prompt_eval_duration: Option = None; + let mut eval_duration: Option = None; + let mut done_seen = false; + + let mut byte_stream = byte_stream; + while let Some(chunk) = byte_stream.next().await { + let chunk = match chunk { + Ok(b) => b, + Err(e) => { + yield Err(anyhow::anyhow!("stream read failed: {}", e)); + return; + } + }; + buf.extend_from_slice(&chunk); + + // Drain complete lines; hold any trailing partial. + while let Some(nl) = buf.iter().position(|b| *b == b'\n') { + let line = buf.drain(..=nl).collect::>(); + let line_str = match std::str::from_utf8(&line) { + Ok(s) => s.trim(), + Err(_) => continue, + }; + if line_str.is_empty() { + continue; + } + match serde_json::from_str::(line_str) { + Ok(chunk) => { + // Accumulate content delta. + if !chunk.message.content.is_empty() { + accumulated.push_str(&chunk.message.content); + yield Ok(LlmStreamEvent::TextDelta(chunk.message.content)); + } + if !chunk.message.role.is_empty() { + role = chunk.message.role; + } + // Ollama only attaches tool_calls on the final chunk. + if let Some(tcs) = chunk.message.tool_calls + && !tcs.is_empty() + { + tool_calls = Some(tcs); + } + if chunk.done { + prompt_eval_count = chunk.prompt_eval_count; + eval_count = chunk.eval_count; + prompt_eval_duration = chunk.prompt_eval_duration; + eval_duration = chunk.eval_duration; + done_seen = true; + break; + } + } + Err(e) => { + log::warn!("malformed Ollama stream line: {} ({})", line_str, e); + } + } + } + if done_seen { + break; + } + } + + // Emit the terminal Done event with the assembled message. 
+ log_chat_metrics( + prompt_eval_count, + prompt_eval_duration, + eval_count, + eval_duration, + ); + let message = ChatMessage { + role, + content: accumulated, + tool_calls, + images: None, + }; + yield Ok(LlmStreamEvent::Done { + message, + prompt_eval_count, + eval_count, + }); + }; + + Ok(Box::pin(stream)) + } + async fn try_chat_with_tools( &self, base_url: &str, @@ -857,6 +1026,14 @@ impl LlmClient for OllamaClient { OllamaClient::chat_with_tools(self, messages, tools).await } + async fn chat_with_tools_stream( + &self, + messages: Vec, + tools: Vec, + ) -> Result>> { + OllamaClient::chat_with_tools_stream(self, messages, tools).await + } + async fn generate_embeddings(&self, texts: &[&str]) -> Result>> { OllamaClient::generate_embeddings(self, texts).await } @@ -936,6 +1113,35 @@ struct OllamaChatResponse { eval_duration: Option, } +/// One chunk in the NDJSON stream from `/api/chat` with `stream: true`. +/// Early chunks carry content deltas in `message.content`; the final chunk +/// has `done: true`, optional `tool_calls`, and usage counters. 
+#[derive(Deserialize, Debug)] +struct OllamaStreamChunk { + #[serde(default)] + message: OllamaStreamMessage, + #[serde(default)] + done: bool, + #[serde(default)] + prompt_eval_count: Option, + #[serde(default)] + prompt_eval_duration: Option, + #[serde(default)] + eval_count: Option, + #[serde(default)] + eval_duration: Option, +} + +#[derive(Deserialize, Debug, Default)] +struct OllamaStreamMessage { + #[serde(default)] + role: String, + #[serde(default)] + content: String, + #[serde(default)] + tool_calls: Option>, +} + #[derive(Deserialize)] struct OllamaResponse { response: String, diff --git a/src/ai/openrouter.rs b/src/ai/openrouter.rs index 2c46852..1559479 100644 --- a/src/ai/openrouter.rs +++ b/src/ai/openrouter.rs @@ -12,8 +12,9 @@ use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; use crate::ai::llm_client::{ - ChatMessage, LlmClient, ModelCapabilities, Tool, ToolCall, ToolCallFunction, + ChatMessage, LlmClient, LlmStreamEvent, ModelCapabilities, Tool, ToolCall, ToolCallFunction, }; +use futures::stream::{BoxStream, StreamExt}; const DEFAULT_BASE_URL: &str = "https://openrouter.ai/api/v1"; const DEFAULT_EMBEDDING_MODEL: &str = "openai/text-embedding-3-small"; @@ -378,6 +379,220 @@ impl LlmClient for OpenRouterClient { Ok((chat_msg, prompt_tokens, completion_tokens)) } + async fn chat_with_tools_stream( + &self, + messages: Vec, + tools: Vec, + ) -> Result>> { + let url = format!("{}/chat/completions", self.base_url); + let mut body = serde_json::Map::new(); + body.insert("model".into(), Value::String(self.primary_model.clone())); + body.insert( + "messages".into(), + Value::Array(Self::messages_to_openai(&messages)), + ); + body.insert("stream".into(), Value::Bool(true)); + // Ask for usage data in the final chunk (OpenAI + OpenRouter + // both honor this options bag). 
+ body.insert( + "stream_options".into(), + serde_json::json!({ "include_usage": true }), + ); + if !tools.is_empty() { + body.insert( + "tools".into(), + serde_json::to_value(&tools).context("serializing tools")?, + ); + } + for (k, v) in self.build_options() { + body.insert(k.into(), v); + } + + let resp = self + .authed(self.client.post(&url)) + .json(&Value::Object(body)) + .send() + .await + .with_context(|| format!("POST {} failed", url))?; + + if !resp.status().is_success() { + let status = resp.status(); + let body = resp.text().await.unwrap_or_default(); + bail!("OpenRouter stream request failed: {} — {}", status, body); + } + + // OpenAI-compat SSE stream. Each event is `data: \n\n`, with + // `data: [DONE]` signalling completion. Tool calls arrive as + // `delta.tool_calls[i]` chunks that must be concatenated by index. + let byte_stream = resp.bytes_stream(); + let stream = async_stream::stream! { + let mut byte_stream = byte_stream; + let mut buf: Vec = Vec::new(); + let mut accumulated_content = String::new(); + // tool call state: index -> (id, name, args_string) + let mut tool_state: std::collections::BTreeMap< + usize, + (Option, Option, String), + > = std::collections::BTreeMap::new(); + let mut role = "assistant".to_string(); + let mut prompt_tokens: Option = None; + let mut completion_tokens: Option = None; + let mut done_seen = false; + + while let Some(chunk) = byte_stream.next().await { + let chunk = match chunk { + Ok(b) => b, + Err(e) => { + yield Err(anyhow!("stream read failed: {}", e)); + return; + } + }; + buf.extend_from_slice(&chunk); + + // SSE frames are delimited by a blank line. Walk the buffer + // for "\n\n" markers; anything before them is a complete + // frame (possibly multi-line). 
+ loop { + let Some(sep) = find_double_newline(&buf) else { break }; + let frame = buf.drain(..sep + 2).collect::>(); + let frame_str = match std::str::from_utf8(&frame) { + Ok(s) => s, + Err(_) => continue, + }; + // A frame is one or more lines; the payload is on data: + // lines. Ignore comments and other fields. + for line in frame_str.lines() { + let line = line.trim_end_matches('\r'); + let payload = match line.strip_prefix("data: ") { + Some(p) => p, + None => continue, + }; + if payload == "[DONE]" { + done_seen = true; + break; + } + let v: Value = match serde_json::from_str(payload) { + Ok(v) => v, + Err(e) => { + log::warn!( + "malformed OpenRouter SSE frame: {} ({})", + payload, + e + ); + continue; + } + }; + + // Usage can arrive in a dedicated final frame with + // empty choices. + if let Some(usage) = v.get("usage") { + prompt_tokens = usage + .get("prompt_tokens") + .and_then(|n| n.as_i64()) + .map(|n| n as i32); + completion_tokens = usage + .get("completion_tokens") + .and_then(|n| n.as_i64()) + .map(|n| n as i32); + } + + let Some(choices) = v.get("choices").and_then(|c| c.as_array()) + else { + continue; + }; + let Some(choice) = choices.first() else { continue }; + let delta = match choice.get("delta") { + Some(d) => d, + None => continue, + }; + if let Some(r) = delta.get("role").and_then(|v| v.as_str()) { + role = r.to_string(); + } + if let Some(content) = + delta.get("content").and_then(|v| v.as_str()) + && !content.is_empty() + { + accumulated_content.push_str(content); + yield Ok(LlmStreamEvent::TextDelta(content.to_string())); + } + if let Some(tcs) = delta.get("tool_calls").and_then(|v| v.as_array()) { + for tc_delta in tcs { + let idx = tc_delta + .get("index") + .and_then(|n| n.as_u64()) + .unwrap_or(0) as usize; + let entry = tool_state + .entry(idx) + .or_insert((None, None, String::new())); + if let Some(id) = + tc_delta.get("id").and_then(|v| v.as_str()) + { + entry.0 = Some(id.to_string()); + } + if let Some(func) = 
tc_delta.get("function") { + if let Some(name) = + func.get("name").and_then(|v| v.as_str()) + { + entry.1 = Some(name.to_string()); + } + if let Some(args) = + func.get("arguments").and_then(|v| v.as_str()) + { + entry.2.push_str(args); + } + } + } + } + } + if done_seen { + break; + } + } + if done_seen { + break; + } + } + + // Finalize tool calls: parse accumulated argument strings. + let tool_calls: Option> = if tool_state.is_empty() { + None + } else { + let mut v = Vec::with_capacity(tool_state.len()); + for (_idx, (id, name, args)) in tool_state { + let arguments: Value = if args.trim().is_empty() { + Value::Object(Default::default()) + } else { + serde_json::from_str(&args).unwrap_or_else(|_| { + Value::Object(Default::default()) + }) + }; + v.push(ToolCall { + id, + function: ToolCallFunction { + name: name.unwrap_or_default(), + arguments, + }, + }); + } + Some(v) + }; + + let message = ChatMessage { + role, + content: accumulated_content, + tool_calls, + images: None, + }; + yield Ok(LlmStreamEvent::Done { + message, + prompt_eval_count: prompt_tokens, + eval_count: completion_tokens, + }); + }; + + Ok(Box::pin(stream)) + } + async fn generate_embeddings(&self, texts: &[&str]) -> Result>> { let url = format!("{}/embeddings", self.base_url); let body = json!({ @@ -473,6 +688,28 @@ impl LlmClient for OpenRouterClient { } } +/// Find the byte offset of the first `\n\n` (end of an SSE frame) in `buf`. +/// Returns the index of the first `\n` of the pair, so the full separator is +/// `buf[idx..=idx+1]`. Also handles `\r\n\r\n` since some servers emit it. +fn find_double_newline(buf: &[u8]) -> Option { + for i in 0..buf.len().saturating_sub(1) { + if buf[i] == b'\n' && buf[i + 1] == b'\n' { + return Some(i); + } + // \r\n\r\n: the second \n of this pattern is at i+2; flag at i so the + // drain call (which consumes ..sep+2) takes exactly the frame. 
+ if i + 3 < buf.len() + && buf[i] == b'\r' + && buf[i + 1] == b'\n' + && buf[i + 2] == b'\r' + && buf[i + 3] == b'\n' + { + return Some(i + 1); + } + } + None +} + /// Build a `data:` URL if the provided string is raw base64, otherwise pass it through. fn image_to_data_url(img: &str) -> String { if img.starts_with("data:") { diff --git a/src/main.rs b/src/main.rs index 8be397f..da0c4dd 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1357,6 +1357,7 @@ fn main() -> std::io::Result<()> { .service(ai::get_available_models_handler) .service(ai::get_openrouter_models_handler) .service(ai::chat_turn_handler) + .service(ai::chat_stream_handler) .service(ai::chat_history_handler) .service(ai::chat_rewind_handler) .service(ai::rate_insight_handler) -- 2.49.1 From e51cd564a3da4cf66488c4d8409d70a12ec711d5 Mon Sep 17 00:00:00 2001 From: Cameron Date: Tue, 21 Apr 2026 17:32:43 -0400 Subject: [PATCH 09/24] docs: chat continuation endpoints + env vars Document the four new chat endpoints, SSE event shape, backend routing rules, rewind semantics, amend mode, and the AGENTIC_CHAT_MAX_ITERATIONS cap. Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 67 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 18 +++++++++++++++ 2 files changed, 85 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index 0849e41..004569e 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -169,6 +169,20 @@ POST /image/tags/batch (bulk tag updates) // Memories (week-based grouping) GET /memories?path=...&recursive=true + +// AI Insights +POST /insights/generate (non-agentic single-shot) +POST /insights/generate/agentic (tool-calling loop; body: { file_path, backend?, model?, ... }) +GET /insights?path=...&library=... 
+GET /insights/models (local Ollama models + capabilities) +GET /insights/openrouter/models (curated OpenRouter allowlist) +POST /insights/rate (thumbs up/down for training data) + +// Insight Chat Continuation +POST /insights/chat (single-turn reply, non-streaming) +POST /insights/chat/stream (SSE: text / tool_call / tool_result / truncated / done) +GET /insights/chat/history?path=... (rendered transcript with tool invocations) +POST /insights/chat/rewind (truncate transcript at a rendered index) ``` **Request Types:** @@ -269,6 +283,9 @@ OPENROUTER_BASE_URL=https://openrouter.ai/api/v1 # Override base URL (option OPENROUTER_EMBEDDING_MODEL=openai/text-embedding-3-small # Optional, embeddings stay local today OPENROUTER_HTTP_REFERER=https://your-site.example # Optional attribution header OPENROUTER_APP_TITLE=ImageApi # Optional attribution header + +# Insight Chat Continuation +AGENTIC_CHAT_MAX_ITERATIONS=6 # Cap on tool-calling iterations per chat turn (default 6) ``` **AI Insights Fallback Behavior:** @@ -297,6 +314,56 @@ This allows runtime verification of model availability before generating insight - `GET /insights/openrouter/models` returns `{ models, default_model, configured }` for client picker UIs. +**Insight Chat Continuation:** + +After an agentic insight is generated, the full `Vec` transcript is +stored in `photo_insights.training_messages` and can be continued via the +chat endpoints. The `PhotoInsightResponse.has_training_messages` flag tells +clients whether chat is available for a given insight. + +- `POST /insights/chat` runs one turn of the agentic loop against the replayed + history. Body: `{ file_path, library?, user_message, model?, backend?, num_ctx?, + temperature?, top_p?, top_k?, min_p?, max_iterations?, amend? }`. 
+- `POST /insights/chat/stream` is the SSE variant — same request body, response + is `text/event-stream` with events: `iteration_start`, `text` (delta), `tool_call`, + `tool_result`, `truncated`, `done`, plus a server-emitted `error_message` on + failure. Preferred by the mobile client for live tool-chip updates. +- `GET /insights/chat/history?path=...&library=...` returns the rendered + transcript. Each assistant message carries a `tools: [{name, arguments, result, + result_truncated?}]` array with the tool invocations that led up to it. Tool + results over 2000 chars are truncated with `result_truncated: true`. +- `POST /insights/chat/rewind` truncates the transcript at a given rendered + index (drops that message + any tool-call scaffolding that preceded it + all + later turns). Index 0 is protected. Used for "try again from here" flows. + +Backend routing rules (matches agentic-insight generation): +- Stored `backend` on the insight row is authoritative by default. +- `request.backend` may override per-turn. `local -> hybrid` is rejected in + v1 (would require on-the-fly visual-description rewrite); `hybrid -> local` + replays verbatim since the description is already inlined as text. +- `request.model` overrides the chat model (an Ollama id in local mode, an + OpenRouter id in hybrid mode). + +Persistence: +- Append mode (default): re-serialize the full history and `UPDATE` the same + row's `training_messages`. +- Amend mode (`amend: true`): regenerate the title, insert a new insight row + via `store_insight` (auto-flips prior rows' `is_current=false`). Response + surfaces the new row's id as `amended_insight_id`. + +Per-`(library_id, file_path)` async mutex (`AppState.insight_chat.chat_locks`) +serialises concurrent turns on the same insight so the JSON blob doesn't race. 
+ +Context management is a soft bound: if the serialized history exceeds +`num_ctx - 2048` tokens (cheap 4-byte/token heuristic), the oldest +assistant-tool_call + tool_result pairs are dropped until under budget. The +initial user message (with any images) and system prompt are always preserved. +The `truncated` event / flag is surfaced to the client when a drop occurred. + +Configurable env: +- `AGENTIC_CHAT_MAX_ITERATIONS` — cap on tool-calling iterations per turn + (default 6). Per-request `max_iterations` is clamped to this cap. + ## Dependencies of Note - **actix-web**: HTTP framework diff --git a/README.md b/README.md index 1bd3c9a..b625b04 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,24 @@ as a 4xx from the chat call. Pick tool-capable models. - Controls how many times the model can invoke tools before being forced to produce a final answer - Increase for more thorough context gathering; decrease to limit response time +#### Insight Chat Continuation +After an agentic insight is generated, the conversation can be continued. Endpoints: +- `POST /insights/chat` — single-turn reply (non-streaming) +- `POST /insights/chat/stream` — SSE variant with live `text` deltas and + `tool_call` / `tool_result` events. Mobile client uses this. +- `GET /insights/chat/history?path=...&library=...` — rendered transcript; + each assistant message carries a `tools: [{name, arguments, result}]` array +- `POST /insights/chat/rewind` — truncate transcript at a rendered index + (drops that message + any preceding tool scaffolding + later turns). Used + for "try again from here" flows. The initial user message is protected. + +Amend mode (`amend: true` in the chat request body) regenerates the insight's +title and inserts a new row instead of appending to the existing transcript, +so you can rewrite the saved summary from within chat. 
+ +- `AGENTIC_CHAT_MAX_ITERATIONS` - Cap on tool-calling iterations per chat turn [default: `6`] + - Per-request `max_iterations` (when sent by the client) is clamped to this cap + #### Fallback Behavior - Primary server is tried first with 5-second connection timeout - On failure, automatically falls back to secondary server (if configured) -- 2.49.1 From e4a3536f877902f7afdbb77fae9560a52f752e09 Mon Sep 17 00:00:00 2001 From: Cameron Date: Wed, 22 Apr 2026 10:56:03 -0400 Subject: [PATCH 10/24] feat(ai): search_messages tool + RAG reranker MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a search_messages tool that hits the Django FTS5/semantic/hybrid endpoint for keyword-quality text search over message bodies, and an LLM-based reranker inside tool_search_rag (gated by SEARCH_RAG_RERANK, default on). Reranker pulls ~3x candidates from the vector index, asks the chat model to rank by relevance, and falls back to vector order on parse failure. The reranker shares the active chat turn's OllamaClient so num_ctx and sampling match — otherwise Ollama unloads/reloads the model on every rerank call. (Unverified end-to-end; caught by inspection, awaiting e2e confirmation.) 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ai/insight_generator.rs | 228 +++++++++++++++++++++++++++++++++++- src/ai/sms_client.rs | 64 ++++++++++ 2 files changed, 286 insertions(+), 6 deletions(-) diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index 09fd585..a1edd0c 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -1421,7 +1421,8 @@ Return ONLY the summary, nothing else."#, cx: &opentelemetry::Context, ) -> String { let result = match tool_name { - "search_rag" => self.tool_search_rag(arguments, cx).await, + "search_rag" => self.tool_search_rag(arguments, ollama, cx).await, + "search_messages" => self.tool_search_messages(arguments).await, "get_sms_messages" => self.tool_get_sms_messages(arguments, cx).await, "get_calendar_events" => self.tool_get_calendar_events(arguments, cx).await, "get_location_history" => self.tool_get_location_history(arguments, cx).await, @@ -1447,6 +1448,7 @@ Return ONLY the summary, nothing else."#, async fn tool_search_rag( &self, args: &serde_json::Value, + ollama: &OllamaClient, _cx: &opentelemetry::Context, ) -> String { let query = match args.get("query").and_then(|v| v.as_str()) { @@ -1479,13 +1481,204 @@ Return ONLY the summary, nothing else."#, limit ); - match self - .find_relevant_messages_rag(date, None, contact.as_deref(), None, limit, Some(&query)) + // Pull a wider candidate pool than the final limit so the LLM + // reranker has room to promote less-obvious hits. Candidates_factor + // is capped so a big `limit` doesn't blow past what the reranker + // can sensibly judge in one prompt. 
+        let rerank_enabled = std::env::var("SEARCH_RAG_RERANK")
+            .ok()
+            .map(|v| v.to_lowercase() != "off" && v != "0")
+            .unwrap_or(true);
+        let candidate_limit = if rerank_enabled {
+            (limit * 3).min(40)
+        } else {
+            limit
+        };
+
+        let results = match self
+            .find_relevant_messages_rag(
+                date,
+                None,
+                contact.as_deref(),
+                None,
+                candidate_limit,
+                Some(&query),
+            )
             .await
         {
-            Ok(results) if !results.is_empty() => results.join("\n\n"),
-            Ok(_) => "No relevant messages found.".to_string(),
-            Err(e) => format!("Error searching RAG: {}", e),
+            Ok(results) if !results.is_empty() => results,
+            Ok(_) => return "No relevant messages found.".to_string(),
+            Err(e) => return format!("Error searching RAG: {}", e),
+        };
+
+        let final_results = if rerank_enabled && results.len() > limit {
+            match self.rerank_with_llm(&query, &results, limit, ollama).await {
+                Ok(reordered) => reordered,
+                Err(e) => {
+                    log::warn!("rerank failed, using vector order: {}", e);
+                    results.into_iter().take(limit).collect()
+                }
+            }
+        } else {
+            results.into_iter().take(limit).collect::<Vec<_>>()
+        };
+
+        final_results.join("\n\n")
+    }
+
+    /// LLM-based reranker: ask the local model to pick the top-`limit`
+    /// passages from `candidates` that are most relevant to `query`.
+    /// Returns the reordered subset.
+    ///
+    /// Cheap-ish because the reranker prompt and output live outside the
+    /// agent's visible context — only the final selection lands in the
+    /// tool_result. On parse failure we fall back to the input order.
+    async fn rerank_with_llm(
+        &self,
+        query: &str,
+        candidates: &[String],
+        limit: usize,
+        ollama: &OllamaClient,
+    ) -> Result<Vec<String>> {
+        // Build numbered list (1-based for readability). Cap each passage
+        // at ~1000 chars so very long summaries don't eat the prompt. 
+        let numbered: String = candidates
+            .iter()
+            .enumerate()
+            .map(|(i, c)| {
+                let trimmed = if c.len() > 1000 {
+                    format!("{}…", &c[..1000])
+                } else {
+                    c.clone()
+                };
+                format!("[{}] {}", i + 1, trimmed)
+            })
+            .collect::<Vec<_>>()
+            .join("\n\n");
+
+        let prompt = format!(
+            "You are ranking search results. From the numbered passages below, \
+             select the {} most relevant to the query. Respond with ONLY a \
+             comma-separated list of passage numbers in order from most to \
+             least relevant. No explanation, no other text.\n\n\
+             Query: {}\n\n\
+             Passages:\n{}\n\n\
+             Top {} passage numbers:",
+            limit, query, numbered, limit
+        );
+
+        let response = ollama
+            .generate(
+                &prompt,
+                Some(
+                    "You are a terse relevance ranker. You output only numbers separated by commas.",
+                ),
+            )
+            .await?;
+
+        // Extract indices from the response. Accept "3, 1, 7" and also
+        // tolerate "[3, 1, 7]" or "3,1,7,..." with trailing junk.
+        let picks: Vec<usize> = response
+            .split(|c: char| !c.is_ascii_digit())
+            .filter_map(|s| s.parse::<usize>().ok())
+            .filter(|&n| n >= 1 && n <= candidates.len())
+            .collect();
+
+        if picks.is_empty() {
+            return Err(anyhow::anyhow!(
+                "reranker returned no usable indices (raw: {})",
+                response.chars().take(120).collect::<String>()
+            ));
+        }
+
+        let mut seen = std::collections::HashSet::new();
+        let mut reordered: Vec<String> = Vec::with_capacity(limit);
+        for n in picks {
+            if seen.insert(n) {
+                reordered.push(candidates[n - 1].clone());
+                if reordered.len() >= limit {
+                    break;
+                }
+            }
+        }
+        // Top-up from original order if the reranker returned fewer than
+        // `limit` distinct entries.
+        if reordered.len() < limit {
+            for (i, c) in candidates.iter().enumerate() {
+                if !seen.contains(&(i + 1)) {
+                    reordered.push(c.clone());
+                    if reordered.len() >= limit {
+                        break;
+                    }
+                }
+            }
+        }
+        Ok(reordered)
+    }
+
+    /// Tool: search_messages — keyword / semantic / hybrid search over all
+    /// SMS message bodies via the Django FTS5 + embeddings pipeline. 
Unlike + /// `search_rag` (daily summaries, date-weighted) this hits raw message + /// text across time and is the right choice for exact phrases, proper + /// nouns, URLs, or anything where specific wording matters. + async fn tool_search_messages(&self, args: &serde_json::Value) -> String { + let query = match args.get("query").and_then(|v| v.as_str()) { + Some(q) if !q.trim().is_empty() => q.trim(), + _ => return "Error: missing required parameter 'query'".to_string(), + }; + if query.len() < 3 { + return "Error: query must be at least 3 characters".to_string(); + } + let mode = args + .get("mode") + .and_then(|v| v.as_str()) + .map(|s| s.to_lowercase()) + .unwrap_or_else(|| "hybrid".to_string()); + if !matches!(mode.as_str(), "fts5" | "semantic" | "hybrid") { + return format!( + "Error: unknown mode '{}'; expected one of: fts5, semantic, hybrid", + mode + ); + } + let limit = args + .get("limit") + .and_then(|v| v.as_i64()) + .unwrap_or(20) + .clamp(1, 50) as usize; + + log::info!( + "tool_search_messages: query='{}', mode={}, limit={}", + query, + mode, + limit + ); + + match self.sms_client.search_messages(query, &mode, limit).await { + Ok(hits) if hits.is_empty() => "No messages matched.".to_string(), + Ok(hits) => { + let mut out = String::new(); + out.push_str(&format!( + "Found {} messages (mode: {}):\n\n", + hits.len(), + mode + )); + for h in hits { + let date = chrono::DateTime::from_timestamp(h.date, 0) + .map(|dt| dt.format("%Y-%m-%d").to_string()) + .unwrap_or_else(|| h.date.to_string()); + let direction = if h.type_ == 2 { "Me" } else { &h.contact_name }; + let score = h + .similarity_score + .map(|s| format!(" [score {:.2}]", s)) + .unwrap_or_default(); + out.push_str(&format!( + "[{}]{} {} — {}\n\n", + date, score, direction, h.body + )); + } + out + } + Err(e) => format!("Error searching messages: {}", e), } } @@ -2164,6 +2357,29 @@ Return ONLY the summary, nothing else."#, } }), ), + Tool::function( + "search_messages", + 
"Keyword/semantic/hybrid search over ALL SMS message bodies (not just summaries) across all time. Prefer this for specific phrases, proper nouns, URLs, or when you don't know the date. Modes: 'fts5' (keyword, supports \"phrase\" / prefix* / AND / NEAR(w1 w2, 5)), 'semantic' (embedding similarity), 'hybrid' (recommended — merges both via reciprocal rank fusion).", + serde_json::json!({ + "type": "object", + "required": ["query"], + "properties": { + "query": { + "type": "string", + "description": "Search query. Min 3 chars. For fts5 mode, supports phrase (\"\"), prefix (*), AND/OR/NOT, and NEAR proximity." + }, + "mode": { + "type": "string", + "enum": ["fts5", "semantic", "hybrid"], + "description": "Search strategy. Default: hybrid." + }, + "limit": { + "type": "integer", + "description": "Maximum number of results (default: 20, max: 50)" + } + } + }), + ), Tool::function( "get_sms_messages", "Fetch SMS/text messages near a specific date. Returns the actual message conversation. Omit contact to search across all conversations.", diff --git a/src/ai/sms_client.rs b/src/ai/sms_client.rs index 1b6b605..57d28a1 100644 --- a/src/ai/sms_client.rs +++ b/src/ai/sms_client.rs @@ -250,6 +250,45 @@ impl SmsApiClient { .collect()) } + /// Search message bodies via the Django side's FTS5 / semantic / hybrid + /// endpoint. 
`mode` selects the ranking strategy:
+    /// - "fts5" keyword-only, supports phrase / prefix / boolean / NEAR
+    /// - "semantic" embedding similarity
+    /// - "hybrid" both merged via reciprocal rank fusion (recommended)
+    pub async fn search_messages(
+        &self,
+        query: &str,
+        mode: &str,
+        limit: usize,
+    ) -> Result<Vec<SmsSearchHit>> {
+        let url = format!(
+            "{}/api/messages/search/?q={}&mode={}&limit={}",
+            self.base_url,
+            urlencoding::encode(query),
+            urlencoding::encode(mode),
+            limit
+        );
+
+        let mut request = self.client.get(&url);
+        if let Some(token) = &self.token {
+            request = request.header("Authorization", format!("Bearer {}", token));
+        }
+
+        let response = request.send().await?;
+        if !response.status().is_success() {
+            let status = response.status();
+            let body = response.text().await.unwrap_or_default();
+            return Err(anyhow::anyhow!(
+                "SMS search request failed: {} - {}",
+                status,
+                body
+            ));
+        }
+
+        let data: SmsSearchResponse = response.json().await?;
+        Ok(data.results)
+    }
+
     pub async fn summarize_context(
         &self,
         messages: &[SmsMessage],
@@ -314,3 +353,28 @@ struct SmsApiMessage {
     #[serde(rename = "type")]
     type_: i32,
 }
+
+#[derive(Debug, Clone, Deserialize)]
+pub struct SmsSearchHit {
+    #[allow(dead_code)]
+    pub message_id: i64,
+    pub contact_name: String,
+    #[allow(dead_code)]
+    pub contact_address: String,
+    pub body: String,
+    pub date: i64,
+    /// Message direction code: 1 = received, 2 = sent.
+    #[serde(rename = "type")]
+    pub type_: i32,
+    /// Present for semantic / hybrid modes; absent for fts5. 
+    #[serde(default)]
+    pub similarity_score: Option<f64>,
+}
+
+#[derive(Deserialize)]
+struct SmsSearchResponse {
+    results: Vec<SmsSearchHit>,
+    #[allow(dead_code)]
+    #[serde(default)]
+    search_method: String,
+}
-- 
2.49.1

From 6831f509935de54d59b7b0f43dbbf4d479bc8b62 Mon Sep 17 00:00:00 2001
From: Cameron
Date: Wed, 22 Apr 2026 23:39:37 -0400
Subject: [PATCH 11/24] feat(ai): USER_NAME env + shared summary prompt + test-bin knobs
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Introduces USER_NAME (default "Me") as the single source for the message sender label and the first-person persona across daily summaries, SMS context, insight generation, and chat. Eliminates the "Me:" transcript / "what I did" ambiguity that confused smaller models, and unhardcodes "Cameron" from prompt text + the knowledge-graph owner entity.

Set USER_NAME=Cameron in .env to preserve the existing owner entity row (keyed on UNIQUE(name, entity_type)) — otherwise the next run creates a fresh owner entity and orphans the existing facts/photo-links. 
Also: - search_messages redirect: when the model calls it with date/contact but no query, return a hint pointing at get_sms_messages instead of a bare missing-parameter error (prevents same-turn retry loops) - sharpen search_messages vs get_sms_messages tool descriptions so content-vs-time-based intent is unambiguous - extract build_daily_summary_prompt (+ DAILY_SUMMARY_MESSAGE_LIMIT, DAILY_SUMMARY_SYSTEM_PROMPT) shared by daily_summary_job and test_daily_summary binary — prompt tweaks now land in both - EMBEDDING_MODEL const; fixes both insert sites that stored "mxbai-embed-large:335m" while generate_embeddings actually runs "nomic-embed-text:v1.5" - test_daily_summary: add --num-ctx / --temperature / --top-p / --top-k / --min-p flags wired into OllamaClient setters, and print the configured knobs at the top of each run - OllamaClient::generate now logs prompt/gen token counts and tok/s via log_chat_metrics (symmetric with chat_with_tools) Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ai/daily_summary_job.rs | 136 +++++++++++++++++++--------------- src/ai/insight_generator.rs | 79 ++++++++++++++------ src/ai/mod.rs | 16 +++- src/ai/ollama.rs | 42 ++++++++--- src/ai/sms_client.rs | 3 +- src/bin/test_daily_summary.rs | 106 ++++++++++++-------------- 6 files changed, 226 insertions(+), 156 deletions(-) diff --git a/src/ai/daily_summary_job.rs b/src/ai/daily_summary_job.rs index 9d9c9e0..18ad15c 100644 --- a/src/ai/daily_summary_job.rs +++ b/src/ai/daily_summary_job.rs @@ -6,12 +6,84 @@ use std::collections::HashMap; use std::sync::{Arc, Mutex}; use tokio::time::sleep; -use crate::ai::{OllamaClient, SmsApiClient, SmsMessage}; +use crate::ai::{EMBEDDING_MODEL, OllamaClient, SmsApiClient, SmsMessage, user_display_name}; use crate::database::{DailySummaryDao, InsertDailySummary}; use crate::otel::global_tracer; /// Strip boilerplate prefixes and common phrases from summaries before embedding. /// This improves embedding diversity by removing structural similarity. 
+/// Maximum number of messages passed to the summarizer for a single day.
+/// Tuned to avoid token overflow on typical chat models; shared between
+/// the production job and the test binary so they can't drift.
+pub const DAILY_SUMMARY_MESSAGE_LIMIT: usize = 300;
+
+/// System prompt used when generating daily conversation summaries.
+pub const DAILY_SUMMARY_SYSTEM_PROMPT: &str =
+    "You are a conversation summarizer. Create clear, factual summaries with \
+     precise subject attribution AND extract distinctive keywords. Focus on \
+     specific, unique terms that differentiate this conversation from others.";
+
+/// Build the prompt for a single day's conversation summary. Shared by the
+/// production job and the test binary so prompt tweaks land in both places.
+/// Returns `(prompt, system_prompt)`.
+pub fn build_daily_summary_prompt(
+    contact: &str,
+    date: &NaiveDate,
+    messages: &[SmsMessage],
+) -> (String, &'static str) {
+    let user_name = user_display_name();
+    let messages_text: String = messages
+        .iter()
+        .take(DAILY_SUMMARY_MESSAGE_LIMIT)
+        .map(|m| {
+            if m.is_sent {
+                format!("{}: {}", user_name, m.body)
+            } else {
+                format!("{}: {}", m.contact, m.body)
+            }
+        })
+        .collect::<Vec<_>>()
+        .join("\n");
+
+    let prompt = format!(
+        r#"Summarize this day's conversation between {user_name} and {contact}.
+
+CRITICAL FORMAT RULES:
+- Do NOT start with "Based on the conversation..." or "Here is a summary..." or similar preambles
+- Do NOT repeat the date at the beginning
+- Start DIRECTLY with the content - begin with a person's name or action
+- Write in past tense, as if recording what happened
+
+NARRATIVE (4-8 sentences):
+- What specific topics, activities, or events were discussed?
+- What places, people, or organizations were mentioned?
+- What plans were made or decisions discussed? 
+- Clearly distinguish between what {user_name} did versus what {contact} did + +KEYWORDS (comma-separated): +5-10 specific keywords that capture this conversation's unique content: +- Proper nouns (people, places, brands) +- Specific activities ("drum corps audition" not just "music") +- Distinctive terms that make this day unique + +Date: {month_day_year} ({weekday}) +Messages: +{messages_text} + +YOUR RESPONSE (follow this format EXACTLY): +Summary: [Start directly with content, NO preamble] + +Keywords: [specific, unique terms]"#, + user_name = user_name, + contact = contact, + month_day_year = date.format("%B %d, %Y"), + weekday = date.format("%A"), + messages_text = messages_text, + ); + + (prompt, DAILY_SUMMARY_SYSTEM_PROMPT) +} + pub fn strip_summary_boilerplate(summary: &str) -> String { let mut text = summary.trim().to_string(); @@ -290,65 +362,10 @@ async fn generate_and_store_daily_summary( span.set_attribute(KeyValue::new("contact", contact.to_string())); span.set_attribute(KeyValue::new("message_count", messages.len() as i64)); - // Format messages for LLM - let messages_text: String = messages - .iter() - .take(200) // Limit to 200 messages per day to avoid token overflow - .map(|m| { - if m.is_sent { - format!("Me: {}", m.body) - } else { - format!("{}: {}", m.contact, m.body) - } - }) - .collect::>() - .join("\n"); - - let weekday = date.format("%A"); - - let prompt = format!( - r#"Summarize this day's conversation between me and {}. - -CRITICAL FORMAT RULES: -- Do NOT start with "Based on the conversation..." or "Here is a summary..." or similar preambles -- Do NOT repeat the date at the beginning -- Start DIRECTLY with the content - begin with a person's name or action -- Write in past tense, as if recording what happened - -NARRATIVE (3-5 sentences): -- What specific topics, activities, or events were discussed? -- What places, people, or organizations were mentioned? -- What plans were made or decisions discussed? 
-- Clearly distinguish between what "I" did versus what {} did - -KEYWORDS (comma-separated): -5-10 specific keywords that capture this conversation's unique content: -- Proper nouns (people, places, brands) -- Specific activities ("drum corps audition" not just "music") -- Distinctive terms that make this day unique - -Date: {} ({}) -Messages: -{} - -YOUR RESPONSE (follow this format EXACTLY): -Summary: [Start directly with content, NO preamble] - -Keywords: [specific, unique terms]"#, - contact, - contact, - date.format("%B %d, %Y"), - weekday, - messages_text - ); + let (prompt, system_prompt) = build_daily_summary_prompt(contact, date, messages); // Generate summary with LLM - let summary = ollama - .generate( - &prompt, - Some("You are a conversation summarizer. Create clear, factual summaries with precise subject attribution AND extract distinctive keywords. Focus on specific, unique terms that differentiate this conversation from others."), - ) - .await?; + let summary = ollama.generate(&prompt, Some(system_prompt)).await?; log::debug!( "Generated summary for {}: {}", @@ -381,8 +398,7 @@ Keywords: [specific, unique terms]"#, message_count: messages.len() as i32, embedding, created_at: Utc::now().timestamp(), - // model_version: "nomic-embed-text:v1.5".to_string(), - model_version: "mxbai-embed-large:335m".to_string(), + model_version: EMBEDDING_MODEL.to_string(), }; // Create context from current span for DB operation diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index a1edd0c..bffd141 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -13,6 +13,7 @@ use crate::ai::llm_client::LlmClient; use crate::ai::ollama::{ChatMessage, OllamaClient, Tool}; use crate::ai::openrouter::OpenRouterClient; use crate::ai::sms_client::SmsApiClient; +use crate::ai::user_display_name; use crate::database::models::InsertPhotoInsight; use crate::database::{ CalendarEventDao, DailySummaryDao, ExifDao, InsightDao, KnowledgeDao, 
LocationHistoryDao, @@ -1260,10 +1261,14 @@ impl InsightGenerator { // Format a sample of messages for topic extraction let sample_size = messages.len().min(20); + let user_name = user_display_name(); let sample_text: Vec = messages .iter() .take(sample_size) - .map(|m| format!("{}: {}", if m.is_sent { "Me" } else { &m.contact }, m.body)) + .map(|m| { + let sender: &str = if m.is_sent { &user_name } else { &m.contact }; + format!("{}: {}", sender, m.body) + }) .collect(); let prompt = format!( @@ -1361,10 +1366,11 @@ Return ONLY the summary, nothing else."#, } // Format messages + let user_name = user_display_name(); let formatted: Vec = messages .iter() .map(|m| { - let sender = if m.is_sent { "Me" } else { &m.contact }; + let sender: &str = if m.is_sent { &user_name } else { &m.contact }; let timestamp = chrono::DateTime::from_timestamp(m.timestamp, 0) .map(|dt| { dt.with_timezone(&Local) @@ -1624,7 +1630,21 @@ Return ONLY the summary, nothing else."#, async fn tool_search_messages(&self, args: &serde_json::Value) -> String { let query = match args.get("query").and_then(|v| v.as_str()) { Some(q) if !q.trim().is_empty() => q.trim(), - _ => return "Error: missing required parameter 'query'".to_string(), + _ => { + // Redirect when the model reached for this tool with a + // date/contact-shaped intent — get_sms_messages is the right + // call. Without this hint, small models often just retry + // search_messages again with the same args. + let has_date = args.get("date").is_some(); + let has_contact = args.get("contact").is_some(); + if has_date || has_contact { + return "Error: search_messages needs a 'query' (keywords/phrase). \ + To fetch messages around a date or from a contact, call \ + get_sms_messages with { date, contact? } instead." 
+ .to_string(); + } + return "Error: missing required parameter 'query'".to_string(); + } }; if query.len() < 3 { return "Error: query must be at least 3 characters".to_string(); @@ -1662,11 +1682,12 @@ Return ONLY the summary, nothing else."#, hits.len(), mode )); + let user_name = user_display_name(); for h in hits { let date = chrono::DateTime::from_timestamp(h.date, 0) .map(|dt| dt.format("%Y-%m-%d").to_string()) .unwrap_or_else(|| h.date.to_string()); - let direction = if h.type_ == 2 { "Me" } else { &h.contact_name }; + let direction: &str = if h.type_ == 2 { &user_name } else { &h.contact_name }; let score = h .similarity_score .map(|s| format!(" [score {:.2}]", s)) @@ -1726,11 +1747,12 @@ Return ONLY the summary, nothing else."#, .await { Ok(messages) if !messages.is_empty() => { + let user_name = user_display_name(); let formatted: Vec = messages .iter() .take(limit) .map(|m| { - let sender = if m.is_sent { "Me" } else { &m.contact }; + let sender: &str = if m.is_sent { &user_name } else { &m.contact }; let ts = DateTime::from_timestamp(m.timestamp, 0) .map(|dt| { dt.with_timezone(&Local) @@ -2359,7 +2381,7 @@ Return ONLY the summary, nothing else."#, ), Tool::function( "search_messages", - "Keyword/semantic/hybrid search over ALL SMS message bodies (not just summaries) across all time. Prefer this for specific phrases, proper nouns, URLs, or when you don't know the date. Modes: 'fts5' (keyword, supports \"phrase\" / prefix* / AND / NEAR(w1 w2, 5)), 'semantic' (embedding similarity), 'hybrid' (recommended — merges both via reciprocal rank fusion).", + "CONTENT search over SMS message bodies by keywords/phrases/topics across all time. Use when you're looking for specific wording (phrases, proper nouns, URLs, topics) and DON'T have a date in mind. NOT for time-based queries — if you know the date or want messages around a date, call get_sms_messages instead. 
Modes: 'fts5' (keyword, supports \"phrase\" / prefix* / AND / NEAR(w1 w2, 5)), 'semantic' (embedding similarity), 'hybrid' (recommended — merges both via reciprocal rank fusion).", serde_json::json!({ "type": "object", "required": ["query"], @@ -2382,7 +2404,7 @@ Return ONLY the summary, nothing else."#, ), Tool::function( "get_sms_messages", - "Fetch SMS/text messages near a specific date. Returns the actual message conversation. Omit contact to search across all conversations.", + "TIME-BASED fetch of SMS/text messages around a specific date (and optionally from a specific contact). Returns the actual message conversation for that window. Use this whenever you know the date or want the context around a photo's timestamp. Omit contact to search across all conversations. For keyword/topic search without a date, use search_messages instead.", serde_json::json!({ "type": "object", "required": ["date"], @@ -2561,7 +2583,7 @@ Return ONLY the summary, nothing else."#, }, "object_entity_id": { "type": "integer", - "description": "Use when the object is a known entity (e.g. Cameron's entity ID for 'is_friend_of Cameron'). Takes precedence over object_value." + "description": "Use when the object is a known entity (e.g. another person's entity ID for 'is_friend_of '). Takes precedence over object_value." }, "object_value": { "type": "string", @@ -2871,8 +2893,9 @@ Return ONLY the summary, nothing else."#, }; // 6. Clear existing entity-photo links for this file so the run starts fresh, - // and ensure the owner entity (Cameron) exists so the agent can reference it. - let cameron_entity_id: Option = { + // and ensure the owner entity exists so the agent can reference it. + let owner_name = user_display_name(); + let owner_entity_id: Option = { let mut kdao = self .knowledge_dao .lock() @@ -2888,9 +2911,12 @@ Return ONLY the summary, nothing else."#, // Upsert the owner entity so the agent always has a stable entity ID to reference. 
let owner = crate::database::models::InsertEntity { - name: "Cameron".to_string(), + name: owner_name.clone(), entity_type: "person".to_string(), - description: "The owner of this photo collection. All memories are written from Cameron's perspective.".to_string(), + description: format!( + "The owner of this photo collection. All memories are written from {}'s perspective.", + owner_name + ), embedding: None, confidence: 1.0, status: "active".to_string(), @@ -2899,11 +2925,11 @@ Return ONLY the summary, nothing else."#, }; match kdao.upsert_entity(&insight_cx, owner) { Ok(e) => { - log::info!("Cameron entity ID: {}", e.id); + log::info!("Owner entity '{}' ID: {}", owner_name, e.id); Some(e.id) } Err(e) => { - log::warn!("Failed to upsert Cameron entity: {:?}", e); + log::warn!("Failed to upsert owner entity '{}': {:?}", owner_name, e); None } } @@ -2953,28 +2979,30 @@ Return ONLY the summary, nothing else."#, }; // 8. Build system message - let cameron_id_note = match cameron_entity_id { + let owner_id_note = match owner_entity_id { Some(id) => format!( - "\n\nYour identity in the knowledge store: Cameron (entity ID: {}). \ - When storing facts where you (Cameron) are the object — for example, someone is your friend, \ + "\n\nYour identity in the knowledge store: {name} (entity ID: {id}). \ + When storing facts where you ({name}) are the object — for example, someone is your friend, \ sibling, or colleague — use subject_entity_id for the other person and set object_value to \ - \"Cameron\" (or use store_fact with the other person as subject). When storing facts about \ - Cameron directly, use {} as the subject_entity_id.", - id, id + \"{name}\" (or use store_fact with the other person as subject). 
When storing facts about \ + {name} directly, use {id} as the subject_entity_id.", + name = owner_name, + id = id ), None => String::new(), }; let base_system = format!( - "You are a personal photo memory assistant helping to reconstruct a memory from a photo.{cameron_id_note}\n\n\ + "You are a personal photo memory assistant helping to reconstruct a memory from a photo.{owner_id_note}\n\n\ IMPORTANT INSTRUCTIONS:\n\ 1. You MUST call multiple tools to gather context BEFORE writing any final insight. Do not produce a final answer after only one or two tool calls.\n\ - 2. When calling get_sms_messages and search_rag, always make at least one call WITHOUT a contact filter to capture what else was happening in Cameron's life around this date — other conversations, events, and activities provide important wider context even when a specific contact is known.\n\ + 2. When calling get_sms_messages and search_rag, always make at least one call WITHOUT a contact filter to capture what else was happening in {owner_name}'s life around this date — other conversations, events, and activities provide important wider context even when a specific contact is known.\n\ 3. Use recall_facts_for_photo to load any previously stored knowledge about subjects in this photo.\n\ 4. Use recall_entities to look up known people, places, or things that appear in this photo.\n\ 5. When you identify people, places, events, or notable things in this photo: use store_entity to record them and store_fact to record key facts (relationships, roles, attributes). This builds a persistent memory for future insights.\n\ 6. Only produce your final insight AFTER you have gathered context from at least 5 tool calls.\n\ 7. 
If a tool returns no results, that is useful information — continue calling the remaining tools anyway.", - cameron_id_note = cameron_id_note + owner_id_note = owner_id_note, + owner_name = owner_name ); let system_content = if let Some(ref custom) = custom_system_prompt { format!("{}\n\n{}", custom, base_system) @@ -3125,7 +3153,10 @@ Return ONLY the summary, nothing else."#, iterations_used ); messages.push(ChatMessage::user( - "Based on the context gathered, please write the final photo insight: a title and a detailed personal summary. Write in first person as Cameron.", + &format!( + "Based on the context gathered, please write the final photo insight: a title and a detailed personal summary. Write in first person as {}.", + user_display_name() + ), )); let (final_response, prompt_tokens, eval_tokens) = chat_backend .chat_with_tools(messages.clone(), vec![]) diff --git a/src/ai/mod.rs b/src/ai/mod.rs index 8e38930..94e8541 100644 --- a/src/ai/mod.rs +++ b/src/ai/mod.rs @@ -9,7 +9,10 @@ pub mod sms_client; // strip_summary_boilerplate is used by binaries (test_daily_summary), not the library #[allow(unused_imports)] -pub use daily_summary_job::{generate_daily_summaries, strip_summary_boilerplate}; +pub use daily_summary_job::{ + DAILY_SUMMARY_MESSAGE_LIMIT, DAILY_SUMMARY_SYSTEM_PROMPT, build_daily_summary_prompt, + generate_daily_summaries, strip_summary_boilerplate, +}; pub use handlers::{ chat_history_handler, chat_rewind_handler, chat_stream_handler, chat_turn_handler, delete_insight_handler, export_training_data_handler, generate_agentic_insight_handler, @@ -21,5 +24,14 @@ pub use insight_generator::InsightGenerator; pub use llm_client::{ ChatMessage, LlmClient, ModelCapabilities, Tool, ToolCall, ToolCallFunction, ToolFunction, }; -pub use ollama::OllamaClient; +pub use ollama::{EMBEDDING_MODEL, OllamaClient}; pub use sms_client::{SmsApiClient, SmsMessage}; + +/// Display name used for the user in message transcripts and first-person +/// prompt text. 
Reads the `USER_NAME` env var; defaults to `"Me"`. Models +/// often confuse `"Me:"` in a transcript with their own role — setting +/// `USER_NAME=Cameron` (or similar) in the environment eliminates that +/// ambiguity across daily summaries, insight generation, and chat. +pub fn user_display_name() -> String { + std::env::var("USER_NAME").unwrap_or_else(|_| "Me".to_string()) +} diff --git a/src/ai/ollama.rs b/src/ai/ollama.rs index 1dc67f8..81185f0 100644 --- a/src/ai/ollama.rs +++ b/src/ai/ollama.rs @@ -19,6 +19,11 @@ pub use crate::ai::llm_client::{ToolCall, ToolCallFunction, ToolFunction}; // Cache duration: 15 minutes const CACHE_DURATION_SECS: u64 = 15 * 60; +/// Embedding model used across the app. Callers that persist a +/// `model_version` alongside an embedding should read this constant so the +/// stored label always matches what `generate_embeddings` actually ran. +pub const EMBEDDING_MODEL: &str = "nomic-embed-text:v1.5"; + // Cached entry with timestamp #[derive(Clone)] struct CachedEntry { @@ -349,6 +354,12 @@ impl OllamaClient { } let result: OllamaResponse = response.json().await?; + log_chat_metrics( + result.prompt_eval_count, + result.prompt_eval_duration, + result.eval_count, + result.eval_duration, + ); Ok(result.response) } @@ -481,6 +492,7 @@ Capture the key moment or theme. Return ONLY the title, nothing else."#, ) -> Result { let location_str = location.unwrap_or("Unknown"); let sms_str = sms_summary.unwrap_or("No messages"); + let user_name = crate::ai::user_display_name(); let prompt = if image_base64.is_some() { if let Some(contact_name) = contact { @@ -492,13 +504,14 @@ Location: {} Person/Contact: {} Messages: {} -Analyze the image and use specific details from both the visual content and the context above. The photo is from a folder for {}, so they are likely in or related to this photo. Mention people's names (especially {}), places, or activities if they appear in either the image or the context. 
Write in first person as Cameron with the tone of a journal entry. If limited information is available, keep it simple and factual based on what you see and know. If the location is unknown omit it"#, +Analyze the image and use specific details from both the visual content and the context above. The photo is from a folder for {}, so they are likely in or related to this photo. Mention people's names (especially {}), places, or activities if they appear in either the image or the context. Write in first person as {} with the tone of a journal entry. If limited information is available, keep it simple and factual based on what you see and know. If the location is unknown omit it"#, date.format("%B %d, %Y"), location_str, contact_name, sms_str, contact_name, - contact_name + contact_name, + user_name ) } else { format!( @@ -508,10 +521,11 @@ Date: {} Location: {} Messages: {} -Analyze the image and use specific details from both the visual content and the context above. Mention people's names, places, or activities if they appear in either the image or the context. Write in first person as Cameron with the tone of a journal entry. If limited information is available, keep it simple and factual based on what you see and know. If the location is unknown omit it"#, +Analyze the image and use specific details from both the visual content and the context above. Mention people's names, places, or activities if they appear in either the image or the context. Write in first person as {} with the tone of a journal entry. If limited information is available, keep it simple and factual based on what you see and know. If the location is unknown omit it"#, date.format("%B %d, %Y"), location_str, - sms_str + sms_str, + user_name ) } } else if let Some(contact_name) = contact { @@ -523,13 +537,14 @@ Analyze the image and use specific details from both the visual content and the Person/Contact: {} Messages: {} - Use only the specific details provided above. 
The photo is from a folder for {}, so they are likely related to this moment. Mention people's names (especially {}), places, or activities if they appear in the context. Write in first person as Cameron with the tone of a journal entry. If limited information is available, keep it simple and factual. If the location is unknown omit it"#, + Use only the specific details provided above. The photo is from a folder for {}, so they are likely related to this moment. Mention people's names (especially {}), places, or activities if they appear in the context. Write in first person as {} with the tone of a journal entry. If limited information is available, keep it simple and factual. If the location is unknown omit it"#, date.format("%B %d, %Y"), location_str, contact_name, sms_str, contact_name, - contact_name + contact_name, + user_name ) } else { format!( @@ -539,10 +554,11 @@ Analyze the image and use specific details from both the visual content and the Location: {} Messages: {} - Use only the specific details provided above. Mention people's names, places, or activities if they appear in the context. Write in first person as Cameron with the tone of a journal entry. If limited information is available, keep it simple and factual. If the location is unknown omit it"#, + Use only the specific details provided above. Mention people's names, places, or activities if they appear in the context. Write in first person as {} with the tone of a journal entry. If limited information is available, keep it simple and factual. 
If the location is unknown omit it"#, date.format("%B %d, %Y"), location_str, - sms_str + sms_str, + user_name ) }; @@ -892,7 +908,7 @@ Analyze the image and use specific details from both the visual content and the /// Returns a vector of 768-dimensional vectors /// This is much more efficient than calling generate_embedding multiple times pub async fn generate_embeddings(&self, texts: &[&str]) -> Result>> { - let embedding_model = "nomic-embed-text:v1.5"; + let embedding_model = EMBEDDING_MODEL; log::debug!("=== Ollama Batch Embedding Request ==="); log::debug!("Model: {}", embedding_model); @@ -1145,6 +1161,14 @@ struct OllamaStreamMessage { #[derive(Deserialize)] struct OllamaResponse { response: String, + #[serde(default)] + prompt_eval_count: Option, + #[serde(default)] + prompt_eval_duration: Option, + #[serde(default)] + eval_count: Option, + #[serde(default)] + eval_duration: Option, } fn log_chat_metrics( diff --git a/src/ai/sms_client.rs b/src/ai/sms_client.rs index 57d28a1..ad6d28e 100644 --- a/src/ai/sms_client.rs +++ b/src/ai/sms_client.rs @@ -299,12 +299,13 @@ impl SmsApiClient { } // Create prompt for Ollama with sender/receiver distinction + let user_name = crate::ai::user_display_name(); let messages_text: String = messages .iter() .take(60) // Limit to avoid token overflow .map(|m| { if m.is_sent { - format!("Me: {}", m.body) + format!("{}: {}", user_name, m.body) } else { format!("{}: {}", m.contact, m.body) } diff --git a/src/bin/test_daily_summary.rs b/src/bin/test_daily_summary.rs index fbbb621..aff2790 100644 --- a/src/bin/test_daily_summary.rs +++ b/src/bin/test_daily_summary.rs @@ -1,7 +1,10 @@ use anyhow::Result; use chrono::NaiveDate; use clap::Parser; -use image_api::ai::{OllamaClient, SmsApiClient, strip_summary_boilerplate}; +use image_api::ai::{ + EMBEDDING_MODEL, OllamaClient, SmsApiClient, build_daily_summary_prompt, + strip_summary_boilerplate, user_display_name, +}; use image_api::database::{DailySummaryDao, InsertDailySummary, 
SqliteDailySummaryDao}; use std::env; use std::sync::{Arc, Mutex}; @@ -25,6 +28,26 @@ struct Args { #[arg(short, long)] model: Option, + /// Context window size passed as Ollama `num_ctx`. Omit for server default. + #[arg(long)] + num_ctx: Option, + + /// Sampling temperature. Omit for server default. + #[arg(long)] + temperature: Option, + + /// Top-p (nucleus) sampling. Omit for server default. + #[arg(long)] + top_p: Option, + + /// Top-k sampling. Omit for server default. + #[arg(long)] + top_k: Option, + + /// Min-p sampling. Omit for server default. + #[arg(long)] + min_p: Option, + /// Test mode: Generate but don't save to database (shows output only) #[arg(short = 't', long, default_value_t = false)] test_mode: bool, @@ -86,12 +109,28 @@ async fn main() -> Result<()> { .unwrap_or_else(|_| "nemotron-3-nano:30b".to_string()) }); - let ollama = OllamaClient::new( + let mut ollama = OllamaClient::new( ollama_primary_url, ollama_fallback_url.clone(), model_to_use.clone(), Some(model_to_use), // Use same model for fallback ); + if let Some(ctx) = args.num_ctx { + ollama.set_num_ctx(Some(ctx)); + } + if args.temperature.is_some() + || args.top_p.is_some() + || args.top_k.is_some() + || args.min_p.is_some() + { + ollama.set_sampling_params(args.temperature, args.top_p, args.top_k, args.min_p); + } + + // Surface what's actually configured so comparison runs are auditable. 
+ println!( + "num_ctx={:?} temperature={:?} top_p={:?} top_k={:?} min_p={:?}", + args.num_ctx, args.temperature, args.top_p, args.top_k, args.min_p + ); let sms_api_url = env::var("SMS_API_URL").unwrap_or_else(|_| "http://localhost:8000".to_string()); @@ -160,9 +199,10 @@ async fn main() -> Result<()> { println!("━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━"); if args.verbose { + let user_name = user_display_name(); println!("\nMessage preview:"); for (i, msg) in messages.iter().take(3).enumerate() { - let sender = if msg.is_sent { "Me" } else { &msg.contact }; + let sender: &str = if msg.is_sent { &user_name } else { &msg.contact }; let preview = msg.body.chars().take(60).collect::(); println!(" {}. {}: {}...", i + 1, sender, preview); } @@ -172,64 +212,11 @@ async fn main() -> Result<()> { println!(); } - // Format messages for LLM - let messages_text: String = messages - .iter() - .take(200) - .map(|m| { - if m.is_sent { - format!("Me: {}", m.body) - } else { - format!("{}: {}", m.contact, m.body) - } - }) - .collect::>() - .join("\n"); - - let prompt = format!( - r#"Summarize this day's conversation between me and {}. - -CRITICAL FORMAT RULES: -- Do NOT start with "Based on the conversation..." or "Here is a summary..." or similar preambles -- Do NOT repeat the date at the beginning -- Start DIRECTLY with the content - begin with a person's name or action -- Write in past tense, as if recording what happened - -NARRATIVE (3-5 sentences): -- What specific topics, activities, or events were discussed? -- What places, people, or organizations were mentioned? -- What plans were made or decisions discussed? 
-- Clearly distinguish between what "I" did versus what {} did - -KEYWORDS (comma-separated): -5-10 specific keywords that capture this conversation's unique content: -- Proper nouns (people, places, brands) -- Specific activities ("drum corps audition" not just "music") -- Distinctive terms that make this day unique - -Date: {} ({}) -Messages: -{} - -YOUR RESPONSE (follow this format EXACTLY): -Summary: [Start directly with content, NO preamble] - -Keywords: [specific, unique terms]"#, - args.contact, - args.contact, - date.format("%B %d, %Y"), - weekday, - messages_text - ); + let (prompt, system_prompt) = build_daily_summary_prompt(&args.contact, date, messages); println!("Generating summary..."); - let summary = ollama - .generate( - &prompt, - Some("You are a conversation summarizer. Create clear, factual summaries with precise subject attribution AND extract distinctive keywords. Focus on specific, unique terms that differentiate this conversation from others."), - ) - .await?; + let summary = ollama.generate(&prompt, Some(system_prompt)).await?; println!("\n📝 GENERATED SUMMARY:"); println!("─────────────────────────────────────────"); @@ -256,8 +243,7 @@ Keywords: [specific, unique terms]"#, message_count: messages.len() as i32, embedding, created_at: chrono::Utc::now().timestamp(), - // model_version: "nomic-embed-text:v1.5".to_string(), - model_version: "mxbai-embed-large:335m".to_string(), + model_version: EMBEDDING_MODEL.to_string(), }; let mut dao = summary_dao.lock().expect("Unable to lock DailySummaryDao"); -- 2.49.1 From aa651d1c7b0358c364c4aed524421cdceefedc84 Mon Sep 17 00:00:00 2001 From: Cameron Date: Thu, 23 Apr 2026 16:28:48 -0400 Subject: [PATCH 12/24] feat(ai): iteration budget in prompt + preserve photo-knowledge links - Inject the max-iterations budget into the agentic system prompt for both insight generation and chat turns. 
Chat does this per-turn by appending a note to the replayed system message and restoring it before persistence so the note doesn't accumulate across turns. - Stop deleting entity_photo_links at the start of agentic insight generation. The clear made recall_facts_for_photo always return empty, wasting a tool call and discarding knowledge from prior runs. Re-linking the same entity is already an INSERT OR IGNORE no-op. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ai/insight_chat.rs | 50 +++++++++++++++++++++++++++++++++++++ src/ai/insight_generator.rs | 21 +++++++--------- 2 files changed, 59 insertions(+), 12 deletions(-) diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index a4d21ea..01dd4de 100644 --- a/src/ai/insight_chat.rs +++ b/src/ai/insight_chat.rs @@ -382,6 +382,12 @@ impl InsightChatService { // 7. Append the new user turn. messages.push(ChatMessage::user(req.user_message.clone())); + // Temporarily annotate the system message with this turn's iteration + // budget so the model knows how many tool-calling rounds it has. We + // restore the original content before persistence so the note doesn't + // accumulate across turns. + let original_system_content = annotate_system_with_budget(&mut messages, max_iterations); + let insight_cx = parent_cx.with_span(span); // 8. Agentic loop — same shape as insight_generator's, but capped @@ -468,6 +474,10 @@ impl InsightChatService { loop_cx.span().set_status(Status::Ok); + // Drop the per-turn iteration-budget note from the system message + // before we persist so it doesn't snowball on each subsequent turn. + restore_system_content(&mut messages, original_system_content); + // 9. Persist. Append mode rewrites the JSON blob in place; amend // mode regenerates the title and inserts a new insight row, // relying on store_insight to flip prior rows' is_current=false. 
@@ -789,6 +799,8 @@ impl InsightChatService { messages.push(ChatMessage::user(req.user_message.clone())); + let original_system_content = annotate_system_with_budget(&mut messages, max_iterations); + let mut tool_calls_made = 0usize; let mut iterations_used = 0usize; let mut last_prompt_eval_count: Option = None; @@ -917,6 +929,10 @@ impl InsightChatService { messages.push(final_response); } + // Drop the per-turn iteration-budget note from the system message + // before we persist so it doesn't snowball on each subsequent turn. + restore_system_content(&mut messages, original_system_content); + // Persist. let json = serde_json::to_string(&messages) .map_err(|e| anyhow!("failed to serialize chat history: {}", e))?; @@ -1080,6 +1096,40 @@ fn env_max_iterations() -> usize { .max(1) } +/// Append a per-turn iteration-budget reminder to the replayed system +/// message so the model knows how many tool-calling rounds this turn gets. +/// Returns the original `content` so the caller can restore it before +/// persistence — otherwise the note would accumulate across turns. +/// +/// No-op (returns `None`) when `messages` has no leading system message. +fn annotate_system_with_budget( + messages: &mut [ChatMessage], + max_iterations: usize, +) -> Option { + let first = messages.first_mut()?; + if first.role != "system" { + return None; + } + let original = first.content.clone(); + first.content = format!( + "{}\n\n(Budget for this chat turn: up to {} tool-calling iterations. Produce your final reply before the budget is exhausted.)", + first.content, max_iterations + ); + Some(original) +} + +/// Restore a system-message content previously captured by +/// [`annotate_system_with_budget`]. No-op when `original` is `None` or the +/// first message isn't a system message. 
+fn restore_system_content(messages: &mut [ChatMessage], original: Option) { + let Some(original) = original else { return }; + if let Some(first) = messages.first_mut() + && first.role == "system" + { + first.content = original; + } +} + /// View returned to clients for chat-UI rendering. #[derive(Debug)] pub struct HistoryView { diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index bffd141..8a0ed11 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -2892,8 +2892,11 @@ Return ONLY the summary, nothing else."#, .collect() }; - // 6. Clear existing entity-photo links for this file so the run starts fresh, - // and ensure the owner entity exists so the agent can reference it. + // 6. Ensure the owner entity exists so the agent can reference it. + // Prior entity_photo_links for this file are intentionally preserved + // across regenerations — clearing them made `recall_facts_for_photo` + // always return empty and discarded hard-won knowledge. Re-linking + // the same entity is a no-op (INSERT OR IGNORE). let owner_name = user_display_name(); let owner_entity_id: Option = { let mut kdao = self @@ -2901,14 +2904,6 @@ Return ONLY the summary, nothing else."#, .lock() .expect("Unable to lock KnowledgeDao"); - if let Err(e) = kdao.delete_photo_links_for_file(&insight_cx, &file_path) { - log::warn!( - "Failed to clear entity_photo_links for {}: {:?}", - file_path, - e - ); - } - // Upsert the owner entity so the agent always has a stable entity ID to reference. let owner = crate::database::models::InsertEntity { name: owner_name.clone(), @@ -3000,9 +2995,11 @@ Return ONLY the summary, nothing else."#, 4. Use recall_entities to look up known people, places, or things that appear in this photo.\n\ 5. When you identify people, places, events, or notable things in this photo: use store_entity to record them and store_fact to record key facts (relationships, roles, attributes). 
This builds a persistent memory for future insights.\n\ 6. Only produce your final insight AFTER you have gathered context from at least 5 tool calls.\n\ - 7. If a tool returns no results, that is useful information — continue calling the remaining tools anyway.", + 7. If a tool returns no results, that is useful information — continue calling the remaining tools anyway.\n\ + 8. You have a hard budget of {max_iterations} tool-calling iterations before the loop ends. Plan your context gathering so you can write a complete final insight within that budget.", owner_id_note = owner_id_note, - owner_name = owner_name + owner_name = owner_name, + max_iterations = max_iterations ); let system_content = if let Some(ref custom) = custom_system_prompt { format!("{}\n\n{}", custom, base_system) -- 2.49.1 From d54419e779a3e6e49a41cc728ac93128309ecd8c Mon Sep 17 00:00:00 2001 From: Cameron Date: Thu, 23 Apr 2026 17:19:59 -0400 Subject: [PATCH 13/24] style: cargo fmt drift Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ai/daily_summary_job.rs | 3 +-- src/bin/test_daily_summary.rs | 6 +++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/ai/daily_summary_job.rs b/src/ai/daily_summary_job.rs index 18ad15c..3fede71 100644 --- a/src/ai/daily_summary_job.rs +++ b/src/ai/daily_summary_job.rs @@ -18,8 +18,7 @@ use crate::otel::global_tracer; pub const DAILY_SUMMARY_MESSAGE_LIMIT: usize = 300; /// System prompt used when generating daily conversation summaries. -pub const DAILY_SUMMARY_SYSTEM_PROMPT: &str = - "You are a conversation summarizer. Create clear, factual summaries with \ +pub const DAILY_SUMMARY_SYSTEM_PROMPT: &str = "You are a conversation summarizer. Create clear, factual summaries with \ precise subject attribution AND extract distinctive keywords. 
Focus on \ specific, unique terms that differentiate this conversation from others."; diff --git a/src/bin/test_daily_summary.rs b/src/bin/test_daily_summary.rs index aff2790..d04aa32 100644 --- a/src/bin/test_daily_summary.rs +++ b/src/bin/test_daily_summary.rs @@ -202,7 +202,11 @@ async fn main() -> Result<()> { let user_name = user_display_name(); println!("\nMessage preview:"); for (i, msg) in messages.iter().take(3).enumerate() { - let sender: &str = if msg.is_sent { &user_name } else { &msg.contact }; + let sender: &str = if msg.is_sent { + &user_name + } else { + &msg.contact + }; let preview = msg.body.chars().take(60).collect::(); println!(" {}. {}: {}...", i + 1, sender, preview); } -- 2.49.1 From dc2a96162e3210d8dacdaa5c8caa99ba15378260 Mon Sep 17 00:00:00 2001 From: Cameron Date: Thu, 23 Apr 2026 17:20:12 -0400 Subject: [PATCH 14/24] fix(dates): prefer earliest of fs created/modified as fallback On copied or restored files (e.g. a backup library), the OS stamps created at copy time while modified is preserved from the source, so the earlier of the two is a better proxy for when the content originated. Adds utils::earliest_fs_time and threads it through the three spots that fall back to filesystem dates: photos-list sort, memories grouping, and insight-generation timestamp. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ai/insight_generator.rs | 16 ++++++++++------ src/files.rs | 3 ++- src/memories.rs | 5 +++-- src/utils.rs | 16 ++++++++++++++++ 4 files changed, 31 insertions(+), 9 deletions(-) diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index 8a0ed11..dad7040 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -23,7 +23,7 @@ use crate::libraries::Library; use crate::memories::extract_date_from_filename; use crate::otel::global_tracer; use crate::tags::TagDao; -use crate::utils::normalize_path; +use crate::utils::{earliest_fs_time, normalize_path}; #[derive(Deserialize)] struct NominatimResponse { @@ -760,8 +760,6 @@ impl InsightGenerator { let full_path = self.resolve_full_path(&file_path)?; File::open(&full_path) .and_then(|f| f.metadata()) - .and_then(|m| m.created().or(m.modified())) - .map(|t| DateTime::::from(t).timestamp()) .inspect_err(|e| { log::warn!( "Failed to get file timestamp for insight {}: {}", @@ -770,6 +768,8 @@ impl InsightGenerator { ) }) .ok() + .and_then(|m| earliest_fs_time(&m)) + .map(|t| DateTime::::from(t).timestamp()) }) .unwrap_or_else(|| Utc::now().timestamp()) }; @@ -1687,7 +1687,11 @@ Return ONLY the summary, nothing else."#, let date = chrono::DateTime::from_timestamp(h.date, 0) .map(|dt| dt.format("%Y-%m-%d").to_string()) .unwrap_or_else(|| h.date.to_string()); - let direction: &str = if h.type_ == 2 { &user_name } else { &h.contact_name }; + let direction: &str = if h.type_ == 2 { + &user_name + } else { + &h.contact_name + }; let score = h .similarity_score .map(|s| format!(" [score {:.2}]", s)) @@ -2858,8 +2862,6 @@ Return ONLY the summary, nothing else."#, let full_path = self.resolve_full_path(&file_path)?; File::open(&full_path) .and_then(|f| f.metadata()) - .and_then(|m| m.created().or(m.modified())) - .map(|t| DateTime::::from(t).timestamp()) .inspect_err(|e| { log::warn!( "Failed to get file timestamp for agentic insight {}: {}", @@ 
-2868,6 +2870,8 @@ Return ONLY the summary, nothing else."#, ) }) .ok() + .and_then(|m| earliest_fs_time(&m)) + .map(|t| DateTime::::from(t).timestamp()) }) .unwrap_or_else(|| Utc::now().timestamp()) }; diff --git a/src/files.rs b/src/files.rs index 561414c..75343b1 100644 --- a/src/files.rs +++ b/src/files.rs @@ -15,6 +15,7 @@ use crate::database::ExifDao; use crate::file_types; use crate::geo::{gps_bounding_box, haversine_distance}; use crate::memories::extract_date_from_filename; +use crate::utils::earliest_fs_time; use crate::{AppState, create_thumbnails}; use actix_web::web::Data; use actix_web::{ @@ -138,8 +139,8 @@ fn in_memory_date_sort( lib_roots.get(&lib_id).and_then(|root| { let full_path = Path::new(root).join(&f.file_name); std::fs::metadata(full_path) - .and_then(|md| md.created().or(md.modified())) .ok() + .and_then(|md| earliest_fs_time(&md)) .map(|system_time| { >>::into(system_time).timestamp() }) diff --git a/src/memories.rs b/src/memories.rs index 875a72c..0e2aad5 100644 --- a/src/memories.rs +++ b/src/memories.rs @@ -19,6 +19,7 @@ use crate::files::is_image_or_video; use crate::libraries::Library; use crate::otel::{extract_context_from_request, global_tracer}; use crate::state::AppState; +use crate::utils::earliest_fs_time; // Helper that encapsulates path-exclusion semantics #[derive(Debug)] @@ -336,8 +337,8 @@ fn get_memory_date_with_priority( return Some((date, Some(exif_timestamp), modified)); } - // Priority 3: Fall back to metadata - let system_time = meta.created().ok().or_else(|| meta.modified().ok())?; + // Priority 3: Fall back to metadata (earlier of created/modified — see utils::earliest_fs_time) + let system_time = earliest_fs_time(&meta)?; let dt_utc: DateTime = system_time.into(); let date_in_timezone = if let Some(tz) = client_timezone { diff --git a/src/utils.rs b/src/utils.rs index 1779c15..fdfef9b 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,3 +1,5 @@ +use std::time::SystemTime; + /// Normalize a file path to use 
forward slashes for cross-platform consistency /// This ensures paths stored in the database always use `/` regardless of OS /// @@ -12,6 +14,20 @@ pub fn normalize_path(path: &str) -> String { path.replace('\\', "/") } +/// Pick the earlier of a file's created and modified timestamps. +/// +/// On copied/restored files (e.g., a backup library), `created` is stamped at +/// copy time while `modified` is preserved from the source — so the earlier +/// of the two is a better proxy for when the content originated. Falls back +/// to whichever timestamp is available if one platform lacks the other. +pub fn earliest_fs_time(md: &std::fs::Metadata) -> Option { + match (md.created().ok(), md.modified().ok()) { + (Some(c), Some(m)) => Some(c.min(m)), + (Some(t), None) | (None, Some(t)) => Some(t), + (None, None) => None, + } +} + #[cfg(test)] mod tests { use super::*; -- 2.49.1 From 13b9d54861882dc6fe21b53eb9989ce6c38b9bf8 Mon Sep 17 00:00:00 2001 From: Cameron Date: Thu, 23 Apr 2026 20:47:13 -0400 Subject: [PATCH 15/24] fix(scan): quiet startup scans & thumbnail RAW/HEIC Three recurring issues on every full scan: 1. Video playlist scans re-enqueued every file only to reject it as AlreadyExists. Pre-filter in ScanDirectoryMessage and QueueVideosMessage so we skip videos whose .m3u8 already exists, and demote the leaked AlreadyExists log to debug. 2. image crate was built with only jpeg/png features, so webp/tiff/avif files logged "format not supported" every scan. Enable those features. 3. RAW (ARW/NEF/CR2/...) and HEIC thumbnails weren't generated, so the scan kept retrying them. Try the file's embedded JPEG preview via kamadak-exif first (fast, pure-Rust, works on Sony ARW where ffmpeg's TIFF decoder fails). Fall back to ffmpeg for HEIC/HEIF and RAWs with no preview. Anything still undecodable gets a .unsupported sentinel so future scans skip it. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- Cargo.lock | 70 +++++++++++++++++++++++++ Cargo.toml | 2 +- README.md | 33 +++++++++++- src/exif.rs | 56 +++++++++++++++++++- src/file_types.rs | 15 +++++- src/main.rs | 121 +++++++++++++++++++++++++++++--------------- src/video/actors.rs | 51 ++++++++++++++++++- 7 files changed, 300 insertions(+), 48 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a404abc..d4a65c9 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -908,6 +908,12 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + [[package]] name = "crypto-common" version = "0.1.6" @@ -1218,6 +1224,26 @@ version = "2.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "fax" +version = "0.2.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f05de7d48f37cd6730705cbca900770cab77a89f413d23e100ad7fad7795a0ab" +dependencies = [ + "fax_derive", +] + +[[package]] +name = "fax_derive" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0aca10fb742cb43f9e7bb8467c91aa9bcb8e3ffbc6a6f7389bb93ffc920577d" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + [[package]] name = "fdeflate" version = "0.3.7" @@ -1501,6 +1527,17 @@ dependencies = [ "tracing", ] +[[package]] +name = "half" +version = "2.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6ea2d84b969582b4b1864a92dc5d27cd2b77b622a8d79306834f1be5ba20d84b" +dependencies = [ + "cfg-if", + "crunchy", + "zerocopy", +] + [[package]] name = "hashbrown" version = "0.14.5" @@ -1843,11 +1880,14 @@ 
checksum = "1c6a3ce16143778e24df6f95365f12ed105425b22abefd289dd88a64bab59605" dependencies = [ "bytemuck", "byteorder-lite", + "image-webp", "moxcms", "num-traits", "png", "ravif", "rayon", + "rgb", + "tiff", "zune-core", "zune-jpeg", ] @@ -1908,6 +1948,16 @@ dependencies = [ "zerocopy", ] +[[package]] +name = "image-webp" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "525e9ff3e1a4be2fbea1fdf0e98686a6d98b4d8f937e1bf7402245af1909e8c3" +dependencies = [ + "byteorder-lite", + "quick-error", +] + [[package]] name = "imgref" version = "1.11.0" @@ -3712,6 +3762,20 @@ dependencies = [ "syn", ] +[[package]] +name = "tiff" +version = "0.10.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af9605de7fee8d9551863fd692cce7637f548dbd9db9180fcc07ccc6d26c336f" +dependencies = [ + "fax", + "flate2", + "half", + "quick-error", + "weezl", + "zune-jpeg", +] + [[package]] name = "time" version = "0.3.42" @@ -4269,6 +4333,12 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "weezl" +version = "0.1.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a28ac98ddc8b9274cb41bb4d9d4d5c425b6020c50c46f25559911905610b4a88" + [[package]] name = "winapi" version = "0.3.9" diff --git a/Cargo.toml b/Cargo.toml index 2b966b7..847c9f6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -29,7 +29,7 @@ chrono = "0.4" clap = { version = "4.5", features = ["derive"] } dotenv = "0.15" bcrypt = "0.17.1" -image = { version = "0.25.5", default-features = false, features = ["jpeg", "png", "rayon"] } +image = { version = "0.25.5", default-features = false, features = ["jpeg", "png", "rayon", "webp", "tiff", "avif"] } infer = "0.16" walkdir = "2.4.0" rayon = "1.5" diff --git a/README.md b/README.md index b625b04..31978d9 100644 --- a/README.md +++ b/README.md @@ -14,14 +14,43 @@ Upon first run it will generate thumbnails for all images and videos at `BASE_PA - **RAG-based Context Retrieval** - 
Semantic search over daily conversation summaries - **Automatic Daily Summaries** - LLM-generated summaries of daily conversations with embeddings +## External Dependencies + +### ffmpeg (required) +`ffmpeg` must be on `PATH`. It is used for: +- **HLS video streaming** — transcoding/segmenting source videos into `.m3u8` + `.ts` playlists +- **Video thumbnails** — extracting a frame at the 3-second mark +- **Video preview clips** — short looping previews for the Video Wall +- **HEIC / HEIF thumbnails** — decoding Apple's HEIC format (your ffmpeg build must include + `libheif`; most modern builds do) + +Builds used in development: the `gyan.dev` full build on Windows, and distro `ffmpeg` +packages on Linux work fine. If HEIC thumbnails silently fail, check +`ffmpeg -formats | grep heif` to confirm HEIF support. + +### RAW photo thumbnails (no extra dependency) +RAW formats (ARW, NEF, CR2, CR3, DNG, RAF, ORF, RW2, PEF, SRW, TIFF) are thumbnailed +by reading the embedded JPEG preview from the TIFF IFD1 using `kamadak-exif`. No +external RAW decoder (libraw / dcraw) is required. Files without an embedded preview +fall back to ffmpeg (works for most NEF files), and anything that still can't be +decoded is marked with a `.unsupported` sentinel in the thumbnail directory +so we don't retry it every scan. Delete those sentinels to force retries after a +tooling upgrade. + ## Environment There are a handful of required environment variables to have the API run. They should be defined where the binary is located or above it in an `.env` file. -You must have `ffmpeg` installed for streaming video and generating video thumbnails. - `DATABASE_URL` is a path or url to a database (currently only SQLite is tested) - `BASE_PATH` is the root from which you want to serve images and videos -- `THUMBNAILS` is a path where generated thumbnails should be stored +- `THUMBNAILS` is a path where generated thumbnails should be stored. 
Thumbnails + mirror the source tree under `BASE_PATH` and keep the source's original + extension (e.g. `foo.arw` or `bar.mp4`), though the file contents are always + JPEG bytes — browsers content-sniff. Files that can't be thumbnailed by the + `image` crate, ffmpeg, or an embedded RAW preview get a zero-byte + `.unsupported` sentinel in this directory so subsequent scans + skip them. Delete the `*.unsupported` files to force retries (for example + after upgrading ffmpeg or adding libheif) - `VIDEO_PATH` is a path where HLS playlists and video parts should be stored - `GIFS_DIRECTORY` is a path where generated video GIF thumbnails should be stored - `BIND_URL` is the url and port to bind to (typically your own IP address) diff --git a/src/exif.rs b/src/exif.rs index c096f71..0cd29d9 100644 --- a/src/exif.rs +++ b/src/exif.rs @@ -1,5 +1,5 @@ use std::fs::File; -use std::io::BufReader; +use std::io::{BufReader, Read, Seek, SeekFrom}; use std::path::Path; use anyhow::{Result, anyhow}; @@ -25,6 +25,60 @@ pub struct ExifData { pub date_taken: Option, } +/// TIFF-based RAW formats where `JPEGInterchangeFormat` offsets are +/// absolute file offsets (the file itself is a TIFF container). +fn is_tiff_raw(path: &Path) -> bool { + matches!( + path.extension() + .and_then(|e| e.to_str()) + .map(|s| s.to_lowercase()) + .as_deref(), + Some( + "tiff" | "tif" | "nef" | "cr2" | "arw" | "dng" | "raf" | "orf" | "rw2" | "pef" | "srw" + ) + ) +} + +/// Returns the bytes of the embedded JPEG thumbnail in a TIFF-based RAW or +/// TIFF file. Used to thumbnail formats whose RAW pixel data can't be decoded +/// by our normal tools (e.g. Sony ARW). Returns `None` if no preview is +/// present, the file isn't a TIFF container, or the data doesn't look like +/// a valid JPEG. 
+pub fn extract_embedded_jpeg_preview(path: &Path) -> Option<Vec<u8>> { + if !is_tiff_raw(path) { + return None; + } + + let file = File::open(path).ok()?; + let mut bufreader = BufReader::new(file); + let exif = Reader::new().read_from_container(&mut bufreader).ok()?; + + let offset = exif + .get_field(Tag::JPEGInterchangeFormat, In::THUMBNAIL)? + .value + .get_uint(0)?; + let length = exif + .get_field(Tag::JPEGInterchangeFormatLength, In::THUMBNAIL)? + .value + .get_uint(0)?; + if length == 0 { + return None; + } + + let mut file = File::open(path).ok()?; + file.seek(SeekFrom::Start(offset as u64)).ok()?; + let mut buf = vec![0u8; length as usize]; + file.read_exact(&mut buf).ok()?; + + // JPEG SOI marker sanity check — MakerNote offsets sometimes point at + // TIFF-wrapped previews or other non-JPEG data. + if buf.len() < 2 || buf[0] != 0xFF || buf[1] != 0xD8 { + return None; + } + + Some(buf) +} + pub fn supports_exif(path: &Path) -> bool { if let Some(ext) = path.extension() { let ext_lower = ext.to_string_lossy().to_lowercase(); diff --git a/src/file_types.rs b/src/file_types.rs index c1249d0..f312916 100644 --- a/src/file_types.rs +++ b/src/file_types.rs @@ -3,9 +3,22 @@ use walkdir::DirEntry; /// Supported image file extensions pub const IMAGE_EXTENSIONS: &[&str] = &[ - "jpg", "jpeg", "png", "webp", "tiff", "tif", "heif", "heic", "avif", "nef", + "jpg", "jpeg", "png", "webp", "tiff", "tif", "heif", "heic", "avif", "nef", "arw", ]; +/// Extensions the `image` crate cannot decode — we fall back to ffmpeg to +/// extract an embedded preview or decode the frame. +pub const FFMPEG_THUMBNAIL_EXTENSIONS: &[&str] = &["heif", "heic", "nef", "arw"]; + +/// Returns true if thumbnail generation should go through ffmpeg instead of +/// the `image` crate (RAW formats, HEIF/HEIC).
+pub fn needs_ffmpeg_thumbnail(path: &Path) -> bool { + match path.extension().and_then(|e| e.to_str()) { + Some(ext) => FFMPEG_THUMBNAIL_EXTENSIONS.contains(&ext.to_lowercase().as_str()), + None => false, + } +} + /// Supported video file extensions pub const VIDEO_EXTENSIONS: &[&str] = &["mp4", "mov", "avi", "mkv"]; diff --git a/src/main.rs b/src/main.rs index da0c4dd..db63ef0 100644 --- a/src/main.rs +++ b/src/main.rs @@ -52,7 +52,8 @@ use crate::state::AppState; use crate::tags::*; use crate::video::actors::{ GeneratePreviewClipMessage, ProcessMessage, QueueVideosMessage, ScanDirectoryMessage, - VideoPlaylistManager, create_playlist, generate_video_thumbnail, + VideoPlaylistManager, create_playlist, generate_image_thumbnail_ffmpeg, + generate_video_thumbnail, }; use log::{debug, error, info, trace, warn}; use opentelemetry::trace::{Span, Status, TraceContextExt, Tracer}; @@ -1060,6 +1061,47 @@ async fn delete_favorite( } } +/// Sentinel path written next to a would-be thumbnail when a file cannot be +/// decoded by either the `image` crate or ffmpeg. Its presence causes future +/// scans to skip the file instead of re-logging the failure. +pub fn unsupported_thumbnail_sentinel(thumb_path: &Path) -> PathBuf { + let mut s = thumb_path.as_os_str().to_owned(); + s.push(".unsupported"); + PathBuf::from(s) +} + +fn generate_image_thumbnail(src: &Path, thumb_path: &Path) -> std::io::Result<()> { + // RAW formats (ARW/NEF/CR2/etc): try the file's embedded JPEG preview + // first. Avoids ffmpeg choking on proprietary RAW compression (Sony ARW + // in particular), and is faster than decoding RAW pixels anyway. 
+ if let Some(preview) = exif::extract_embedded_jpeg_preview(src) { + let img = image::load_from_memory(&preview).map_err(|e| { + std::io::Error::new( + std::io::ErrorKind::InvalidData, + format!("decode embedded preview {:?}: {}", src, e), + ) + })?; + let scaled = img.thumbnail(200, u32::MAX); + scaled + .save_with_format(thumb_path, image::ImageFormat::Jpeg) + .map_err(|e| std::io::Error::other(format!("save {:?}: {}", thumb_path, e)))?; + return Ok(()); + } + + if file_types::needs_ffmpeg_thumbnail(src) { + return generate_image_thumbnail_ffmpeg(src, thumb_path); + } + + let img = image::open(src).map_err(|e| { + std::io::Error::new(std::io::ErrorKind::InvalidData, format!("{:?}: {}", src, e)) + })?; + let scaled = img.thumbnail(200, u32::MAX); + scaled + .save(thumb_path) + .map_err(|e| std::io::Error::other(format!("save {:?}: {}", thumb_path, e)))?; + Ok(()) +} + fn create_thumbnails(libs: &[libraries::Library]) { let tracer = global_tracer(); let span = tracer.start("creating thumbnails"); @@ -1080,17 +1122,26 @@ fn create_thumbnails(libs: &[libraries::Library]) { .into_par_iter() .filter_map(|entry| entry.ok()) .filter(|entry| entry.file_type().is_file()) - .filter(|entry| { - if is_video(entry) { - let relative_path = &entry.path().strip_prefix(&images).unwrap(); - let thumb_path = Path::new(thumbnail_directory).join(relative_path); - std::fs::create_dir_all( - thumb_path - .parent() - .unwrap_or_else(|| panic!("Thumbnail {:?} has no parent?", thumb_path)), - ) - .expect("Error creating directory"); + .for_each(|entry| { + let src = entry.path(); + let Ok(relative_path) = src.strip_prefix(&images) else { + return; + }; + let thumb_path = Path::new(thumbnail_directory).join(relative_path); + if thumb_path.exists() || unsupported_thumbnail_sentinel(&thumb_path).exists() { + return; + } + + let Some(parent) = thumb_path.parent() else { + return; + }; + if let Err(e) = std::fs::create_dir_all(parent) { + error!("Failed to create thumbnail dir {:?}: {}", 
parent, e); + return; + } + + if is_video(&entry) { let mut video_span = tracer.start_with_context( "generate_video_thumbnail", &opentelemetry::Context::new() @@ -1103,37 +1154,24 @@ fn create_thumbnails(libs: &[libraries::Library]) { ]); debug!("Generating video thumbnail: {:?}", thumb_path); - generate_video_thumbnail(entry.path(), &thumb_path); + generate_video_thumbnail(src, &thumb_path); video_span.end(); - false - } else { - is_image(entry) + } else if is_image(&entry) { + match generate_image_thumbnail(src, &thumb_path) { + Ok(_) => info!("Saved thumbnail: {:?}", thumb_path), + Err(e) => { + let sentinel = unsupported_thumbnail_sentinel(&thumb_path); + error!( + "Unable to thumbnail {:?}: {}. Writing sentinel {:?}", + src, e, sentinel + ); + if let Err(se) = std::fs::write(&sentinel, b"") { + warn!("Failed to write sentinel {:?}: {}", sentinel, se); + } + } + } } - }) - .filter(|entry| { - let path = entry.path(); - let relative_path = &path.strip_prefix(&images).unwrap(); - let thumb_path = Path::new(thumbnail_directory).join(relative_path); - !thumb_path.exists() - }) - .map(|entry| (image::open(entry.path()), entry.path().to_path_buf())) - .filter(|(img, path)| { - if let Err(e) = img { - error!("Unable to open image: {:?}. {}", path, e); - } - img.is_ok() - }) - .map(|(img, path)| (img.unwrap(), path)) - .map(|(image, path)| (image.thumbnail(200, u32::MAX), path)) - .map(|(image, path)| { - let relative_path = &path.strip_prefix(&images).unwrap(); - let thumb_path = Path::new(thumbnail_directory).join(relative_path); - std::fs::create_dir_all(thumb_path.parent().unwrap()) - .expect("There was an issue creating directory"); - info!("Saving thumbnail: {:?}", thumb_path); - image.save(thumb_path).expect("Failure saving thumbnail"); - }) - .for_each(drop); + }); } debug!("Finished making thumbnails"); @@ -1744,7 +1782,8 @@ fn process_new_files( // not just photos with parseable EXIF. 
for (file_path, relative_path) in &files { let thumb_path = thumbnail_directory.join(relative_path); - let needs_thumbnail = !thumb_path.exists(); + let needs_thumbnail = + !thumb_path.exists() && !unsupported_thumbnail_sentinel(&thumb_path).exists(); let needs_row = !existing_exif_paths.contains_key(relative_path); if needs_thumbnail || needs_row { diff --git a/src/video/actors.rs b/src/video/actors.rs index 284c8e3..e85feb4 100644 --- a/src/video/actors.rs +++ b/src/video/actors.rs @@ -48,6 +48,14 @@ impl Handler for StreamActor { } } +pub fn playlist_file_for(playlist_dir: &str, video_path: &Path) -> PathBuf { + let filename = video_path + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or("unknown"); + PathBuf::from(format!("{}/{}.m3u8", playlist_dir, filename)) +} + pub async fn create_playlist(video_path: &str, playlist_file: &str) -> Result { if Path::new(playlist_file).exists() { debug!("Playlist already exists: {}", playlist_file); @@ -103,6 +111,35 @@ pub fn generate_video_thumbnail(path: &Path, destination: &Path) { .expect("Failure to create video frame"); } +/// Use ffmpeg to extract a 200px-wide thumbnail from formats the `image` crate +/// can't decode (RAW: NEF/ARW, HEIC/HEIF). Writes JPEG bytes to `destination` +/// regardless of its extension. 
+pub fn generate_image_thumbnail_ffmpeg(path: &Path, destination: &Path) -> std::io::Result<()> { + let output = Command::new("ffmpeg") + .arg("-y") + .arg("-i") + .arg(path) + .arg("-vframes") + .arg("1") + .arg("-vf") + .arg("scale=200:-1") + .arg("-f") + .arg("image2") + .arg("-c:v") + .arg("mjpeg") + .arg(destination) + .output()?; + + if !output.status.success() { + return Err(std::io::Error::other(format!( + "ffmpeg failed ({}): {}", + output.status, + String::from_utf8_lossy(&output.stderr).trim() + ))); + } + Ok(()) +} + /// Check if a video is already encoded with h264 codec /// Returns true if the video uses h264, false otherwise or if detection fails async fn is_h264_encoded(video_path: &str) -> bool { @@ -246,15 +283,18 @@ impl Handler for VideoPlaylistManager { msg.directory ); + let playlist_output_dir = self.playlist_dir.clone(); + let playlist_dir_str = playlist_output_dir.to_str().unwrap().to_string(); + let video_files = WalkDir::new(&msg.directory) .into_iter() .filter_map(|e| e.ok()) .filter(|e| e.file_type().is_file()) .filter(is_video) + .filter(|e| !playlist_file_for(&playlist_dir_str, e.path()).exists()) .collect::>(); let scan_dir_name = msg.directory.clone(); - let playlist_output_dir = self.playlist_dir.clone(); let playlist_generator = self.playlist_generator.clone(); Box::pin(async move { @@ -285,6 +325,9 @@ impl Handler for VideoPlaylistManager { path_as_str ); } + Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => { + debug!("Playlist already exists for '{:?}', skipping", path); + } Err(e) => { warn!("Failed to generate playlist for path '{:?}'. 
{:?}", path, e); } @@ -318,14 +361,18 @@ impl Handler for VideoPlaylistManager { ); let playlist_output_dir = self.playlist_dir.clone(); + let playlist_dir_str = playlist_output_dir.to_str().unwrap().to_string(); let playlist_generator = self.playlist_generator.clone(); for video_path in msg.video_paths { + if playlist_file_for(&playlist_dir_str, &video_path).exists() { + continue; + } let path_str = video_path.to_string_lossy().to_string(); debug!("Queueing playlist generation for: {}", path_str); playlist_generator.do_send(GeneratePlaylistMessage { - playlist_path: playlist_output_dir.to_str().unwrap().to_string(), + playlist_path: playlist_dir_str.clone(), video_path, }); } -- 2.49.1 From 29f32b9d22909b09294577b4538a9a92a1847ecd Mon Sep 17 00:00:00 2001 From: Cameron Date: Fri, 24 Apr 2026 10:08:03 -0400 Subject: [PATCH 16/24] FFMPEG playlist improvements Better playlist management, .tmp renaming, HLS playlist parameter and concurrency tweaking. --- src/video/actors.rs | 420 +++++++++++++++++++++++++++++++------------- src/video/ffmpeg.rs | 6 +- 2 files changed, 300 insertions(+), 126 deletions(-) diff --git a/src/video/actors.rs b/src/video/actors.rs index e85feb4..a461efe 100644 --- a/src/video/actors.rs +++ b/src/video/actors.rs @@ -4,7 +4,6 @@ use crate::libraries::Library; use crate::otel::global_tracer; use crate::video::ffmpeg::generate_preview_clip; use actix::prelude::*; -use futures::TryFutureExt; use log::{debug, error, info, trace, warn}; use opentelemetry::KeyValue; use opentelemetry::trace::{Span, Status, Tracer}; @@ -74,9 +73,11 @@ pub async fn create_playlist(video_path: &str, playlist_file: &str) -> Result std:: Ok(()) } -/// Check if a video is already encoded with h264 codec -/// Returns true if the video uses h264, false otherwise or if detection fails -async fn is_h264_encoded(video_path: &str) -> bool { - let output = tokio::process::Command::new("ffprobe") - .arg("-v") - .arg("error") - .arg("-select_streams") - .arg("v:0") - 
.arg("-show_entries") - .arg("stream=codec_name") - .arg("-of") - .arg("default=noprint_wrappers=1:nokey=1") - .arg(video_path) - .output() - .await; - - match output { - Ok(output) if output.status.success() => { - let codec = String::from_utf8_lossy(&output.stdout); - let codec = codec.trim(); - debug!("Detected codec for {}: {}", video_path, codec); - codec == "h264" - } - Ok(output) => { - warn!( - "ffprobe failed for {}: {}", - video_path, - String::from_utf8_lossy(&output.stderr) - ); - false - } - Err(e) => { - warn!("Failed to run ffprobe for {}: {}", video_path, e); - false - } - } +/// Video stream metadata needed to pick HLS encode settings. Populated by +/// a single ffprobe call to avoid spawning multiple subprocesses per video. +#[derive(Debug, Default)] +struct VideoStreamMeta { + is_h264: bool, + /// Rotation in degrees (0/90/180/270). Checks both the legacy `rotate` + /// stream tag and the modern display-matrix side data. + rotation: i32, } -/// Check if a video has rotation metadata -/// Returns the rotation angle in degrees (0, 90, 180, 270) or 0 if none detected -/// Checks both legacy stream tags and modern display matrix side data -async fn get_video_rotation(video_path: &str) -> i32 { - // Check legacy rotate stream tag (older videos) +/// Probe video stream metadata in one ffprobe call. Returns default (codec +/// unknown, rotation 0) on any failure — callers fall back to transcoding. 
+async fn probe_video_stream_meta(video_path: &str) -> VideoStreamMeta { let output = tokio::process::Command::new("ffprobe") .arg("-v") .arg("error") .arg("-select_streams") .arg("v:0") + .arg("-print_format") + .arg("json") .arg("-show_entries") - .arg("stream_tags=rotate") - .arg("-of") - .arg("default=noprint_wrappers=1:nokey=1") + .arg("stream=codec_name:stream_tags=rotate:side_data_list") .arg(video_path) .output() .await; - if let Ok(output) = output - && output.status.success() - { - let rotation_str = String::from_utf8_lossy(&output.stdout); - let rotation_str = rotation_str.trim(); - if !rotation_str.is_empty() - && let Ok(rotation) = rotation_str.parse::<i32>() - && rotation != 0 - { - debug!( - "Detected rotation {}° from stream tag for {}", - rotation, video_path - ); - return rotation; - } + let Ok(output) = output else { + warn!("Failed to run ffprobe for {}", video_path); + return VideoStreamMeta::default(); + }; + if !output.status.success() { + warn!( + "ffprobe failed for {}: {}", + video_path, + String::from_utf8_lossy(&output.stderr).trim() + ); + return VideoStreamMeta::default(); } - // Check display matrix side data (modern videos, e.g. iPhone) + let Ok(json) = serde_json::from_slice::<serde_json::Value>(&output.stdout) else { + warn!("ffprobe returned non-JSON for {}", video_path); + return VideoStreamMeta::default(); + }; + + let stream = &json["streams"][0]; + let is_h264 = stream + .get("codec_name") + .and_then(|v| v.as_str()) + .map(|s| s == "h264") + .unwrap_or(false); + + // Prefer legacy `tags.rotate` (older containers); fall back to the + // display-matrix side data (iPhone and other modern recorders).
+ let rotation = stream + .get("tags") + .and_then(|t| t.get("rotate")) + .and_then(|r| r.as_str()) + .and_then(|s| s.parse::<i32>().ok()) + .filter(|r| *r != 0) + .or_else(|| { + stream + .get("side_data_list") + .and_then(|l| l.as_array()) + .and_then(|arr| { + arr.iter() + .find_map(|sd| sd.get("rotation").and_then(|r| r.as_f64())) + }) + .map(|f| f.abs() as i32) + .filter(|r| *r != 0) + }) + .unwrap_or(0); + + debug!( + "Probed {}: codec_h264={}, rotation={}°", + video_path, is_h264, rotation + ); + + VideoStreamMeta { is_h264, rotation } +} + +/// Probe the max keyframe interval (GOP) in the first ~30s of a video. +/// Returns `None` on probe failure or if we couldn't see at least two keyframes. +/// +/// Used to decide between stream-copy and transcode: HLS needs segments to +/// start on keyframes, so if the source GOP exceeds `hls_time`, copying +/// produces oversized/glitchy segments and we need to re-encode. +async fn get_max_gop_seconds(video_path: &str) -> Option<f64> { let output = tokio::process::Command::new("ffprobe") .arg("-v") .arg("error") .arg("-select_streams") .arg("v:0") + .arg("-skip_frame") + .arg("nokey") .arg("-show_entries") - .arg("side_data=rotation") + .arg("frame=pts_time") .arg("-of") - .arg("default=noprint_wrappers=1:nokey=1") + .arg("csv=p=0") + .arg("-read_intervals") + .arg("%+30") .arg(video_path) .output() - .await; + .await + .ok()?; - if let Ok(output) = output - && output.status.success() - { - let rotation_str = String::from_utf8_lossy(&output.stdout); - let rotation_str = rotation_str.trim(); - if !rotation_str.is_empty() - && let Ok(rotation) = rotation_str.parse::<i32>() - { - let rotation = rotation.abs() as i32; - if rotation != 0 { - debug!( - "Detected rotation {}° from display matrix for {}", - rotation, video_path - ); - return rotation; - } - } + if !output.status.success() { + warn!( + "ffprobe GOP check failed for {}: {}", + video_path, + String::from_utf8_lossy(&output.stderr).trim() + ); + return None; } - 0 + let times:
Vec<f64> = String::from_utf8_lossy(&output.stdout) + .lines() + .filter_map(|l| l.trim().parse::<f64>().ok()) + .collect(); + + if times.len() < 2 { + return None; + } + + let max_gop = times + .windows(2) + .map(|w| w[1] - w[0]) + .fold(0.0_f64, f64::max); + debug!( + "Max GOP in first {} keyframes of {}: {:.2}s", + times.len(), + video_path, + max_gop + ); + Some(max_gop) } pub struct VideoPlaylistManager { @@ -404,8 +432,17 @@ pub struct PlaylistGenerator { impl PlaylistGenerator { pub(crate) fn new() -> Self { + // Concurrency is tunable via HLS_CONCURRENCY so operators can dial + // it to their hardware: 1 on weak Synology boxes to avoid thermal + // throttling, higher on desktops with spare cores. + let concurrency = std::env::var("HLS_CONCURRENCY") + .ok() + .and_then(|v| v.parse::<usize>().ok()) + .filter(|&n| n > 0) + .unwrap_or(2); + info!("PlaylistGenerator: concurrency={}", concurrency); PlaylistGenerator { - semaphore: Arc::new(Semaphore::new(2)), + semaphore: Arc::new(Semaphore::new(concurrency)), } } } @@ -465,14 +502,42 @@ impl Handler for PlaylistGenerator { return Err(std::io::Error::from(std::io::ErrorKind::AlreadyExists)); } - // Check if video is already h264 encoded - let is_h264 = is_h264_encoded(&video_file).await; - - // Check for rotation metadata - let rotation = get_video_rotation(&video_file).await; + // One ffprobe call for codec + rotation metadata. + let stream_meta = probe_video_stream_meta(&video_file).await; + let is_h264 = stream_meta.is_h264; + let rotation = stream_meta.rotation; let has_rotation = rotation != 0; - let use_copy = is_h264 && !has_rotation; + // Stream-copy is only safe when the source GOP fits inside a + single HLS segment. Otherwise ffmpeg has to extend segments + past hls_time to land on a keyframe, producing uneven + segments and seeking glitches.
+ const HLS_SEGMENT_SECONDS: f64 = 3.0; + let gop_ok = if is_h264 && !has_rotation { + match get_max_gop_seconds(&video_file).await { + Some(g) if g > HLS_SEGMENT_SECONDS => { + info!( + "Video {} has long GOP ({:.1}s > {}s), transcoding for segment alignment", + video_file, g, HLS_SEGMENT_SECONDS + ); + false + } + Some(_) => true, + None => { + // Probe failed — be conservative and transcode rather + // than risk broken segments from a mystery source. + debug!( + "GOP probe failed for {}, transcoding to be safe", + video_file + ); + false + } + } + } else { + false + }; + + let use_copy = is_h264 && !has_rotation && gop_ok; if has_rotation { info!( @@ -486,59 +551,168 @@ impl Handler for PlaylistGenerator { } else if use_copy { info!("Video {} is already h264, using stream copy", video_file); span.add_event("Using stream copy (h264 detected)", vec![]); + } else if is_h264 { + info!( + "Video {} is h264 but needs transcoding for GOP alignment", + video_file + ); + span.add_event("Transcoding for GOP alignment", vec![]); } else { info!("Video {} needs transcoding to h264", video_file); span.add_event("Transcoding to h264", vec![]); } - tokio::spawn(async move { - let mut cmd = tokio::process::Command::new("ffmpeg"); - cmd.arg("-i").arg(&video_file); + // Encode to a .tmp playlist and explicit segment names so a failed + // encode leaves predictable artifacts we can clean up — and so a + // concurrent scan doesn't see a half-written .m3u8 as "done". 
+ let playlist_tmp = format!("{}.tmp", playlist_file); + let video_stem = msg + .video_path + .file_name() + .and_then(|n| n.to_str()) + .unwrap_or("video"); + let segment_pattern = format!("{}/{}_%03d.ts", playlist_path, video_stem); - if use_copy { - // Video is already h264, just copy the stream - // Note: rotation metadata will be preserved in the stream - cmd.arg("-c:v").arg("copy"); - cmd.arg("-c:a").arg("aac"); // Still need to ensure audio is compatible + let mut cmd = tokio::process::Command::new("ffmpeg"); + cmd.arg("-y").arg("-i").arg(&video_file); + + if use_copy { + cmd.arg("-c:v").arg("copy"); + cmd.arg("-c:a").arg("aac"); + } else { + let nvenc = crate::video::ffmpeg::is_nvenc_available().await; + if nvenc { + // NVENC: no CRF, use VBR + target CQ. p1 = fastest + // preset — prioritizes encoder throughput over bitrate + // efficiency. CQ 23 roughly matches libx264 crf 21 + // visually; NVENC has slightly lower compression + // efficiency per quality. + cmd.arg("-c:v").arg("h264_nvenc"); + cmd.arg("-preset").arg("p1"); + cmd.arg("-rc").arg("vbr"); + cmd.arg("-cq").arg("23"); + cmd.arg("-pix_fmt").arg("yuv420p"); } else { - // Need to transcode - autorotate is enabled by default and will apply rotation cmd.arg("-c:v").arg("h264"); cmd.arg("-crf").arg("21"); cmd.arg("-preset").arg("veryfast"); - cmd.arg("-vf").arg("scale=1080:-2,setsar=1:1"); - cmd.arg("-c:a").arg("aac"); } + cmd.arg("-vf").arg("scale='min(1080,iw)':-2,setsar=1:1"); + cmd.arg("-c:a").arg("aac"); + // Force an IDR frame every hls_time seconds so each HLS + // segment starts on a keyframe — accurate seeking without + // players having to decode from a prior segment. 
+ cmd.arg("-force_key_frames").arg("expr:gte(t,n_forced*3)"); + } - // Common HLS settings - cmd.arg("-hls_time").arg("3"); - cmd.arg("-hls_list_size").arg("100"); - cmd.arg(&playlist_file); - cmd.stdout(Stdio::null()); - cmd.stderr(Stdio::piped()); + // -f hls is required because the playlist is written to a .tmp + // path during encoding — ffmpeg normally infers the muxer from + // the output extension and doesn't recognize ".m3u8.tmp". + cmd.arg("-f").arg("hls"); + cmd.arg("-hls_time").arg("3"); + cmd.arg("-hls_list_size").arg("0"); + cmd.arg("-hls_playlist_type").arg("vod"); + // independent_segments advertises that each segment can be + // decoded without reference to any other — the matching guarantee + // for the forced keyframes above. + cmd.arg("-hls_flags").arg("independent_segments"); + cmd.arg("-hls_segment_filename").arg(&segment_pattern); + cmd.arg(&playlist_tmp); + cmd.stdout(Stdio::null()); + cmd.stderr(Stdio::piped()); + cmd.kill_on_drop(true); - let ffmpeg_result = cmd - .output() - .inspect_err(|e| error!("Failed to run ffmpeg on child process: {}", e)) - .map_err(|e| std::io::Error::other(e.to_string())) - .await; + // Spawn + wait under a timeout so a hung ffmpeg (corrupt source, + // NFS stall, etc.) doesn't permanently hold a semaphore slot. + // Default is generous — a long 4K transcode on CPU can take hours. 
+ let timeout_secs = std::env::var("HLS_TIMEOUT_SECONDS") + .ok() + .and_then(|v| v.parse::().ok()) + .unwrap_or(7200); - // Hang on to the permit until we're done decoding and then explicitly drop - drop(permit); - - if let Ok(ref res) = ffmpeg_result { - debug!("ffmpeg output: {:?}", res); + let ffmpeg_result = match cmd.spawn() { + Ok(child) => { + match tokio::time::timeout( + std::time::Duration::from_secs(timeout_secs), + child.wait_with_output(), + ) + .await + { + Ok(res) => res + .inspect_err(|e| { + error!("Failed to wait on ffmpeg child process: {}", e) + }) + .map_err(|e| std::io::Error::other(e.to_string())), + Err(_) => Err(std::io::Error::other(format!( + "ffmpeg exceeded {}s timeout", + timeout_secs + ))), + } } + Err(e) => { + error!("Failed to spawn ffmpeg: {}", e); + Err(std::io::Error::other(e.to_string())) + } + }; + drop(permit); + + let success = matches!(&ffmpeg_result, Ok(out) if out.status.success()); + + if success { + if let Err(e) = tokio::fs::rename(&playlist_tmp, &playlist_file).await { + error!( + "ffmpeg succeeded but rename {} -> {} failed: {}", + playlist_tmp, playlist_file, e + ); + cleanup_partial_hls(&playlist_tmp, playlist_path.as_str(), video_stem).await; + span.set_status(Status::error(format!("rename failed: {}", e))); + return Err(e); + } + debug!("Playlist complete: {}", playlist_file); span.set_status(Status::Ok); - - ffmpeg_result - }); - - Ok(()) + Ok(()) + } else { + let detail = match &ffmpeg_result { + Ok(out) => format!( + "exit {}: {}", + out.status, + String::from_utf8_lossy(&out.stderr).trim() + ), + Err(e) => format!("ffmpeg failed: {}", e), + }; + error!("ffmpeg failed for {}: {}", video_file, detail); + cleanup_partial_hls(&playlist_tmp, playlist_path.as_str(), video_stem).await; + span.set_status(Status::error(detail.clone())); + Err(std::io::Error::other(detail)) + } }) } } +/// Delete the temp playlist and any segment files that ffmpeg may have written +/// before failing. 
Called both on ffmpeg error and on rename failure so a +/// retry on the next scan starts from a clean slate. +async fn cleanup_partial_hls(playlist_tmp: &str, playlist_dir: &str, video_stem: &str) { + let _ = tokio::fs::remove_file(playlist_tmp).await; + + let segment_prefix = format!("{}_", video_stem); + let Ok(mut entries) = tokio::fs::read_dir(playlist_dir).await else { + return; + }; + while let Ok(Some(entry)) = entries.next_entry().await { + let Some(name) = entry.file_name().to_str().map(str::to_owned) else { + continue; + }; + if name.starts_with(&segment_prefix) + && name.ends_with(".ts") + && let Err(e) = tokio::fs::remove_file(entry.path()).await + { + warn!("Failed to remove partial segment {}: {}", name, e); + } + } +} + #[derive(Message)] #[rtype(result = "()")] pub struct GeneratePreviewClipMessage { diff --git a/src/video/ffmpeg.rs b/src/video/ffmpeg.rs index 5ed9308..a31fd0c 100644 --- a/src/video/ffmpeg.rs +++ b/src/video/ffmpeg.rs @@ -22,16 +22,16 @@ async fn check_nvenc_available() -> bool { } /// Returns whether NVENC is available, caching the result after first check. 
-async fn is_nvenc_available() -> bool { +pub async fn is_nvenc_available() -> bool { if let Some(&available) = NVENC_AVAILABLE.get() { return available; } let available = check_nvenc_available().await; let _ = NVENC_AVAILABLE.set(available); if available { - info!("CUDA NVENC hardware acceleration detected and enabled for preview clips"); + info!("CUDA NVENC hardware acceleration detected and enabled"); } else { - info!("NVENC not available, using CPU encoding for preview clips"); + info!("NVENC not available, using CPU encoding"); } available } -- 2.49.1 From f0ae9f95dc9aaa0040fe5d9a8f5182ed24f77e4b Mon Sep 17 00:00:00 2001 From: Cameron Date: Fri, 24 Apr 2026 13:54:06 -0400 Subject: [PATCH 17/24] feat(ai): few-shot exemplars + sticky Ollama preference - Few-shot injection on /insights/generate/agentic: compresses prior training_messages into trajectory blocks (tool calls + result summaries) and injects into the system prompt. Hardcoded default ids with optional request override. - New fewshot_source_ids column on photo_insights (+ migration) to track which exemplars influenced a given row, for downstream training-set filtering. Chat amend rows stamp None with a lineage note. - Ollama client now remembers which server (primary/fallback) most recently succeeded and tries it first on the next call, via a shared Arc. Avoids re-404ing the primary on every agent iteration when the chosen model only lives on the fallback. - Demote noisy logs: daily_summary "Summary match" lines to debug; inner chat_with_tools non-2xx body log from error to warn (outer layer owns the terminal-error signal). - Drift-guard tests for summarize_tool_result covering the success / empty / error / unknown shape for every tool. - Tidy: three pre-existing clippy warnings cleaned up. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .../down.sql | 24 ++ .../up.sql | 1 + src/ai/handlers.rs | 51 +++ src/ai/insight_chat.rs | 14 +- src/ai/insight_generator.rs | 391 +++++++++++++++++- src/ai/ollama.rs | 194 +++++---- src/ai/openrouter.rs | 3 +- src/bin/populate_knowledge.rs | 2 + src/database/daily_summary_dao.rs | 4 +- src/database/insights_dao.rs | 29 ++ src/database/models.rs | 7 + src/database/schema.rs | 1 + 12 files changed, 639 insertions(+), 82 deletions(-) create mode 100644 migrations/2026-04-24-000000_add_fewshot_source_to_insights/down.sql create mode 100644 migrations/2026-04-24-000000_add_fewshot_source_to_insights/up.sql diff --git a/migrations/2026-04-24-000000_add_fewshot_source_to_insights/down.sql b/migrations/2026-04-24-000000_add_fewshot_source_to_insights/down.sql new file mode 100644 index 0000000..2702414 --- /dev/null +++ b/migrations/2026-04-24-000000_add_fewshot_source_to_insights/down.sql @@ -0,0 +1,24 @@ +-- SQLite can't DROP COLUMN cleanly on older versions; rebuild the table. 
+CREATE TABLE photo_insights_backup AS + SELECT id, library_id, rel_path, title, summary, generated_at, model_version, + is_current, training_messages, approved, backend + FROM photo_insights; +DROP TABLE photo_insights; +CREATE TABLE photo_insights ( + id INTEGER NOT NULL PRIMARY KEY AUTOINCREMENT, + library_id INTEGER NOT NULL REFERENCES libraries(id), + rel_path TEXT NOT NULL, + title TEXT NOT NULL, + summary TEXT NOT NULL, + generated_at BIGINT NOT NULL, + model_version TEXT NOT NULL, + is_current BOOLEAN NOT NULL DEFAULT TRUE, + training_messages TEXT, + approved BOOLEAN, + backend TEXT NOT NULL DEFAULT 'local' +); +INSERT INTO photo_insights + SELECT id, library_id, rel_path, title, summary, generated_at, model_version, + is_current, training_messages, approved, backend + FROM photo_insights_backup; +DROP TABLE photo_insights_backup; diff --git a/migrations/2026-04-24-000000_add_fewshot_source_to_insights/up.sql b/migrations/2026-04-24-000000_add_fewshot_source_to_insights/up.sql new file mode 100644 index 0000000..f39340c --- /dev/null +++ b/migrations/2026-04-24-000000_add_fewshot_source_to_insights/up.sql @@ -0,0 +1 @@ +ALTER TABLE photo_insights ADD COLUMN fewshot_source_ids TEXT; diff --git a/src/ai/handlers.rs b/src/ai/handlers.rs index d24927f..2121a81 100644 --- a/src/ai/handlers.rs +++ b/src/ai/handlers.rs @@ -4,6 +4,7 @@ use opentelemetry::trace::{Span, Status, Tracer}; use serde::{Deserialize, Serialize}; use crate::ai::insight_chat::{ChatStreamEvent, ChatTurnRequest}; +use crate::ai::ollama::ChatMessage; use crate::ai::{InsightGenerator, ModelCapabilities, OllamaClient}; use crate::data::Claims; use crate::database::{ExifDao, InsightDao}; @@ -12,6 +13,13 @@ use crate::otel::{extract_context_from_request, global_tracer}; use crate::state::AppState; use crate::utils::normalize_path; +/// Hardcoded few-shot exemplars for the agentic endpoint. 
Populate with the +/// ids of approved insights whose `training_messages` should be compressed +/// into trajectory form and injected into the system prompt. Empty = no +/// change in behavior. Request-level `fewshot_insight_ids` overrides this +/// when non-empty. +const DEFAULT_FEWSHOT_INSIGHT_IDS: &[i32] = &[2918, 2908]; + #[derive(Debug, Deserialize)] pub struct GeneratePhotoInsightRequest { pub file_path: String, @@ -33,6 +41,12 @@ pub struct GeneratePhotoInsightRequest { /// OpenRouter chat). Only respected by the agentic endpoint. #[serde(default)] pub backend: Option, + /// Insight ids whose stored `training_messages` should be compressed + /// into few-shot trajectories and injected into the system prompt. + /// Silently truncated to the first 2. When absent/empty, the handler + /// falls back to `DEFAULT_FEWSHOT_INSIGHT_IDS`. + #[serde(default)] + pub fewshot_insight_ids: Option>, } #[derive(Debug, Deserialize)] @@ -326,6 +340,41 @@ pub async fn generate_agentic_insight_handler( span.set_attribute(KeyValue::new("backend", b.clone())); } + // Resolve few-shot ids: request-provided ids take precedence when + // non-empty; otherwise fall back to the hardcoded defaults. 
+ let fewshot_ids: Vec = match request.fewshot_insight_ids.as_deref() { + Some(ids) if !ids.is_empty() => ids.iter().take(2).copied().collect(), + _ => DEFAULT_FEWSHOT_INSIGHT_IDS + .iter() + .take(2) + .copied() + .collect(), + }; + span.set_attribute(KeyValue::new("fewshot_count", fewshot_ids.len() as i64)); + + let fewshot_examples: Vec> = { + let otel_context = opentelemetry::Context::new(); + let mut dao = insight_dao.lock().expect("Unable to lock InsightDao"); + fewshot_ids + .iter() + .filter_map(|id| { + let insight = dao.get_insight_by_id(&otel_context, *id).ok().flatten()?; + let json = insight.training_messages?; + match serde_json::from_str::>(&json) { + Ok(msgs) => Some(msgs), + Err(e) => { + log::warn!( + "Few-shot insight {} has malformed training_messages: {}", + id, + e + ); + None + } + } + }) + .collect() + }; + let result = insight_generator .generate_agentic_insight_for_photo( &normalized_path, @@ -338,6 +387,8 @@ pub async fn generate_agentic_insight_handler( request.min_p, max_iterations, request.backend.clone(), + fewshot_examples, + fewshot_ids, ) .await; diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index 01dd4de..c51befd 100644 --- a/src/ai/insight_chat.rs +++ b/src/ai/insight_chat.rs @@ -502,6 +502,11 @@ impl InsightChatService { .await?; let title = title_raw.trim().trim_matches('"').to_string(); + // Amended rows intentionally do not inherit the parent's + // `fewshot_source_ids`. The parent's few-shot influence is still + // present in this row's content; if you want strict lineage + // tracking for training-set filtering, fetch the parent here and + // copy its value forward. 
let new_row = InsertPhotoInsight { library_id: req.library_id, file_path: normalized.clone(), @@ -512,6 +517,7 @@ impl InsightChatService { is_current: true, training_messages: Some(json), backend: effective_backend.clone(), + fewshot_source_ids: None, }; let cx = opentelemetry::Context::new(); let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao"); @@ -608,7 +614,7 @@ impl InsightChatService { ) -> BoxStream<'static, ChatStreamEvent> { let svc = self; let s = async_stream::stream! { - match svc.chat_turn_stream_inner(req, |ev| Ok(ev)).await { + match svc.chat_turn_stream_inner(req, Ok).await { Ok(mut rx) => { while let Some(ev) = rx.recv().await { yield ev; @@ -955,6 +961,11 @@ impl InsightChatService { .await?; let title = title_raw.trim().trim_matches('"').to_string(); + // Amended rows intentionally do not inherit the parent's + // `fewshot_source_ids`. The parent's few-shot influence is still + // present in this row's content; if you want strict lineage + // tracking for training-set filtering, fetch the parent here and + // copy its value forward. 
let new_row = InsertPhotoInsight { library_id: req.library_id, file_path: normalized.clone(), @@ -965,6 +976,7 @@ impl InsightChatService { is_current: true, training_messages: Some(json), backend: effective_backend.clone(), + fewshot_source_ids: None, }; let cx = opentelemetry::Context::new(); let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao"); diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index dad7040..65ba3a1 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -1227,6 +1227,7 @@ impl InsightGenerator { is_current: true, training_messages: None, backend: "local".to_string(), + fewshot_source_ids: None, }; let mut dao = self.insight_dao.lock().expect("Unable to lock InsightDao"); @@ -2634,6 +2635,159 @@ Return ONLY the summary, nothing else."#, /// text, and runs the loop through OpenRouter (chat only — embeddings /// and describe calls stay local in either mode). #[allow(clippy::too_many_arguments)] + /// Render a set of prior-conversation transcripts into a compact + /// trajectory block for inclusion in the system prompt. Tool results + /// are summarised to one line each so the prompt stays small. + fn render_fewshot_examples(examples: &[Vec]) -> String { + if examples.is_empty() { + return String::new(); + } + + let mut out = String::from("## Examples of strong context-gathering\n\n"); + out.push_str( + "The following are compressed trajectories from prior high-quality insights. 
\ + They show the *pattern* of tool use, not answers to copy.\n\n", + ); + + for (i, msgs) in examples.iter().enumerate() { + out.push_str(&format!("### Example {}\n\n", i + 1)); + out.push_str(&Self::render_single_trajectory(msgs)); + out.push('\n'); + } + + out.push_str("---\n\n"); + out + } + + fn render_single_trajectory(msgs: &[ChatMessage]) -> String { + let mut out = String::new(); + + if let Some(first_user) = msgs.iter().find(|m| m.role == "user") { + let trimmed = first_user + .content + .lines() + .filter(|l| !l.trim().is_empty()) + .take(8) + .collect::>() + .join("\n"); + out.push_str(&format!("Input:\n{}\n\n", trimmed)); + } + + out.push_str("Trajectory:\n"); + let mut step = 1; + let mut final_content: Option = None; + + for (i, m) in msgs.iter().enumerate() { + if m.role != "assistant" { + continue; + } + if let Some(ref calls) = m.tool_calls { + for call in calls { + let args_brief = Self::brief_json_args(&call.function.arguments); + let result_summary = msgs + .get(i + 1) + .filter(|r| r.role == "tool") + .map(|r| Self::summarize_tool_result(&call.function.name, &r.content)) + .unwrap_or_else(|| "(no result)".to_string()); + out.push_str(&format!( + "{}. 
{}({}) -> {}\n", + step, call.function.name, args_brief, result_summary + )); + step += 1; + } + } else if !m.content.is_empty() { + final_content = Some(m.content.clone()); + } + } + + if let Some(content) = final_content { + let short: String = content.chars().take(240).collect(); + out.push_str(&format!("\nFinal insight: {}...\n", short)); + } + + out + } + + fn brief_json_args(v: &serde_json::Value) -> String { + let Some(obj) = v.as_object() else { + return v.to_string(); + }; + obj.iter() + .map(|(k, v)| { + let rendered = match v { + serde_json::Value::String(s) if s.len() > 40 => { + format!("\"{}...\"", &s[..40]) + } + _ => v.to_string(), + }; + format!("{}={}", k, rendered) + }) + .collect::>() + .join(", ") + } + + /// Collapse a raw tool-result string (the text the model saw) into a + /// short phrase suitable for a few-shot trajectory. Detects the + /// "Found N ...", "No ...", and "Error ..." idioms used by the tool + /// implementations in this file. Unknown shapes fall back to a char + /// count, which is deliberately visible so drift shows up in output. 
+ fn summarize_tool_result(tool_name: &str, raw: &str) -> String { + if raw.starts_with("Error ") { + return "error".to_string(); + } + if raw.starts_with("No ") || raw.starts_with("Could not ") { + return "empty (pivoted)".to_string(); + } + + if let Some(rest) = raw.strip_prefix("Found ") + && let Some(n_str) = rest.split_whitespace().next() + && let Ok(n) = n_str.parse::() + { + let kind = match tool_name { + "search_messages" | "get_sms_messages" => "messages", + "get_calendar_events" => "events", + "get_location_history" => "location records", + _ => "results", + }; + return format!("{} {}", n, kind); + } + + match tool_name { + "search_rag" => { + let n = raw.split("\n\n").filter(|s| !s.trim().is_empty()).count(); + format!("{} rag hits", n) + } + "get_file_tags" => { + let n = raw.split(',').filter(|s| !s.trim().is_empty()).count(); + format!("{} tags", n) + } + "describe_photo" => { + let short: String = raw.chars().take(80).collect(); + format!("described: \"{}...\"", short) + } + "reverse_geocode" => { + let short: String = raw.chars().take(60).collect(); + format!("place: {}", short) + } + "recall_entities" | "recall_facts_for_photo" => { + let n = raw.lines().skip(1).filter(|l| !l.trim().is_empty()).count(); + let kind = if tool_name == "recall_entities" { + "entities" + } else { + "facts" + }; + format!("{} {}", n, kind) + } + "store_entity" | "store_fact" => raw + .split_whitespace() + .find_map(|tok| tok.strip_prefix("ID:")) + .map(|id| format!("stored id={}", id.trim_end_matches(','))) + .unwrap_or_else(|| "stored".to_string()), + "get_current_datetime" => "time noted".to_string(), + _ => format!("{} chars", raw.len()), + } + } + pub async fn generate_agentic_insight_for_photo( &self, file_path: &str, @@ -2646,6 +2800,8 @@ Return ONLY the summary, nothing else."#, min_p: Option, max_iterations: usize, backend: Option, + fewshot_examples: Vec>, + fewshot_source_ids: Vec, ) -> Result<(Option, Option)> { let tracer = global_tracer(); let current_cx = 
opentelemetry::Context::current(); @@ -2990,8 +3146,10 @@ Return ONLY the summary, nothing else."#, ), None => String::new(), }; + let fewshot_block = Self::render_fewshot_examples(&fewshot_examples); let base_system = format!( "You are a personal photo memory assistant helping to reconstruct a memory from a photo.{owner_id_note}\n\n\ + {fewshot_block}\ IMPORTANT INSTRUCTIONS:\n\ 1. You MUST call multiple tools to gather context BEFORE writing any final insight. Do not produce a final answer after only one or two tool calls.\n\ 2. When calling get_sms_messages and search_rag, always make at least one call WITHOUT a contact filter to capture what else was happening in {owner_name}'s life around this date — other conversations, events, and activities provide important wider context even when a specific contact is known.\n\ @@ -3002,6 +3160,7 @@ Return ONLY the summary, nothing else."#, 7. If a tool returns no results, that is useful information — continue calling the remaining tools anyway.\n\ 8. You have a hard budget of {max_iterations} tool-calling iterations before the loop ends. Plan your context gathering so you can write a complete final insight within that budget.", owner_id_note = owner_id_note, + fewshot_block = fewshot_block, owner_name = owner_name, max_iterations = max_iterations ); @@ -3153,12 +3312,10 @@ Return ONLY the summary, nothing else."#, "Agentic loop exhausted after {} iterations, requesting final answer", iterations_used ); - messages.push(ChatMessage::user( - &format!( - "Based on the context gathered, please write the final photo insight: a title and a detailed personal summary. Write in first person as {}.", - user_display_name() - ), - )); + messages.push(ChatMessage::user(format!( + "Based on the context gathered, please write the final photo insight: a title and a detailed personal summary. 
Write in first person as {}.", + user_display_name() + ))); let (final_response, prompt_tokens, eval_tokens) = chat_backend .chat_with_tools(messages.clone(), vec![]) .await?; @@ -3204,6 +3361,11 @@ Return ONLY the summary, nothing else."#, // 15. Store insight (returns the persisted row including its new id) let model_version = chat_backend.primary_model().to_string(); + let fewshot_source_ids_json = if fewshot_source_ids.is_empty() { + None + } else { + Some(serde_json::to_string(&fewshot_source_ids).unwrap_or_else(|_| "[]".to_string())) + }; let insight = InsertPhotoInsight { library_id: crate::libraries::PRIMARY_LIBRARY_ID, file_path: file_path.to_string(), @@ -3214,6 +3376,7 @@ Return ONLY the summary, nothing else."#, is_current: true, training_messages, backend: backend_label.clone(), + fewshot_source_ids: fewshot_source_ids_json, }; let stored = { @@ -3333,6 +3496,7 @@ Return ONLY the summary, nothing else."#, #[cfg(test)] mod tests { use super::*; + use crate::ai::ollama::{ToolCall, ToolCallFunction}; #[test] fn combine_contexts_includes_tags_section_when_tags_present() { @@ -3374,4 +3538,219 @@ mod tests { let result = InsightGenerator::combine_contexts(None, None, None, None, None); assert_eq!(result, "No additional context available"); } + + // These tests assert the shape of the strings returned by the tool + // implementations above. If a tool's output format changes, update the + // tool AND the corresponding arm of `summarize_tool_result` — these + // tests exist to make that coupling loud. 
+ + #[test] + fn summarize_errors_uniformly() { + assert_eq!( + InsightGenerator::summarize_tool_result("search_rag", "Error searching RAG: boom"), + "error" + ); + assert_eq!( + InsightGenerator::summarize_tool_result( + "get_sms_messages", + "Error fetching SMS messages: timeout" + ), + "error" + ); + } + + #[test] + fn summarize_empty_results_uniformly() { + assert_eq!( + InsightGenerator::summarize_tool_result("search_rag", "No relevant messages found."), + "empty (pivoted)" + ); + assert_eq!( + InsightGenerator::summarize_tool_result("get_sms_messages", "No messages found."), + "empty (pivoted)" + ); + assert_eq!( + InsightGenerator::summarize_tool_result( + "reverse_geocode", + "Could not resolve coordinates to a place name." + ), + "empty (pivoted)" + ); + assert_eq!( + InsightGenerator::summarize_tool_result( + "recall_facts_for_photo", + "No knowledge facts found for this photo." + ), + "empty (pivoted)" + ); + } + + #[test] + fn summarize_found_count_per_tool() { + assert_eq!( + InsightGenerator::summarize_tool_result( + "get_sms_messages", + "Found 7 messages:\n[2023-08-15 10:00] Sarah: hi" + ), + "7 messages" + ); + assert_eq!( + InsightGenerator::summarize_tool_result( + "search_messages", + "Found 3 messages (mode: hybrid):\n\n[2023-08-15] Sarah — hi" + ), + "3 messages" + ); + assert_eq!( + InsightGenerator::summarize_tool_result( + "get_calendar_events", + "Found 2 calendar events:\n[2023-08-15 10:00] Wedding" + ), + "2 events" + ); + assert_eq!( + InsightGenerator::summarize_tool_result( + "get_location_history", + "Found 5 location records:\n[2023-08-15 10:00] 39.0, -120.0" + ), + "5 location records" + ); + } + + #[test] + fn summarize_search_rag_counts_hits() { + let raw = "[2023-08-15] Sarah: venue confirmed\n\n[2023-08-14] Mom: travel plans\n\n[2023-08-13] Dad: weather"; + assert_eq!( + InsightGenerator::summarize_tool_result("search_rag", raw), + "3 rag hits" + ); + } + + #[test] + fn summarize_get_file_tags() { + assert_eq!( + 
InsightGenerator::summarize_tool_result("get_file_tags", "wedding, tahoe, 2023"), + "3 tags" + ); + } + + #[test] + fn summarize_describe_photo_truncates() { + let raw = "A wedding ceremony at Lake Tahoe with about 40 guests seated in rows facing a lakeside arch decorated with white flowers."; + let out = InsightGenerator::summarize_tool_result("describe_photo", raw); + assert!(out.starts_with("described: \"")); + assert!(out.contains("A wedding ceremony at Lake Tahoe")); + assert!(out.ends_with("...\"")); + } + + #[test] + fn summarize_reverse_geocode_returns_place() { + let out = + InsightGenerator::summarize_tool_result("reverse_geocode", "South Lake Tahoe, CA, USA"); + assert_eq!(out, "place: South Lake Tahoe, CA, USA"); + } + + #[test] + fn summarize_recall_entities_counts_lines() { + let raw = "Known entities:\n- Sarah (person)\n- Tahoe (place)\n- Wedding 2023 (event)"; + assert_eq!( + InsightGenerator::summarize_tool_result("recall_entities", raw), + "3 entities" + ); + } + + #[test] + fn summarize_recall_facts_counts_lines() { + let raw = "Knowledge for this photo:\n- Sarah: college friend\n- Tahoe: vacation spot"; + assert_eq!( + InsightGenerator::summarize_tool_result("recall_facts_for_photo", raw), + "2 facts" + ); + } + + #[test] + fn summarize_store_entity_extracts_id() { + assert_eq!( + InsightGenerator::summarize_tool_result( + "store_entity", + "Entity stored: ID:42 | person | Sarah | confidence:0.80" + ), + "stored id=42" + ); + } + + #[test] + fn summarize_store_fact_extracts_id() { + assert_eq!( + InsightGenerator::summarize_tool_result( + "store_fact", + "Stored new fact: ID:17 | confidence:0.60" + ), + "stored id=17" + ); + assert_eq!( + InsightGenerator::summarize_tool_result( + "store_fact", + "Corroborated existing fact: ID:17 | confidence:0.85" + ), + "stored id=17" + ); + } + + #[test] + fn summarize_current_datetime() { + assert_eq!( + InsightGenerator::summarize_tool_result( + "get_current_datetime", + "Current date/time: 2024-01-15 
12:00:00 PST (Monday)" + ), + "time noted" + ); + } + + #[test] + fn summarize_unknown_tool_falls_back_to_char_count() { + let out = InsightGenerator::summarize_tool_result("never_heard_of_it", "some output"); + assert_eq!(out, "11 chars"); + } + + #[test] + fn render_fewshot_empty_returns_empty_string() { + assert!(InsightGenerator::render_fewshot_examples(&[]).is_empty()); + } + + #[test] + fn render_single_trajectory_walks_tool_calls_in_order() { + let arguments = serde_json::json!({ "query": "wedding", "date": "2023-08-15" }); + let msgs = vec![ + ChatMessage::system("ignored"), + ChatMessage::user("Photo file path: /photos/img.jpg\nDate taken: August 15, 2023"), + ChatMessage { + role: "assistant".to_string(), + content: String::new(), + tool_calls: Some(vec![ToolCall { + function: ToolCallFunction { + name: "search_rag".to_string(), + arguments, + }, + id: None, + }]), + images: None, + }, + ChatMessage::tool_result("No relevant messages found."), + ChatMessage { + role: "assistant".to_string(), + content: "Final title\n\nFinal body.".to_string(), + tool_calls: None, + images: None, + }, + ]; + let out = InsightGenerator::render_single_trajectory(&msgs); + assert!(out.contains("Input:")); + assert!(out.contains("/photos/img.jpg")); + assert!(out.contains("1. 
search_rag(")); + assert!(out.contains("query=\"wedding\"")); + assert!(out.contains("-> empty (pivoted)")); + assert!(out.contains("Final insight: Final title")); + } } diff --git a/src/ai/ollama.rs b/src/ai/ollama.rs index 81185f0..eb810d2 100644 --- a/src/ai/ollama.rs +++ b/src/ai/ollama.rs @@ -4,6 +4,7 @@ use chrono::NaiveDate; use reqwest::Client; use serde::{Deserialize, Serialize}; use std::collections::HashMap; +use std::sync::atomic::{AtomicBool, Ordering}; use std::sync::{Arc, Mutex}; use std::time::{Duration, Instant}; @@ -19,6 +20,19 @@ pub use crate::ai::llm_client::{ToolCall, ToolCallFunction, ToolFunction}; // Cache duration: 15 minutes const CACHE_DURATION_SECS: u64 = 15 * 60; +/// Default total request timeout for generation calls, in seconds. +/// Overridable via `OLLAMA_REQUEST_TIMEOUT_SECONDS` env var for slow +/// CPU-offloaded models where inference can take several minutes. +const DEFAULT_REQUEST_TIMEOUT_SECS: u64 = 120; + +fn configured_request_timeout_secs() -> u64 { + std::env::var("OLLAMA_REQUEST_TIMEOUT_SECONDS") + .ok() + .and_then(|v| v.parse::().ok()) + .filter(|&s| s > 0) + .unwrap_or(DEFAULT_REQUEST_TIMEOUT_SECS) +} + /// Embedding model used across the app. Callers that persist a /// `model_version` alongside an embedding should read this constant so the /// stored label always matches what `generate_embeddings` actually ran. @@ -65,6 +79,12 @@ pub struct OllamaClient { top_p: Option, top_k: Option, min_p: Option, + /// Sticky preference shared across clones: when the fallback server + /// succeeded most recently, try it first on the next call. Avoids + /// re-probing the primary with a model it doesn't have loaded across + /// every iteration of the agent loop. `Arc` so cloning + /// `OllamaClient` shares the flag rather than resetting it. 
+ prefer_fallback: Arc, } impl OllamaClient { @@ -77,7 +97,7 @@ impl OllamaClient { Self { client: Client::builder() .connect_timeout(Duration::from_secs(5)) // Quick connection timeout - .timeout(Duration::from_secs(120)) // Total request timeout for generation + .timeout(Duration::from_secs(configured_request_timeout_secs())) .build() .unwrap_or_else(|_| Client::new()), primary_url, @@ -89,9 +109,44 @@ impl OllamaClient { top_p: None, top_k: None, min_p: None, + prefer_fallback: Arc::new(AtomicBool::new(false)), } } + /// Return the server attempt order as `(label, url, model)` tuples. + /// Respects the sticky `prefer_fallback` flag so the most recently + /// successful server is tried first. + fn attempt_order(&self) -> Vec<(&'static str, String, String)> { + let primary = ( + "primary", + self.primary_url.clone(), + self.primary_model.clone(), + ); + let fallback = self.fallback_url.as_ref().map(|url| { + let model = self + .fallback_model + .clone() + .unwrap_or_else(|| self.primary_model.clone()); + ("fallback", url.clone(), model) + }); + + let prefer_fallback = fallback.is_some() && self.prefer_fallback.load(Ordering::Relaxed); + + let mut order = Vec::with_capacity(2); + if prefer_fallback { + if let Some(fb) = fallback.clone() { + order.push(fb); + } + order.push(primary); + } else { + order.push(primary); + if let Some(fb) = fallback { + order.push(fb); + } + } + order + } + pub fn set_num_ctx(&mut self, num_ctx: Option) { self.num_ctx = num_ctx; } @@ -587,68 +642,57 @@ Analyze the image and use specific details from both the visual content and the /// Send a chat request with tool definitions to /api/chat. /// Returns the assistant's response message (may contain tool_calls or final content). - /// Uses primary/fallback URL routing same as other generation methods. + /// Tries servers in preference order — most recently successful first — + /// so a fallback-only model doesn't re-404 against the primary on every + /// iteration of the agent loop. 
pub async fn chat_with_tools( &self, messages: Vec, tools: Vec, ) -> Result<(ChatMessage, Option, Option)> { - // Try primary server first - log::info!( - "Attempting chat_with_tools with primary server: {} (model: {})", - self.primary_url, - self.primary_model - ); - let primary_result = self - .try_chat_with_tools(&self.primary_url, messages.clone(), tools.clone()) - .await; - - match primary_result { - Ok(result) => { - log::info!("Successfully got chat_with_tools response from primary server"); - Ok(result) - } - Err(e) => { - log::warn!("Primary server chat_with_tools failed: {}", e); - - // Try fallback server if available - if let Some(fallback_url) = &self.fallback_url { - let fallback_model = - self.fallback_model.as_ref().unwrap_or(&self.primary_model); + let order = self.attempt_order(); + let mut errors: Vec = Vec::new(); + for (label, url, model) in &order { + log::info!( + "Attempting chat_with_tools with {} server: {} (model: {})", + label, + url, + model + ); + match self + .try_chat_with_tools(url, messages.clone(), tools.clone()) + .await + { + Ok(result) => { log::info!( - "Attempting chat_with_tools with fallback server: {} (model: {})", - fallback_url, - fallback_model + "Successfully got chat_with_tools response from {} server", + label ); - match self - .try_chat_with_tools(fallback_url, messages, tools) - .await - { - Ok(result) => { - log::info!( - "Successfully got chat_with_tools response from fallback server" - ); - Ok(result) - } - Err(fallback_e) => { - log::error!( - "Fallback server chat_with_tools also failed: {}", - fallback_e - ); - Err(anyhow::anyhow!( - "Both primary and fallback servers failed. 
Primary: {}, Fallback: {}", - e, - fallback_e - )) - } - } - } else { - log::error!("No fallback server configured"); - Err(e) + self.prefer_fallback + .store(*label == "fallback", Ordering::Relaxed); + return Ok(result); + } + Err(e) => { + log::warn!("{} server chat_with_tools failed: {}", label, e); + errors.push(format!("{}: {}", label, e)); } } } + + if order.len() <= 1 { + log::error!("No fallback server configured; chat_with_tools exhausted"); + } else { + log::error!( + "All {} servers failed for chat_with_tools ({})", + order.len(), + errors.join(" / ") + ); + } + Err(anyhow::anyhow!( + "chat_with_tools failed on all servers: {}", + errors.join(" / ") + )) } /// Streaming variant of `chat_with_tools`. Tries primary, then falls @@ -662,26 +706,30 @@ Analyze the image and use specific details from both the visual content and the messages: Vec, tools: Vec, ) -> Result>> { - // Attempt primary. If it can't be opened at all, try fallback. - match self - .try_chat_with_tools_stream(&self.primary_url, messages.clone(), tools.clone()) - .await - { - Ok(s) => Ok(s), - Err(e) => { - if let Some(fallback_url) = self.fallback_url.clone() { - log::warn!( - "Streaming chat primary failed ({}); trying fallback {}", - e, - fallback_url - ); - self.try_chat_with_tools_stream(&fallback_url, messages, tools) - .await - } else { - Err(e) + // Same preference logic as `chat_with_tools`. Only the initial + // connection is retried across servers — once the stream begins, + // mid-stream errors propagate to the caller. 
+ let order = self.attempt_order(); + let mut last_err: Option = None; + + for (label, url, _model) in &order { + match self + .try_chat_with_tools_stream(url, messages.clone(), tools.clone()) + .await + { + Ok(s) => { + self.prefer_fallback + .store(*label == "fallback", Ordering::Relaxed); + return Ok(s); + } + Err(e) => { + log::warn!("Streaming chat on {} server failed: {}", label, e); + last_err = Some(e); } } } + + Err(last_err.unwrap_or_else(|| anyhow::anyhow!("No Ollama server configured"))) } async fn try_chat_with_tools_stream( @@ -859,8 +907,12 @@ Analyze the image and use specific details from both the visual content and the if !response.status().is_success() { let status = response.status(); let body = response.text().await.unwrap_or_default(); - log::error!( - "chat_with_tools request body that caused {}: {}", + // warn, not error — the outer `chat_with_tools` may recover via + // the fallback server. When both fail, the outer layer emits the + // actual error log. + log::warn!( + "chat_with_tools request to {} got {}: {}", + base_url, status, request_json ); diff --git a/src/ai/openrouter.rs b/src/ai/openrouter.rs index 1559479..82195cf 100644 --- a/src/ai/openrouter.rs +++ b/src/ai/openrouter.rs @@ -452,8 +452,7 @@ impl LlmClient for OpenRouterClient { // SSE frames are delimited by a blank line. Walk the buffer // for "\n\n" markers; anything before them is a complete // frame (possibly multi-line). 
- loop { - let Some(sep) = find_double_newline(&buf) else { break }; + while let Some(sep) = find_double_newline(&buf) { let frame = buf.drain(..sep + 2).collect::>(); let frame_str = match std::str::from_utf8(&frame) { Ok(s) => s, diff --git a/src/bin/populate_knowledge.rs b/src/bin/populate_knowledge.rs index f70c5a2..b3239ca 100644 --- a/src/bin/populate_knowledge.rs +++ b/src/bin/populate_knowledge.rs @@ -251,6 +251,8 @@ async fn main() -> anyhow::Result<()> { args.min_p, args.max_iterations, None, + Vec::new(), + Vec::new(), ) .await { diff --git a/src/database/daily_summary_dao.rs b/src/database/daily_summary_dao.rs index 6ea560a..276a5a2 100644 --- a/src/database/daily_summary_dao.rs +++ b/src/database/daily_summary_dao.rs @@ -268,7 +268,7 @@ impl DailySummaryDao for SqliteDailySummaryDao { .into_iter() .take(limit) .map(|(similarity, summary)| { - log::info!( + log::debug!( "Summary match: similarity={:.3}, date={}, contact={}, summary=\"{}\"", similarity, summary.date, @@ -388,7 +388,7 @@ impl DailySummaryDao for SqliteDailySummaryDao { .into_iter() .take(limit) .map(|(combined, similarity, days, summary)| { - log::info!( + log::debug!( "Summary match: combined={:.3} (sim={:.3}, days={}), date={}, contact={}, summary=\"{}\"", combined, similarity, diff --git a/src/database/insights_dao.rs b/src/database/insights_dao.rs index 34c1d12..2821b5b 100644 --- a/src/database/insights_dao.rs +++ b/src/database/insights_dao.rs @@ -38,6 +38,16 @@ pub trait InsightDao: Sync + Send { file_path: &str, ) -> Result, DbError>; + /// Fetch a single insight by primary key, regardless of `is_current`. + /// Used by the few-shot injection flow where the caller picks specific + /// historical insights (which may have been superseded) as training + /// exemplars for a fresh generation. 
+ fn get_insight_by_id( + &mut self, + context: &opentelemetry::Context, + insight_id: i32, + ) -> Result, DbError>; + fn delete_insight( &mut self, context: &opentelemetry::Context, @@ -198,6 +208,25 @@ impl InsightDao for SqliteInsightDao { .map_err(|_| DbError::new(DbErrorKind::QueryError)) } + fn get_insight_by_id( + &mut self, + context: &opentelemetry::Context, + insight_id: i32, + ) -> Result, DbError> { + trace_db_call(context, "query", "get_insight_by_id", |_span| { + use schema::photo_insights::dsl::*; + + let mut connection = self.connection.lock().expect("Unable to get InsightDao"); + + photo_insights + .find(insight_id) + .first::(connection.deref_mut()) + .optional() + .map_err(|_| anyhow::anyhow!("Query error")) + }) + .map_err(|_| DbError::new(DbErrorKind::QueryError)) + } + fn delete_insight( &mut self, context: &opentelemetry::Context, diff --git a/src/database/models.rs b/src/database/models.rs index 3d63f1a..96d9c53 100644 --- a/src/database/models.rs +++ b/src/database/models.rs @@ -102,6 +102,12 @@ pub struct InsertPhotoInsight { pub training_messages: Option, /// `"local"` (Ollama with images) | `"hybrid"` (local vision + OpenRouter chat). pub backend: String, + /// JSON array of insight ids whose `training_messages` were compressed + /// and injected into the system prompt as few-shot exemplars when this + /// row was generated. `None` means no few-shot was used (pristine + /// generation). Used downstream to filter out contaminated rows when + /// assembling an unbiased training / evaluation set. + pub fewshot_source_ids: Option, } #[derive(Serialize, Queryable, Clone, Debug)] @@ -119,6 +125,7 @@ pub struct PhotoInsight { pub approved: Option, /// `"local"` (Ollama with images) | `"hybrid"` (local vision + OpenRouter chat). 
pub backend: String, + pub fewshot_source_ids: Option, } // --- Libraries --- diff --git a/src/database/schema.rs b/src/database/schema.rs index 200cb15..e49f21f 100644 --- a/src/database/schema.rs +++ b/src/database/schema.rs @@ -143,6 +143,7 @@ diesel::table! { training_messages -> Nullable, approved -> Nullable, backend -> Text, + fewshot_source_ids -> Nullable, } } -- 2.49.1 From d43f5fc991c7da599e34b62193a983486b09fa41 Mon Sep 17 00:00:00 2001 From: Cameron Date: Fri, 24 Apr 2026 13:54:23 -0400 Subject: [PATCH 18/24] docs: document OLLAMA_REQUEST_TIMEOUT_SECONDS env var Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CLAUDE.md b/CLAUDE.md index 004569e..23bddf5 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -270,6 +270,7 @@ OLLAMA_PRIMARY_URL=http://desktop:11434 # Primary Ollama server (e.g., de OLLAMA_FALLBACK_URL=http://server:11434 # Fallback Ollama server (optional, always-on) OLLAMA_PRIMARY_MODEL=nemotron-3-nano:30b # Model for primary server (default: nemotron-3-nano:30b) OLLAMA_FALLBACK_MODEL=llama3.2:3b # Model for fallback server (optional, uses primary if not set) +OLLAMA_REQUEST_TIMEOUT_SECONDS=120 # Per-request generation timeout (default 120). Increase for slow CPU-offloaded models. SMS_API_URL=http://localhost:8000 # SMS message API endpoint (default: localhost:8000) SMS_API_TOKEN=your-api-token # SMS API authentication token (optional) -- 2.49.1 From e5781325c61576ca82be40379d62a80bf3759187 Mon Sep 17 00:00:00 2001 From: Cameron Date: Fri, 24 Apr 2026 14:25:53 -0400 Subject: [PATCH 19/24] fix(ai): render tool-call arguments as compact JSON in logs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Switch the "Agentic tool call" log from {:?} (Debug) to {} (Display) on serde_json::Value. 
Display produces compact JSON — `{"date":"2023-08-15"}` instead of `Object {"date": String("2023-08-15")}` — which is what the model actually sent and what a human reading the log wants to see. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ai/insight_generator.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index 65ba3a1..d718b7e 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -3281,7 +3281,7 @@ Return ONLY the summary, nothing else."#, { for tool_call in tool_calls { log::info!( - "Agentic tool call [{}]: {} {:?}", + "Agentic tool call [{}]: {} {}", iteration, tool_call.function.name, tool_call.function.arguments -- 2.49.1 From 0ebc2e9003338ca1ecc7a47fb0b1e57d322cff6d Mon Sep 17 00:00:00 2001 From: Cameron Date: Fri, 24 Apr 2026 16:19:45 -0400 Subject: [PATCH 20/24] feat(ai): rerank timing + think:false + OpenRouter error detail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - search_rag reranker now logs wall-clock time around the ollama.generate call, the candidate count / top-N going in, and the final reordering. The "final indices" + swap-count line is info level so it's always visible; detailed before/after previews stay at debug for when you want to inspect reranker quality. - New OllamaClient::generate_no_think convenience that sets Ollama's top-level think:false on the request, plumbed through try_generate via a new internal generate_with_options. Used only by the reranker today; avoids the chain-of-thought tax on reasoning models (Qwen3/VL, DeepSeek-R1 distills, GPT-OSS) when the task has nothing to reason about. Server-side no-op on non-reasoning models. 
- OpenRouter chat_with_tools "missing choices[0]" error now includes the actual response body — extracts structured {error: {code, message}} when OpenRouter surfaces it (common for upstream-provider issues like rate limits and content moderation), otherwise falls back to a truncated raw-JSON view. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ai/insight_generator.rs | 46 ++++++++++++++++++++++++++++++++++++- src/ai/ollama.rs | 38 +++++++++++++++++++++++++++++- src/ai/openrouter.rs | 43 ++++++++++++++++++++++++++++++---- 3 files changed, 121 insertions(+), 6 deletions(-) diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index d718b7e..44a2a4e 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -1547,6 +1547,14 @@ Return ONLY the summary, nothing else."#, limit: usize, ollama: &OllamaClient, ) -> Result> { + let query_preview: String = query.chars().take(60).collect(); + log::info!( + "rerank: {} candidates -> top {} (query=\"{}\")", + candidates.len(), + limit, + query_preview + ); + // Build numbered list (1-based for readability). Cap each passage // at ~1000 chars so very long summaries don't eat the prompt. let numbered: String = candidates @@ -1574,14 +1582,20 @@ Return ONLY the summary, nothing else."#, limit, query, numbered, limit ); + let started = std::time::Instant::now(); let response = ollama - .generate( + .generate_no_think( &prompt, Some( "You are a terse relevance ranker. You output only numbers separated by commas.", ), ) .await?; + log::info!( + "rerank: finished in {} ms (prompt={} chars)", + started.elapsed().as_millis(), + prompt.len() + ); // Extract indices from the response. Accept "3, 1, 7" and also // tolerate "[3, 1, 7]" or "3,1,7,..." with trailing junk. 
@@ -1600,9 +1614,11 @@ Return ONLY the summary, nothing else."#, let mut seen = std::collections::HashSet::new(); let mut reordered: Vec = Vec::with_capacity(limit); + let mut final_indices: Vec = Vec::with_capacity(limit); for n in picks { if seen.insert(n) { reordered.push(candidates[n - 1].clone()); + final_indices.push(n); if reordered.len() >= limit { break; } @@ -1614,12 +1630,40 @@ Return ONLY the summary, nothing else."#, for (i, c) in candidates.iter().enumerate() { if !seen.contains(&(i + 1)) { reordered.push(c.clone()); + final_indices.push(i + 1); if reordered.len() >= limit { break; } } } } + + // Debug snapshot: show what the reranker changed. Position p holds + // the 1-based index of the candidate that now sits at position p. + // A value that equals its position means "no change at that slot". + let swapped = final_indices + .iter() + .enumerate() + .filter(|(pos, idx)| **idx != pos + 1) + .count(); + log::info!( + "rerank: final indices (1-based): {:?} — {} of top {} swapped from vector order", + final_indices, + swapped, + final_indices.len() + ); + let show = final_indices.len().min(5); + log::debug!("rerank: vector-order top {}:", show); + for (i, c) in candidates.iter().enumerate().take(show) { + let preview: String = c.chars().take(100).collect(); + log::debug!("rerank: [{}] {}", i + 1, preview); + } + log::debug!("rerank: reranked top {}:", show); + for (pos, idx) in final_indices.iter().enumerate().take(show) { + let preview: String = candidates[*idx - 1].chars().take(100).collect(); + log::debug!("rerank: [{}] (orig #{}) {}", pos + 1, idx, preview); + } + Ok(reordered) } diff --git a/src/ai/ollama.rs b/src/ai/ollama.rs index eb810d2..c56e1e7 100644 --- a/src/ai/ollama.rs +++ b/src/ai/ollama.rs @@ -381,6 +381,7 @@ impl OllamaClient { prompt: &str, system: Option<&str>, images: Option>, + think: Option, ) -> Result { let request = OllamaRequest { model: model.to_string(), @@ -389,6 +390,7 @@ impl OllamaClient { system: system.map(|s| 
s.to_string()), options: self.build_options(), images, + think, }; let response = self @@ -422,11 +424,31 @@ impl OllamaClient { self.generate_with_images(prompt, system, None).await } + /// Variant of `generate` that sets Ollama's top-level `think: false`. + /// Used by latency-sensitive callers like the rerank pass, where the + /// task has nothing to reason about and chain-of-thought tokens are + /// wasted wall time. Server-side no-op on non-reasoning models. + pub async fn generate_no_think(&self, prompt: &str, system: Option<&str>) -> Result { + self.generate_with_options(prompt, system, None, Some(false)) + .await + } + pub async fn generate_with_images( &self, prompt: &str, system: Option<&str>, images: Option>, + ) -> Result { + self.generate_with_options(prompt, system, images, None) + .await + } + + async fn generate_with_options( + &self, + prompt: &str, + system: Option<&str>, + images: Option>, + think: Option, ) -> Result { log::debug!("=== Ollama Request ==="); log::debug!("Primary model: {}", self.primary_model); @@ -452,6 +474,7 @@ impl OllamaClient { prompt, system, images.clone(), + think, ) .await; @@ -475,7 +498,14 @@ impl OllamaClient { fallback_model ); match self - .try_generate(fallback_url, fallback_model, prompt, system, images.clone()) + .try_generate( + fallback_url, + fallback_model, + prompt, + system, + images.clone(), + think, + ) .await { Ok(response) => { @@ -1134,6 +1164,12 @@ struct OllamaRequest { options: Option, #[serde(skip_serializing_if = "Option::is_none")] images: Option>, + /// Ollama's top-level reasoning-mode toggle (~0.4+). `Some(false)` + /// asks the server to skip thinking on models that expose a toggle + /// (Qwen3, Ollama-integrated DeepSeek-R1 distills, GPT-OSS, etc). + /// Ignored by non-reasoning models. None = use the model's default. 
+ #[serde(skip_serializing_if = "Option::is_none")] + think: Option, } #[derive(Serialize)] diff --git a/src/ai/openrouter.rs b/src/ai/openrouter.rs index 82195cf..62a3cca 100644 --- a/src/ai/openrouter.rs +++ b/src/ai/openrouter.rs @@ -360,10 +360,18 @@ impl LlmClient for OpenRouterClient { .get("choices") .and_then(|v| v.as_array()) .and_then(|a| a.first()) - .ok_or_else(|| anyhow!("response missing choices[0]"))?; - let msg = choice - .get("message") - .ok_or_else(|| anyhow!("choices[0] missing message"))?; + .ok_or_else(|| { + anyhow!( + "response missing choices[0]: {}", + extract_openrouter_error_detail(&parsed) + ) + })?; + let msg = choice.get("message").ok_or_else(|| { + anyhow!( + "choices[0] missing message: {}", + extract_openrouter_error_detail(&parsed) + ) + })?; let chat_msg = Self::openai_message_to_chat(msg)?; let usage = parsed.get("usage"); @@ -687,6 +695,33 @@ impl LlmClient for OpenRouterClient { } } +/// Extract a diagnostic fragment from an OpenRouter response body that +/// doesn't match the expected `{choices: [...]}` shape. OpenRouter will +/// sometimes return 200 OK with `{"error": {"message": "...", "code": ...}}` +/// when the upstream provider (Anthropic/OpenAI/Google/etc) errored out +/// — rate limits, content moderation, model overload, provider timeout. +/// Surface the structured error if present; otherwise fall back to a +/// truncated raw-JSON view so the log line is actionable. 
+fn extract_openrouter_error_detail(parsed: &Value) -> String { + if let Some(err) = parsed.get("error") { + let message = err + .get("message") + .and_then(|v| v.as_str()) + .unwrap_or("(no message)"); + let code = err + .get("code") + .map(|v| match v { + Value::String(s) => s.clone(), + other => other.to_string(), + }) + .unwrap_or_else(|| "?".to_string()); + let short_message: String = message.chars().take(240).collect(); + return format!("error code={} message=\"{}\"", code, short_message); + } + let raw = parsed.to_string(); + raw.chars().take(300).collect() +} + /// Find the byte offset of the first `\n\n` (end of an SSE frame) in `buf`. /// Returns the index of the first `\n` of the pair, so the full separator is /// `buf[idx..=idx+1]`. Also handles `\r\n\r\n` since some servers emit it. -- 2.49.1 From 0e55a6b12566ba2ad0bb3f8ae48210c23759f38b Mon Sep 17 00:00:00 2001 From: Cameron Date: Fri, 24 Apr 2026 19:12:17 -0400 Subject: [PATCH 21/24] fix(ai): treat rewind at end of history as no-op success The mobile client's regenerate-after-failure flow sends a discard index equal to the server's rendered count (its optimistic user bubble for the failed turn was never persisted). find_raw_cut treated this as out of range, surfacing as "Chat rewind failed: discard_from_rendered_index out of range" and blocking the retry. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ai/insight_chat.rs | 29 +++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index c51befd..1c20b45 100644 --- a/src/ai/insight_chat.rs +++ b/src/ai/insight_chat.rs @@ -1071,8 +1071,13 @@ fn is_rendered(m: &ChatMessage) -> bool { /// Given a rendered index to start discarding from, find the raw index at /// which to truncate. The cut position is the raw length after all prior /// rendered messages — which also strips any tool-call scaffolding that -/// immediately precedes the discarded rendered message. 
Returns `None` if -/// `discard_from_rendered_index` is past the end of the rendered view. +/// immediately precedes the discarded rendered message. +/// +/// Discarding *at* the end (`discard == rendered_count`) is a no-op success: +/// returns `Some(messages.len())`. The mobile client hits this when +/// regenerating after a failed turn — its optimistic user bubble lives at +/// the index just past the server's persisted history. Strictly past the end +/// (`discard > rendered_count`) returns `None`. pub(crate) fn find_raw_cut( messages: &[ChatMessage], discard_from_rendered_index: usize, @@ -1089,10 +1094,8 @@ pub(crate) fn find_raw_cut( rendered_count += 1; last_kept_raw_end = i + 1; } - if rendered_count == discard_from_rendered_index { - // Discarding past the last rendered message is a no-op, but we - // surface it as "nothing to cut" rather than silent success. - return None; + if discard_from_rendered_index == rendered_count { + return Some(messages.len()); } None } @@ -1361,4 +1364,18 @@ mod tests { let msgs = vec![ChatMessage::user("q1"), assistant_text("a1")]; assert!(find_raw_cut(&msgs, 5).is_none()); } + + #[test] + fn rewind_at_end_is_noop_success() { + // Mobile client retries after a failed turn that never persisted — + // its optimistic user bubble's index equals the server's rendered + // count. Should resolve to "no cut" rather than an out-of-range error. 
+ let msgs = vec![ + ChatMessage::system("s"), + ChatMessage::user("q1"), + assistant_text("a1"), + ]; + let cut = find_raw_cut(&msgs, 2).expect("boundary cut should succeed"); + assert_eq!(cut, msgs.len()); + } } -- 2.49.1 From fa21b0d73d71fd1de977f684f8bee4423f72dd2b Mon Sep 17 00:00:00 2001 From: Cameron Date: Fri, 24 Apr 2026 19:12:25 -0400 Subject: [PATCH 22/24] chore(ai): disable default few-shot insight ids Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ai/handlers.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/ai/handlers.rs b/src/ai/handlers.rs index 2121a81..7ec6ab7 100644 --- a/src/ai/handlers.rs +++ b/src/ai/handlers.rs @@ -18,7 +18,8 @@ use crate::utils::normalize_path; /// into trajectory form and injected into the system prompt. Empty = no /// change in behavior. Request-level `fewshot_insight_ids` overrides this /// when non-empty. -const DEFAULT_FEWSHOT_INSIGHT_IDS: &[i32] = &[2918, 2908]; +// const DEFAULT_FEWSHOT_INSIGHT_IDS: &[i32] = &[2918, 2908]; +const DEFAULT_FEWSHOT_INSIGHT_IDS: &[i32] = &[]; #[derive(Debug, Deserialize)] pub struct GeneratePhotoInsightRequest { -- 2.49.1 From 021d1bffc0bf221b587e3c21a94780912e825b8a Mon Sep 17 00:00:00 2001 From: Cameron Date: Fri, 24 Apr 2026 19:13:28 -0400 Subject: [PATCH 23/24] chore: ignore db backups and local .idea config files Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index 1437451..112d1e3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,12 +1,15 @@ /target database/target *.db +*.db.bak .env /tmp # Default ignored files .idea/shelf/ .idea/workspace.xml +.idea/inspectionProfiles/ +.idea/markdown.xml # Datasource local storage ignored files .idea/dataSources* .idea/dataSources.local.xml -- 2.49.1 From 21e624da6bbfaf2a2893210e4070b0c1954c8a91 Mon Sep 17 00:00:00 2001 From: Cameron Date: Sun, 26 Apr 2026 01:06:13 -0400 Subject: [PATCH 24/24] fix(video): sentinel for failed HLS 
encodes to stop retry loop Previously a corrupt source (e.g. truncated mp4 with no moov atom) would be re-queued on every directory scan: cleanup_partial_hls wipes the temp playlist on ffmpeg failure, leaving no .m3u8 to short-circuit the next pass. Mirrors the thumbnail .unsupported sentinel pattern: on ffmpeg failure, write .m3u8.unsupported, and treat its presence as "done" in both the ScanDirectoryMessage filter and the QueueVideosMessage check. Delete the sentinel to force a retry. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/video/actors.rs | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/src/video/actors.rs b/src/video/actors.rs index a461efe..c7300ef 100644 --- a/src/video/actors.rs +++ b/src/video/actors.rs @@ -55,6 +55,16 @@ pub fn playlist_file_for(playlist_dir: &str, video_path: &Path) -> PathBuf { PathBuf::from(format!("{}/{}.m3u8", playlist_dir, filename)) } +/// Sentinel path written next to a would-be playlist when ffmpeg cannot +/// transcode the source (e.g. truncated mp4 with no moov atom). Its presence +/// causes future scans to skip the file instead of re-running ffmpeg every +/// pass. Delete the `.unsupported` file to force a retry. 
+pub fn playlist_unsupported_sentinel(playlist_file: &Path) -> PathBuf { + let mut s = playlist_file.as_os_str().to_owned(); + s.push(".unsupported"); + PathBuf::from(s) +} + pub async fn create_playlist(video_path: &str, playlist_file: &str) -> Result { if Path::new(playlist_file).exists() { debug!("Playlist already exists: {}", playlist_file); @@ -319,7 +329,10 @@ impl Handler for VideoPlaylistManager { .filter_map(|e| e.ok()) .filter(|e| e.file_type().is_file()) .filter(is_video) - .filter(|e| !playlist_file_for(&playlist_dir_str, e.path()).exists()) + .filter(|e| { + let playlist = playlist_file_for(&playlist_dir_str, e.path()); + !playlist.exists() && !playlist_unsupported_sentinel(&playlist).exists() + }) .collect::>(); let scan_dir_name = msg.directory.clone(); @@ -393,7 +406,8 @@ impl Handler for VideoPlaylistManager { let playlist_generator = self.playlist_generator.clone(); for video_path in msg.video_paths { - if playlist_file_for(&playlist_dir_str, &video_path).exists() { + let playlist = playlist_file_for(&playlist_dir_str, &video_path); + if playlist.exists() || playlist_unsupported_sentinel(&playlist).exists() { continue; } let path_str = video_path.to_string_lossy().to_string(); @@ -683,6 +697,20 @@ impl Handler for PlaylistGenerator { }; error!("ffmpeg failed for {}: {}", video_file, detail); cleanup_partial_hls(&playlist_tmp, playlist_path.as_str(), video_stem).await; + let sentinel = playlist_unsupported_sentinel(Path::new(&playlist_file)); + if let Err(se) = tokio::fs::write(&sentinel, b"").await { + warn!( + "Failed to write playlist sentinel {}: {}", + sentinel.display(), + se + ); + } else { + info!( + "Wrote playlist sentinel {} so future scans skip {}", + sentinel.display(), + video_file + ); + } span.set_status(Status::error(detail.clone())); Err(std::io::Error::other(detail)) } -- 2.49.1