feat(ai): few-shot exemplars + sticky Ollama preference
- Few-shot injection on /insights/generate/agentic: compresses prior training_messages into trajectory blocks (tool calls + result summaries) and injects into the system prompt. Hardcoded default ids with optional request override. - New fewshot_source_ids column on photo_insights (+ migration) to track which exemplars influenced a given row, for downstream training-set filtering. Chat amend rows stamp None with a lineage note. - Ollama client now remembers which server (primary/fallback) most recently succeeded and tries it first on the next call, via a shared Arc<AtomicBool>. Avoids re-404ing the primary on every agent iteration when the chosen model only lives on the fallback. - Demote noisy logs: daily_summary "Summary match" lines to debug; inner chat_with_tools non-2xx body log from error to warn (outer layer owns the terminal-error signal). - Drift-guard tests for summarize_tool_result covering the success / empty / error / unknown shapes for every tool. - Tidy: three pre-existing clippy warnings cleaned up. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
194
src/ai/ollama.rs
194
src/ai/ollama.rs
@@ -4,6 +4,7 @@ use chrono::NaiveDate;
|
||||
use reqwest::Client;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
@@ -19,6 +20,19 @@ pub use crate::ai::llm_client::{ToolCall, ToolCallFunction, ToolFunction};
|
||||
// Cache duration: 15 minutes
|
||||
const CACHE_DURATION_SECS: u64 = 15 * 60;
|
||||
|
||||
/// Default total request timeout for generation calls, in seconds.
/// Overridable via `OLLAMA_REQUEST_TIMEOUT_SECONDS` env var for slow
/// CPU-offloaded models where inference can take several minutes.
const DEFAULT_REQUEST_TIMEOUT_SECS: u64 = 120;

/// Resolve the effective request timeout in seconds.
///
/// A positive integer value of `OLLAMA_REQUEST_TIMEOUT_SECONDS` wins;
/// an unset variable, an unparsable value, or zero all fall back to
/// [`DEFAULT_REQUEST_TIMEOUT_SECS`].
fn configured_request_timeout_secs() -> u64 {
    match std::env::var("OLLAMA_REQUEST_TIMEOUT_SECONDS") {
        Ok(raw) => match raw.parse::<u64>() {
            // Zero would disable the timeout semantics callers expect,
            // so it is treated the same as an invalid value.
            Ok(secs) if secs > 0 => secs,
            _ => DEFAULT_REQUEST_TIMEOUT_SECS,
        },
        Err(_) => DEFAULT_REQUEST_TIMEOUT_SECS,
    }
}
|
||||
|
||||
/// Embedding model used across the app. Callers that persist a
|
||||
/// `model_version` alongside an embedding should read this constant so the
|
||||
/// stored label always matches what `generate_embeddings` actually ran.
|
||||
@@ -65,6 +79,12 @@ pub struct OllamaClient {
|
||||
top_p: Option<f32>,
|
||||
top_k: Option<i32>,
|
||||
min_p: Option<f32>,
|
||||
/// Sticky preference shared across clones: when the fallback server
|
||||
/// succeeded most recently, try it first on the next call. Avoids
|
||||
/// re-probing the primary with a model it doesn't have loaded across
|
||||
/// every iteration of the agent loop. `Arc<AtomicBool>` so cloning
|
||||
/// `OllamaClient` shares the flag rather than resetting it.
|
||||
prefer_fallback: Arc<AtomicBool>,
|
||||
}
|
||||
|
||||
impl OllamaClient {
|
||||
@@ -77,7 +97,7 @@ impl OllamaClient {
|
||||
Self {
|
||||
client: Client::builder()
|
||||
.connect_timeout(Duration::from_secs(5)) // Quick connection timeout
|
||||
.timeout(Duration::from_secs(120)) // Total request timeout for generation
|
||||
.timeout(Duration::from_secs(configured_request_timeout_secs()))
|
||||
.build()
|
||||
.unwrap_or_else(|_| Client::new()),
|
||||
primary_url,
|
||||
@@ -89,9 +109,44 @@ impl OllamaClient {
|
||||
top_p: None,
|
||||
top_k: None,
|
||||
min_p: None,
|
||||
prefer_fallback: Arc::new(AtomicBool::new(false)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the server attempt order as `(label, url, model)` tuples.
|
||||
/// Respects the sticky `prefer_fallback` flag so the most recently
|
||||
/// successful server is tried first.
|
||||
fn attempt_order(&self) -> Vec<(&'static str, String, String)> {
|
||||
let primary = (
|
||||
"primary",
|
||||
self.primary_url.clone(),
|
||||
self.primary_model.clone(),
|
||||
);
|
||||
let fallback = self.fallback_url.as_ref().map(|url| {
|
||||
let model = self
|
||||
.fallback_model
|
||||
.clone()
|
||||
.unwrap_or_else(|| self.primary_model.clone());
|
||||
("fallback", url.clone(), model)
|
||||
});
|
||||
|
||||
let prefer_fallback = fallback.is_some() && self.prefer_fallback.load(Ordering::Relaxed);
|
||||
|
||||
let mut order = Vec::with_capacity(2);
|
||||
if prefer_fallback {
|
||||
if let Some(fb) = fallback.clone() {
|
||||
order.push(fb);
|
||||
}
|
||||
order.push(primary);
|
||||
} else {
|
||||
order.push(primary);
|
||||
if let Some(fb) = fallback {
|
||||
order.push(fb);
|
||||
}
|
||||
}
|
||||
order
|
||||
}
|
||||
|
||||
pub fn set_num_ctx(&mut self, num_ctx: Option<i32>) {
|
||||
self.num_ctx = num_ctx;
|
||||
}
|
||||
@@ -587,68 +642,57 @@ Analyze the image and use specific details from both the visual content and the
|
||||
|
||||
/// Send a chat request with tool definitions to /api/chat.
|
||||
/// Returns the assistant's response message (may contain tool_calls or final content).
|
||||
/// Uses primary/fallback URL routing same as other generation methods.
|
||||
/// Tries servers in preference order — most recently successful first —
|
||||
/// so a fallback-only model doesn't re-404 against the primary on every
|
||||
/// iteration of the agent loop.
|
||||
pub async fn chat_with_tools(
|
||||
&self,
|
||||
messages: Vec<ChatMessage>,
|
||||
tools: Vec<Tool>,
|
||||
) -> Result<(ChatMessage, Option<i32>, Option<i32>)> {
|
||||
// Try primary server first
|
||||
log::info!(
|
||||
"Attempting chat_with_tools with primary server: {} (model: {})",
|
||||
self.primary_url,
|
||||
self.primary_model
|
||||
);
|
||||
let primary_result = self
|
||||
.try_chat_with_tools(&self.primary_url, messages.clone(), tools.clone())
|
||||
.await;
|
||||
|
||||
match primary_result {
|
||||
Ok(result) => {
|
||||
log::info!("Successfully got chat_with_tools response from primary server");
|
||||
Ok(result)
|
||||
}
|
||||
Err(e) => {
|
||||
log::warn!("Primary server chat_with_tools failed: {}", e);
|
||||
|
||||
// Try fallback server if available
|
||||
if let Some(fallback_url) = &self.fallback_url {
|
||||
let fallback_model =
|
||||
self.fallback_model.as_ref().unwrap_or(&self.primary_model);
|
||||
let order = self.attempt_order();
|
||||
let mut errors: Vec<String> = Vec::new();
|
||||
|
||||
for (label, url, model) in &order {
|
||||
log::info!(
|
||||
"Attempting chat_with_tools with {} server: {} (model: {})",
|
||||
label,
|
||||
url,
|
||||
model
|
||||
);
|
||||
match self
|
||||
.try_chat_with_tools(url, messages.clone(), tools.clone())
|
||||
.await
|
||||
{
|
||||
Ok(result) => {
|
||||
log::info!(
|
||||
"Attempting chat_with_tools with fallback server: {} (model: {})",
|
||||
fallback_url,
|
||||
fallback_model
|
||||
"Successfully got chat_with_tools response from {} server",
|
||||
label
|
||||
);
|
||||
match self
|
||||
.try_chat_with_tools(fallback_url, messages, tools)
|
||||
.await
|
||||
{
|
||||
Ok(result) => {
|
||||
log::info!(
|
||||
"Successfully got chat_with_tools response from fallback server"
|
||||
);
|
||||
Ok(result)
|
||||
}
|
||||
Err(fallback_e) => {
|
||||
log::error!(
|
||||
"Fallback server chat_with_tools also failed: {}",
|
||||
fallback_e
|
||||
);
|
||||
Err(anyhow::anyhow!(
|
||||
"Both primary and fallback servers failed. Primary: {}, Fallback: {}",
|
||||
e,
|
||||
fallback_e
|
||||
))
|
||||
}
|
||||
}
|
||||
} else {
|
||||
log::error!("No fallback server configured");
|
||||
Err(e)
|
||||
self.prefer_fallback
|
||||
.store(*label == "fallback", Ordering::Relaxed);
|
||||
return Ok(result);
|
||||
}
|
||||
Err(e) => {
|
||||
log::warn!("{} server chat_with_tools failed: {}", label, e);
|
||||
errors.push(format!("{}: {}", label, e));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if order.len() <= 1 {
|
||||
log::error!("No fallback server configured; chat_with_tools exhausted");
|
||||
} else {
|
||||
log::error!(
|
||||
"All {} servers failed for chat_with_tools ({})",
|
||||
order.len(),
|
||||
errors.join(" / ")
|
||||
);
|
||||
}
|
||||
Err(anyhow::anyhow!(
|
||||
"chat_with_tools failed on all servers: {}",
|
||||
errors.join(" / ")
|
||||
))
|
||||
}
|
||||
|
||||
/// Streaming variant of `chat_with_tools`. Tries primary, then falls
|
||||
@@ -662,26 +706,30 @@ Analyze the image and use specific details from both the visual content and the
|
||||
messages: Vec<ChatMessage>,
|
||||
tools: Vec<Tool>,
|
||||
) -> Result<BoxStream<'static, Result<LlmStreamEvent>>> {
|
||||
// Attempt primary. If it can't be opened at all, try fallback.
|
||||
match self
|
||||
.try_chat_with_tools_stream(&self.primary_url, messages.clone(), tools.clone())
|
||||
.await
|
||||
{
|
||||
Ok(s) => Ok(s),
|
||||
Err(e) => {
|
||||
if let Some(fallback_url) = self.fallback_url.clone() {
|
||||
log::warn!(
|
||||
"Streaming chat primary failed ({}); trying fallback {}",
|
||||
e,
|
||||
fallback_url
|
||||
);
|
||||
self.try_chat_with_tools_stream(&fallback_url, messages, tools)
|
||||
.await
|
||||
} else {
|
||||
Err(e)
|
||||
// Same preference logic as `chat_with_tools`. Only the initial
|
||||
// connection is retried across servers — once the stream begins,
|
||||
// mid-stream errors propagate to the caller.
|
||||
let order = self.attempt_order();
|
||||
let mut last_err: Option<anyhow::Error> = None;
|
||||
|
||||
for (label, url, _model) in &order {
|
||||
match self
|
||||
.try_chat_with_tools_stream(url, messages.clone(), tools.clone())
|
||||
.await
|
||||
{
|
||||
Ok(s) => {
|
||||
self.prefer_fallback
|
||||
.store(*label == "fallback", Ordering::Relaxed);
|
||||
return Ok(s);
|
||||
}
|
||||
Err(e) => {
|
||||
log::warn!("Streaming chat on {} server failed: {}", label, e);
|
||||
last_err = Some(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(last_err.unwrap_or_else(|| anyhow::anyhow!("No Ollama server configured")))
|
||||
}
|
||||
|
||||
async fn try_chat_with_tools_stream(
|
||||
@@ -859,8 +907,12 @@ Analyze the image and use specific details from both the visual content and the
|
||||
if !response.status().is_success() {
|
||||
let status = response.status();
|
||||
let body = response.text().await.unwrap_or_default();
|
||||
log::error!(
|
||||
"chat_with_tools request body that caused {}: {}",
|
||||
// warn, not error — the outer `chat_with_tools` may recover via
|
||||
// the fallback server. When both fail, the outer layer emits the
|
||||
// actual error log.
|
||||
log::warn!(
|
||||
"chat_with_tools request to {} got {}: {}",
|
||||
base_url,
|
||||
status,
|
||||
request_json
|
||||
);
|
||||
|
||||
Reference in New Issue
Block a user