feat(ai): few-shot exemplars + sticky Ollama preference
- Few-shot injection on /insights/generate/agentic: compresses prior training_messages into trajectory blocks (tool calls + result summaries) and injects into the system prompt. Hardcoded default ids with optional request override. - New fewshot_source_ids column on photo_insights (+ migration) to track which exemplars influenced a given row, for downstream training-set filtering. Chat amend rows stamp None with a lineage note. - Ollama client now remembers which server (primary/fallback) most recently succeeded and tries it first on the next call, via a shared Arc<AtomicBool>. Avoids re-404ing the primary on every agent iteration when the chosen model only lives on the fallback. - Demote noisy logs: daily_summary "Summary match" lines to debug; inner chat_with_tools non-2xx body log from error to warn (outer layer owns the terminal-error signal). - Drift-guard tests for summarize_tool_result covering the success / empty / error / unknown shapes for every tool. - Tidy: three pre-existing clippy warnings cleaned up. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
194
src/ai/ollama.rs
194
src/ai/ollama.rs
@@ -4,6 +4,7 @@ use chrono::NaiveDate;
|
||||
use reqwest::Client;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::sync::atomic::{AtomicBool, Ordering};
|
||||
use std::sync::{Arc, Mutex};
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
@@ -19,6 +20,19 @@ pub use crate::ai::llm_client::{ToolCall, ToolCallFunction, ToolFunction};
|
||||
// Cache duration: 15 minutes
|
||||
const CACHE_DURATION_SECS: u64 = 15 * 60;
|
||||
|
||||
/// Default total request timeout for generation calls, in seconds.
/// Overridable via `OLLAMA_REQUEST_TIMEOUT_SECONDS` env var for slow
/// CPU-offloaded models where inference can take several minutes.
const DEFAULT_REQUEST_TIMEOUT_SECS: u64 = 120;

/// Resolve the effective request timeout in seconds.
///
/// A positive integer value of `OLLAMA_REQUEST_TIMEOUT_SECONDS` wins;
/// an unset variable, an unparsable value, or zero all fall back to
/// [`DEFAULT_REQUEST_TIMEOUT_SECS`].
fn configured_request_timeout_secs() -> u64 {
    match std::env::var("OLLAMA_REQUEST_TIMEOUT_SECONDS") {
        Ok(raw) => match raw.parse::<u64>() {
            // Zero would disable the timeout semantics callers expect,
            // so it is treated the same as an invalid value.
            Ok(secs) if secs > 0 => secs,
            _ => DEFAULT_REQUEST_TIMEOUT_SECS,
        },
        Err(_) => DEFAULT_REQUEST_TIMEOUT_SECS,
    }
}
|
||||
|
||||
/// Embedding model used across the app. Callers that persist a
|
||||
/// `model_version` alongside an embedding should read this constant so the
|
||||
/// stored label always matches what `generate_embeddings` actually ran.
|
||||
@@ -65,6 +79,12 @@ pub struct OllamaClient {
|
||||
top_p: Option<f32>,
|
||||
top_k: Option<i32>,
|
||||
min_p: Option<f32>,
|
||||
/// Sticky preference shared across clones: when the fallback server
|
||||
/// succeeded most recently, try it first on the next call. Avoids
|
||||
/// re-probing the primary with a model it doesn't have loaded across
|
||||
/// every iteration of the agent loop. `Arc<AtomicBool>` so cloning
|
||||
/// `OllamaClient` shares the flag rather than resetting it.
|
||||
prefer_fallback: Arc<AtomicBool>,
|
||||
}
|
||||
|
||||
impl OllamaClient {
|
||||
@@ -77,7 +97,7 @@ impl OllamaClient {
|
||||
Self {
|
||||
client: Client::builder()
|
||||
.connect_timeout(Duration::from_secs(5)) // Quick connection timeout
|
||||
.timeout(Duration::from_secs(120)) // Total request timeout for generation
|
||||
.timeout(Duration::from_secs(configured_request_timeout_secs()))
|
||||
.build()
|
||||
.unwrap_or_else(|_| Client::new()),
|
||||
primary_url,
|
||||
@@ -89,9 +109,44 @@ impl OllamaClient {
|
||||
top_p: None,
|
||||
top_k: None,
|
||||
min_p: None,
|
||||
prefer_fallback: Arc::new(AtomicBool::new(false)),
|
||||
}
|
||||
}
|
||||
|
||||
/// Return the server attempt order as `(label, url, model)` tuples.
|
||||
/// Respects the sticky `prefer_fallback` flag so the most recently
|
||||
/// successful server is tried first.
|
||||
fn attempt_order(&self) -> Vec<(&'static str, String, String)> {
|
||||
let primary = (
|
||||
"primary",
|
||||
self.primary_url.clone(),
|
||||
self.primary_model.clone(),
|
||||
);
|
||||
let fallback = self.fallback_url.as_ref().map(|url| {
|
||||
let model = self
|
||||
.fallback_model
|
||||
.clone()
|
||||
.unwrap_or_else(|| self.primary_model.clone());
|
||||
("fallback", url.clone(), model)
|
||||
});
|
||||
|
||||
let prefer_fallback = fallback.is_some() && self.prefer_fallback.load(Ordering::Relaxed);
|
||||
|
||||
let mut order = Vec::with_capacity(2);
|
||||
if prefer_fallback {
|
||||
if let Some(fb) = fallback.clone() {
|
||||
order.push(fb);
|
||||
}
|
||||
order.push(primary);
|
||||
} else {
|
||||
order.push(primary);
|
||||
if let Some(fb) = fallback {
|
||||
order.push(fb);
|
||||
}
|
||||
}
|
||||
order
|
||||
}
|
||||
|
||||
pub fn set_num_ctx(&mut self, num_ctx: Option<i32>) {
|
||||
self.num_ctx = num_ctx;
|
||||
}
|
||||
@@ -587,68 +642,57 @@ Analyze the image and use specific details from both the visual content and the
|
||||
|
||||
/// Send a chat request with tool definitions to /api/chat.
|
||||
/// Returns the assistant's response message (may contain tool_calls or final content).
|
||||
/// Uses primary/fallback URL routing same as other generation methods.
|
||||
/// Tries servers in preference order — most recently successful first —
|
||||
/// so a fallback-only model doesn't re-404 against the primary on every
|
||||
/// iteration of the agent loop.
|
||||
pub async fn chat_with_tools(
|
||||
&self,
|
||||
messages: Vec<ChatMessage>,
|
||||
tools: Vec<Tool>,
|
||||
) -> Result<(ChatMessage, Option<i32>, Option<i32>)> {
|
||||
// Try primary server first
|
||||
log::info!(
|
||||
"Attempting chat_with_tools with primary server: {} (model: {})",
|
||||
self.primary_url,
|
||||
self.primary_model
|
||||
);
|
||||
let primary_result = self
|
||||
.try_chat_with_tools(&self.primary_url, messages.clone(), tools.clone())
|
||||
.await;
|
||||
|
||||
match primary_result {
|
||||
Ok(result) => {
|
||||
log::info!("Successfully got chat_with_tools response from primary server");
|
||||
Ok(result)
|
||||
}
|
||||
Err(e) => {
|
||||
log::warn!("Primary server chat_with_tools failed: {}", e);
|
||||
|
||||
// Try fallback server if available
|
||||
if let Some(fallback_url) = &self.fallback_url {
|
||||
let fallback_model =
|
||||
self.fallback_model.as_ref().unwrap_or(&self.primary_model);
|
||||
let order = self.attempt_order();
|
||||
let mut errors: Vec<String> = Vec::new();
|
||||
|
||||
for (label, url, model) in &order {
|
||||
log::info!(
|
||||
"Attempting chat_with_tools with {} server: {} (model: {})",
|
||||
label,
|
||||
url,
|
||||
model
|
||||
);
|
||||
match self
|
||||
.try_chat_with_tools(url, messages.clone(), tools.clone())
|
||||
.await
|
||||
{
|
||||
Ok(result) => {
|
||||
log::info!(
|
||||
"Attempting chat_with_tools with fallback server: {} (model: {})",
|
||||
fallback_url,
|
||||
fallback_model
|
||||
"Successfully got chat_with_tools response from {} server",
|
||||
label
|
||||
);
|
||||
match self
|
||||
.try_chat_with_tools(fallback_url, messages, tools)
|
||||
.await
|
||||
{
|
||||
Ok(result) => {
|
||||
log::info!(
|
||||
"Successfully got chat_with_tools response from fallback server"
|
||||
);
|
||||
Ok(result)
|
||||
}
|
||||
Err(fallback_e) => {
|
||||
log::error!(
|
||||
"Fallback server chat_with_tools also failed: {}",
|
||||
fallback_e
|
||||
);
|
||||
Err(anyhow::anyhow!(
|
||||
"Both primary and fallback servers failed. Primary: {}, Fallback: {}",
|
||||
e,
|
||||
fallback_e
|
||||
))
|
||||
}
|
||||
}
|
||||
} else {
|
||||
log::error!("No fallback server configured");
|
||||
Err(e)
|
||||
self.prefer_fallback
|
||||
.store(*label == "fallback", Ordering::Relaxed);
|
||||
return Ok(result);
|
||||
}
|
||||
Err(e) => {
|
||||
log::warn!("{} server chat_with_tools failed: {}", label, e);
|
||||
errors.push(format!("{}: {}", label, e));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if order.len() <= 1 {
|
||||
log::error!("No fallback server configured; chat_with_tools exhausted");
|
||||
} else {
|
||||
log::error!(
|
||||
"All {} servers failed for chat_with_tools ({})",
|
||||
order.len(),
|
||||
errors.join(" / ")
|
||||
);
|
||||
}
|
||||
Err(anyhow::anyhow!(
|
||||
"chat_with_tools failed on all servers: {}",
|
||||
errors.join(" / ")
|
||||
))
|
||||
}
|
||||
|
||||
/// Streaming variant of `chat_with_tools`. Tries primary, then falls
|
||||
@@ -662,26 +706,30 @@ Analyze the image and use specific details from both the visual content and the
|
||||
messages: Vec<ChatMessage>,
|
||||
tools: Vec<Tool>,
|
||||
) -> Result<BoxStream<'static, Result<LlmStreamEvent>>> {
|
||||
// Attempt primary. If it can't be opened at all, try fallback.
|
||||
match self
|
||||
.try_chat_with_tools_stream(&self.primary_url, messages.clone(), tools.clone())
|
||||
.await
|
||||
{
|
||||
Ok(s) => Ok(s),
|
||||
Err(e) => {
|
||||
if let Some(fallback_url) = self.fallback_url.clone() {
|
||||
log::warn!(
|
||||
"Streaming chat primary failed ({}); trying fallback {}",
|
||||
e,
|
||||
fallback_url
|
||||
);
|
||||
self.try_chat_with_tools_stream(&fallback_url, messages, tools)
|
||||
.await
|
||||
} else {
|
||||
Err(e)
|
||||
// Same preference logic as `chat_with_tools`. Only the initial
|
||||
// connection is retried across servers — once the stream begins,
|
||||
// mid-stream errors propagate to the caller.
|
||||
let order = self.attempt_order();
|
||||
let mut last_err: Option<anyhow::Error> = None;
|
||||
|
||||
for (label, url, _model) in &order {
|
||||
match self
|
||||
.try_chat_with_tools_stream(url, messages.clone(), tools.clone())
|
||||
.await
|
||||
{
|
||||
Ok(s) => {
|
||||
self.prefer_fallback
|
||||
.store(*label == "fallback", Ordering::Relaxed);
|
||||
return Ok(s);
|
||||
}
|
||||
Err(e) => {
|
||||
log::warn!("Streaming chat on {} server failed: {}", label, e);
|
||||
last_err = Some(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Err(last_err.unwrap_or_else(|| anyhow::anyhow!("No Ollama server configured")))
|
||||
}
|
||||
|
||||
async fn try_chat_with_tools_stream(
|
||||
@@ -859,8 +907,12 @@ Analyze the image and use specific details from both the visual content and the
|
||||
if !response.status().is_success() {
|
||||
let status = response.status();
|
||||
let body = response.text().await.unwrap_or_default();
|
||||
log::error!(
|
||||
"chat_with_tools request body that caused {}: {}",
|
||||
// warn, not error — the outer `chat_with_tools` may recover via
|
||||
// the fallback server. When both fail, the outer layer emits the
|
||||
// actual error log.
|
||||
log::warn!(
|
||||
"chat_with_tools request to {} got {}: {}",
|
||||
base_url,
|
||||
status,
|
||||
request_json
|
||||
);
|
||||
|
||||
Reference in New Issue
Block a user