From 0ebc2e9003338ca1ecc7a47fb0b1e57d322cff6d Mon Sep 17 00:00:00 2001 From: Cameron Date: Fri, 24 Apr 2026 16:19:45 -0400 Subject: [PATCH] feat(ai): rerank timing + think:false + OpenRouter error detail MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - search_rag reranker now logs wall-clock time around the ollama.generate call, the candidate count / top-N going in, and the final reordering. The "final indices" + swap-count line is info level so it's always visible; detailed before/after previews stay at debug for when you want to inspect reranker quality. - New OllamaClient::generate_no_think convenience that sets Ollama's top-level think:false on the request, plumbed through try_generate via a new internal generate_with_options. Used only by the reranker today; avoids the chain-of-thought tax on reasoning models (Qwen3/VL, DeepSeek-R1 distills, GPT-OSS) when the task has nothing to reason about. Server-side no-op on non-reasoning models. - OpenRouter chat_with_tools "missing choices[0]" error now includes the actual response body — extracts structured {error: {code, message}} when OpenRouter surfaces it (common for upstream-provider issues like rate limits and content moderation), otherwise falls back to a truncated raw-JSON view. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/ai/insight_generator.rs | 46 ++++++++++++++++++++++++++++++++++++- src/ai/ollama.rs | 38 +++++++++++++++++++++++++++++- src/ai/openrouter.rs | 43 ++++++++++++++++++++++++++++++---- 3 files changed, 121 insertions(+), 6 deletions(-) diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index d718b7e..44a2a4e 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -1547,6 +1547,14 @@ Return ONLY the summary, nothing else."#, limit: usize, ollama: &OllamaClient, ) -> Result> { + let query_preview: String = query.chars().take(60).collect(); + log::info!( + "rerank: {} candidates -> top {} (query=\"{}\")", + candidates.len(), + limit, + query_preview + ); + // Build numbered list (1-based for readability). Cap each passage // at ~1000 chars so very long summaries don't eat the prompt. let numbered: String = candidates @@ -1574,14 +1582,20 @@ Return ONLY the summary, nothing else."#, limit, query, numbered, limit ); + let started = std::time::Instant::now(); let response = ollama - .generate( + .generate_no_think( &prompt, Some( "You are a terse relevance ranker. You output only numbers separated by commas.", ), ) .await?; + log::info!( + "rerank: finished in {} ms (prompt={} chars)", + started.elapsed().as_millis(), + prompt.len() + ); // Extract indices from the response. Accept "3, 1, 7" and also // tolerate "[3, 1, 7]" or "3,1,7,..." with trailing junk. @@ -1600,9 +1614,11 @@ Return ONLY the summary, nothing else."#, let mut seen = std::collections::HashSet::new(); let mut reordered: Vec = Vec::with_capacity(limit); + let mut final_indices: Vec = Vec::with_capacity(limit); for n in picks { if seen.insert(n) { reordered.push(candidates[n - 1].clone()); + final_indices.push(n); if reordered.len() >= limit { break; } @@ -1614,12 +1630,40 @@ Return ONLY the summary, nothing else."#, for (i, c) in candidates.iter().enumerate() { if !seen.contains(&(i + 1)) { reordered.push(c.clone()); + final_indices.push(i + 1); if reordered.len() >= limit { break; } } } } + + // Debug snapshot: show what the reranker changed. Position p holds + // the 1-based index of the candidate that now sits at position p. + // A value that equals its position means "no change at that slot". + let swapped = final_indices + .iter() + .enumerate() + .filter(|(pos, idx)| **idx != pos + 1) + .count(); + log::info!( + "rerank: final indices (1-based): {:?} — {} of top {} swapped from vector order", + final_indices, + swapped, + final_indices.len() + ); + let show = final_indices.len().min(5); + log::debug!("rerank: vector-order top {}:", show); + for (i, c) in candidates.iter().enumerate().take(show) { + let preview: String = c.chars().take(100).collect(); + log::debug!("rerank: [{}] {}", i + 1, preview); + } + log::debug!("rerank: reranked top {}:", show); + for (pos, idx) in final_indices.iter().enumerate().take(show) { + let preview: String = candidates[*idx - 1].chars().take(100).collect(); + log::debug!("rerank: [{}] (orig #{}) {}", pos + 1, idx, preview); + } + Ok(reordered) } diff --git a/src/ai/ollama.rs b/src/ai/ollama.rs index eb810d2..c56e1e7 100644 --- a/src/ai/ollama.rs +++ b/src/ai/ollama.rs @@ -381,6 +381,7 @@ impl OllamaClient { prompt: &str, system: Option<&str>, images: Option>, + think: Option, ) -> Result { let request = OllamaRequest { model: model.to_string(), @@ -389,6 +390,7 @@ impl OllamaClient { system: system.map(|s| s.to_string()), options: self.build_options(), images, + think, }; let response = self @@ -422,11 +424,31 @@ impl OllamaClient { self.generate_with_images(prompt, system, None).await } + /// Variant of `generate` that sets Ollama's top-level `think: false`. + /// Used by latency-sensitive callers like the rerank pass, where the + /// task has nothing to reason about and chain-of-thought tokens are + /// wasted wall time. Server-side no-op on non-reasoning models. + pub async fn generate_no_think(&self, prompt: &str, system: Option<&str>) -> Result { + self.generate_with_options(prompt, system, None, Some(false)) + .await + } + pub async fn generate_with_images( &self, prompt: &str, system: Option<&str>, images: Option>, + ) -> Result { + self.generate_with_options(prompt, system, images, None) + .await + } + + async fn generate_with_options( + &self, + prompt: &str, + system: Option<&str>, + images: Option>, + think: Option, ) -> Result { log::debug!("=== Ollama Request ==="); log::debug!("Primary model: {}", self.primary_model); @@ -452,6 +474,7 @@ impl OllamaClient { prompt, system, images.clone(), + think, ) .await; @@ -475,7 +498,14 @@ impl OllamaClient { fallback_model ); match self - .try_generate(fallback_url, fallback_model, prompt, system, images.clone()) + .try_generate( + fallback_url, + fallback_model, + prompt, + system, + images.clone(), + think, + ) .await { Ok(response) => { @@ -1134,6 +1164,12 @@ struct OllamaRequest { options: Option, #[serde(skip_serializing_if = "Option::is_none")] images: Option>, + /// Ollama's top-level reasoning-mode toggle (~0.4+). `Some(false)` + /// asks the server to skip thinking on models that expose a toggle + /// (Qwen3, Ollama-integrated DeepSeek-R1 distills, GPT-OSS, etc). + /// Ignored by non-reasoning models. None = use the model's default. + #[serde(skip_serializing_if = "Option::is_none")] + think: Option, } #[derive(Serialize)] diff --git a/src/ai/openrouter.rs b/src/ai/openrouter.rs index 82195cf..62a3cca 100644 --- a/src/ai/openrouter.rs +++ b/src/ai/openrouter.rs @@ -360,10 +360,18 @@ impl LlmClient for OpenRouterClient { .get("choices") .and_then(|v| v.as_array()) .and_then(|a| a.first()) - .ok_or_else(|| anyhow!("response missing choices[0]"))?; - let msg = choice - .get("message") - .ok_or_else(|| anyhow!("choices[0] missing message"))?; + .ok_or_else(|| { + anyhow!( + "response missing choices[0]: {}", + extract_openrouter_error_detail(&parsed) + ) + })?; + let msg = choice.get("message").ok_or_else(|| { + anyhow!( + "choices[0] missing message: {}", + extract_openrouter_error_detail(&parsed) + ) + })?; let chat_msg = Self::openai_message_to_chat(msg)?; let usage = parsed.get("usage"); @@ -687,6 +695,33 @@ impl LlmClient for OpenRouterClient { } } +/// Extract a diagnostic fragment from an OpenRouter response body that +/// doesn't match the expected `{choices: [...]}` shape. OpenRouter will +/// sometimes return 200 OK with `{"error": {"message": "...", "code": ...}}` +/// when the upstream provider (Anthropic/OpenAI/Google/etc) errored out +/// — rate limits, content moderation, model overload, provider timeout. +/// Surface the structured error if present; otherwise fall back to a +/// truncated raw-JSON view so the log line is actionable. +fn extract_openrouter_error_detail(parsed: &Value) -> String { + if let Some(err) = parsed.get("error") { + let message = err + .get("message") + .and_then(|v| v.as_str()) + .unwrap_or("(no message)"); + let code = err + .get("code") + .map(|v| match v { + Value::String(s) => s.clone(), + other => other.to_string(), + }) + .unwrap_or_else(|| "?".to_string()); + let short_message: String = message.chars().take(240).collect(); + return format!("error code={} message=\"{}\"", code, short_message); + } + let raw = parsed.to_string(); + raw.chars().take(300).collect() +} + /// Find the byte offset of the first `\n\n` (end of an SSE frame) in `buf`. /// Returns the index of the first `\n` of the pair, so the full separator is /// `buf[idx..=idx+1]`. Also handles `\r\n\r\n` since some servers emit it.