From 0ebc2e9003338ca1ecc7a47fb0b1e57d322cff6d Mon Sep 17 00:00:00 2001
From: Cameron <ccordes12@mail.bw.edu>
Date: Fri, 24 Apr 2026 16:19:45 -0400
Subject: [PATCH] feat(ai): rerank timing + think:false + OpenRouter error
 detail
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- search_rag reranker now logs wall-clock time around the ollama.generate
  call, the candidate count / top-N going in, and the final reordering.
  The "final indices" + swap-count line is info level so it's always
  visible; detailed before/after previews stay at debug for when you want
  to inspect reranker quality.
- New OllamaClient::generate_no_think convenience that sets Ollama's
  top-level think:false on the request, plumbed through try_generate via
  a new internal generate_with_options. Used only by the reranker today;
  avoids the chain-of-thought tax on reasoning models (Qwen3/VL,
  DeepSeek-R1 distills, GPT-OSS) when the task has nothing to reason
  about. Server-side no-op on non-reasoning models.
- OpenRouter chat_with_tools "missing choices[0]" error now includes the
  actual response body — extracts structured {error: {code, message}}
  when OpenRouter surfaces it (common for upstream-provider issues like
  rate limits and content moderation), otherwise falls back to a
  truncated raw-JSON view.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 src/ai/insight_generator.rs | 46 ++++++++++++++++++++++++++++++++++++-
 src/ai/ollama.rs            | 38 +++++++++++++++++++++++++++++-
 src/ai/openrouter.rs        | 43 ++++++++++++++++++++++++++++++----
 3 files changed, 121 insertions(+), 6 deletions(-)
diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs
index d718b7e..44a2a4e 100644
--- a/src/ai/insight_generator.rs
+++ b/src/ai/insight_generator.rs
@@ -1547,6 +1547,14 @@ Return ONLY the summary, nothing else."#,
         limit: usize,
         ollama: &OllamaClient,
     ) -> Result<Vec<String>> {
+        let query_preview: String = query.chars().take(60).collect();
+        log::info!(
+            "rerank: {} candidates -> top {} (query=\"{}\")",
+            candidates.len(),
+            limit,
+            query_preview
+        );
+
         // Build numbered list (1-based for readability). Cap each passage
         // at ~1000 chars so very long summaries don't eat the prompt.
         let numbered: String = candidates
@@ -1574,14 +1582,20 @@ Return ONLY the summary, nothing else."#,
             limit, query, numbered, limit
         );
 
+        let started = std::time::Instant::now();
         let response = ollama
-            .generate(
+            .generate_no_think(
                 &prompt,
                 Some(
                     "You are a terse relevance ranker. You output only numbers separated by commas.",
                 ),
             )
             .await?;
+        log::info!(
+            "rerank: finished in {} ms (prompt={} chars)",
+            started.elapsed().as_millis(),
+            prompt.len()
+        );
 
         // Extract indices from the response. Accept "3, 1, 7" and also
         // tolerate "[3, 1, 7]" or "3,1,7,..." with trailing junk.
@@ -1600,9 +1614,11 @@ Return ONLY the summary, nothing else."#,
 
         let mut seen = std::collections::HashSet::new();
         let mut reordered: Vec<String> = Vec::with_capacity(limit);
+        let mut final_indices: Vec<usize> = Vec::with_capacity(limit);
         for n in picks {
             if seen.insert(n) {
                 reordered.push(candidates[n - 1].clone());
+                final_indices.push(n);
                 if reordered.len() >= limit {
                     break;
                 }
@@ -1614,12 +1630,40 @@ Return ONLY the summary, nothing else."#,
             for (i, c) in candidates.iter().enumerate() {
                 if !seen.contains(&(i + 1)) {
                     reordered.push(c.clone());
+                    final_indices.push(i + 1);
                     if reordered.len() >= limit {
                         break;
                     }
                 }
             }
         }
+
+        // Debug snapshot: show what the reranker changed. Position p holds
+        // the 1-based index of the candidate that now sits at position p.
+        // A value that equals its position means "no change at that slot".
+        let swapped = final_indices
+            .iter()
+            .enumerate()
+            .filter(|(pos, idx)| **idx != pos + 1)
+            .count();
+        log::info!(
+            "rerank: final indices (1-based): {:?} — {} of top {} swapped from vector order",
+            final_indices,
+            swapped,
+            final_indices.len()
+        );
+        let show = final_indices.len().min(5);
+        log::debug!("rerank: vector-order top {}:", show);
+        for (i, c) in candidates.iter().enumerate().take(show) {
+            let preview: String = c.chars().take(100).collect();
+            log::debug!("rerank:   [{}] {}", i + 1, preview);
+        }
+        log::debug!("rerank: reranked top {}:", show);
+        for (pos, idx) in final_indices.iter().enumerate().take(show) {
+            let preview: String = candidates[*idx - 1].chars().take(100).collect();
+            log::debug!("rerank:   [{}] (orig #{}) {}", pos + 1, idx, preview);
+        }
+
         Ok(reordered)
     }
 
diff --git a/src/ai/ollama.rs b/src/ai/ollama.rs
index eb810d2..c56e1e7 100644
--- a/src/ai/ollama.rs
+++ b/src/ai/ollama.rs
@@ -381,6 +381,7 @@ impl OllamaClient {
         prompt: &str,
         system: Option<&str>,
         images: Option<Vec<String>>,
+        think: Option<bool>,
     ) -> Result<String> {
         let request = OllamaRequest {
             model: model.to_string(),
@@ -389,6 +390,7 @@ impl OllamaClient {
             system: system.map(|s| s.to_string()),
             options: self.build_options(),
             images,
+            think,
         };
 
         let response = self
@@ -422,11 +424,31 @@ impl OllamaClient {
         self.generate_with_images(prompt, system, None).await
     }
 
+    /// Variant of `generate` that sets Ollama's top-level `think: false`.
+    /// Used by latency-sensitive callers like the rerank pass, where the
+    /// task has nothing to reason about and chain-of-thought tokens are
+    /// wasted wall time. Server-side no-op on non-reasoning models.
+    pub async fn generate_no_think(&self, prompt: &str, system: Option<&str>) -> Result<String> {
+        self.generate_with_options(prompt, system, None, Some(false))
+            .await
+    }
+
     pub async fn generate_with_images(
         &self,
         prompt: &str,
         system: Option<&str>,
         images: Option<Vec<String>>,
+    ) -> Result<String> {
+        self.generate_with_options(prompt, system, images, None)
+            .await
+    }
+
+    async fn generate_with_options(
+        &self,
+        prompt: &str,
+        system: Option<&str>,
+        images: Option<Vec<String>>,
+        think: Option<bool>,
     ) -> Result<String> {
         log::debug!("=== Ollama Request ===");
         log::debug!("Primary model: {}", self.primary_model);
@@ -452,6 +474,7 @@ impl OllamaClient {
                 prompt,
                 system,
                 images.clone(),
+                think,
             )
             .await;
 
@@ -475,7 +498,14 @@ impl OllamaClient {
                         fallback_model
                     );
                     match self
-                        .try_generate(fallback_url, fallback_model, prompt, system, images.clone())
+                        .try_generate(
+                            fallback_url,
+                            fallback_model,
+                            prompt,
+                            system,
+                            images.clone(),
+                            think,
+                        )
                         .await
                     {
                         Ok(response) => {
@@ -1134,6 +1164,12 @@ struct OllamaRequest {
     options: Option<OllamaOptions>,
     #[serde(skip_serializing_if = "Option::is_none")]
     images: Option<Vec<String>>,
+    /// Ollama's top-level reasoning-mode toggle (~0.4+). `Some(false)`
+    /// asks the server to skip thinking on models that expose a toggle
+    /// (Qwen3, Ollama-integrated DeepSeek-R1 distills, GPT-OSS, etc).
+    /// Ignored by non-reasoning models. None = use the model's default.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    think: Option<bool>,
 }
 
 #[derive(Serialize)]
diff --git a/src/ai/openrouter.rs b/src/ai/openrouter.rs
index 82195cf..62a3cca 100644
--- a/src/ai/openrouter.rs
+++ b/src/ai/openrouter.rs
@@ -360,10 +360,18 @@ impl LlmClient for OpenRouterClient {
             .get("choices")
             .and_then(|v| v.as_array())
             .and_then(|a| a.first())
-            .ok_or_else(|| anyhow!("response missing choices[0]"))?;
-        let msg = choice
-            .get("message")
-            .ok_or_else(|| anyhow!("choices[0] missing message"))?;
+            .ok_or_else(|| {
+                anyhow!(
+                    "response missing choices[0]: {}",
+                    extract_openrouter_error_detail(&parsed)
+                )
+            })?;
+        let msg = choice.get("message").ok_or_else(|| {
+            anyhow!(
+                "choices[0] missing message: {}",
+                extract_openrouter_error_detail(&parsed)
+            )
+        })?;
         let chat_msg = Self::openai_message_to_chat(msg)?;
 
         let usage = parsed.get("usage");
@@ -687,6 +695,33 @@ impl LlmClient for OpenRouterClient {
     }
 }
 
+/// Extract a diagnostic fragment from an OpenRouter response body that
+/// doesn't match the expected `{choices: [...]}` shape. OpenRouter will
+/// sometimes return 200 OK with `{"error": {"message": "...", "code": ...}}`
+/// when the upstream provider (Anthropic/OpenAI/Google/etc) errored out
+/// — rate limits, content moderation, model overload, provider timeout.
+/// Surface the structured error if present; otherwise fall back to a
+/// truncated raw-JSON view so the log line is actionable.
+fn extract_openrouter_error_detail(parsed: &Value) -> String {
+    if let Some(err) = parsed.get("error") {
+        let message = err
+            .get("message")
+            .and_then(|v| v.as_str())
+            .unwrap_or("(no message)");
+        let code = err
+            .get("code")
+            .map(|v| match v {
+                Value::String(s) => s.clone(),
+                other => other.to_string(),
+            })
+            .unwrap_or_else(|| "?".to_string());
+        let short_message: String = message.chars().take(240).collect();
+        return format!("error code={} message=\"{}\"", code, short_message);
+    }
+    let raw = parsed.to_string();
+    raw.chars().take(300).collect()
+}
+
 /// Find the byte offset of the first `\n\n` (end of an SSE frame) in `buf`.
 /// Returns the index of the first `\n` of the pair, so the full separator is
 /// `buf[idx..=idx+1]`. Also handles `\r\n\r\n` since some servers emit it.