feat(ai): rerank timing + think:false + OpenRouter error detail

- search_rag's reranker now logs wall-clock time around the
  ollama.generate call, the candidate count / top-N going in, and the
  final reordering (timing sketch below). The "final indices" +
  swap-count line is info level so it's always visible; detailed
  before/after previews stay at debug for when you want to inspect
  reranker quality.
- New OllamaClient::generate_no_think convenience that sets Ollama's
  top-level think:false on the request, plumbed through try_generate via
  a new internal generate_with_options (call-site sketch below). Used
  only by the reranker today; it avoids the chain-of-thought tax on
  reasoning models (Qwen3/VL, DeepSeek-R1 distills, GPT-OSS) when the
  task has nothing to reason about, and is a server-side no-op on
  non-reasoning models.
- OpenRouter chat_with_tools "missing choices[0]" error now includes the
  actual response body: it extracts the structured {error: {code,
  message}} object when OpenRouter surfaces one (common for
  upstream-provider issues like rate limits and content moderation), and
  otherwise falls back to a truncated raw-JSON view (extraction sketch
  below).
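
The timing side is just a wall-clock stopwatch around the generate call.
A minimal sketch; the function and variable names here stand in for the
real reranker state, while OllamaClient and the Result alias are the
ones from the diff below:

    use std::time::Instant;

    async fn time_rerank_call(
        client: &OllamaClient,
        rerank_prompt: &str,
        candidates: &[String],
        top_n: usize,
    ) -> Result<String> {
        // Going-in stats: candidate count and how many we intend to keep.
        log::info!(
            "rerank: {} candidates in, keeping top {}",
            candidates.len(),
            top_n
        );

        let started = Instant::now();
        let response = client.generate_no_think(rerank_prompt, None).await?;

        // Always-visible summary; per-candidate previews stay at debug.
        log::info!("rerank generate took {}ms", started.elapsed().as_millis());
        log::debug!("rerank raw response: {}", response);
        Ok(response)
    }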
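
For callers the new surface is a one-liner. An illustrative call site
(the prompt and system strings are made up); the only wire difference
from generate is a top-level "think": false in the request JSON:

    // Hypothetical helper showing the intended usage pattern.
    async fn rank_passages(client: &OllamaClient) -> Result<String> {
        client
            .generate_no_think(
                "Order these passages by relevance; output indices only.",
                Some("You are a reranker."),
            )
            .await
    }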
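
The OpenRouter error-detail extraction follows this shape. A sketch
only, not the exact helper in the commit: the function name and the
truncation length are illustrative, but the {error: {code, message}}
body shape is what OpenRouter returns for upstream failures:

    use serde_json::Value;

    fn describe_bad_response(body: &Value) -> String {
        // OpenRouter reports upstream-provider failures (rate limits,
        // content moderation, ...) as {"error": {"code", "message"}}
        // in place of a choices array.
        if let Some(err) = body.get("error") {
            let code = err
                .get("code")
                .map(|c| c.to_string())
                .unwrap_or_default();
            let message = err
                .get("message")
                .and_then(|m| m.as_str())
                .unwrap_or("unknown error");
            return format!("OpenRouter error {code}: {message}");
        }
        // No structured error: fall back to a truncated raw-JSON view.
        let raw = body.to_string();
        let preview: String = raw.chars().take(200).collect();
        format!("missing choices[0]; body: {preview}")
    }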

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

@@ -381,6 +381,7 @@ impl OllamaClient {
         prompt: &str,
         system: Option<&str>,
         images: Option<Vec<String>>,
+        think: Option<bool>,
     ) -> Result<String> {
         let request = OllamaRequest {
             model: model.to_string(),
@@ -389,6 +390,7 @@ impl OllamaClient {
             system: system.map(|s| s.to_string()),
             options: self.build_options(),
             images,
+            think,
         };
 
         let response = self
@@ -422,11 +424,31 @@ impl OllamaClient {
         self.generate_with_images(prompt, system, None).await
     }
 
+    /// Variant of `generate` that sets Ollama's top-level `think: false`.
+    /// Used by latency-sensitive callers like the rerank pass, where the
+    /// task has nothing to reason about and chain-of-thought tokens are
+    /// wasted wall time. Server-side no-op on non-reasoning models.
+    pub async fn generate_no_think(&self, prompt: &str, system: Option<&str>) -> Result<String> {
+        self.generate_with_options(prompt, system, None, Some(false))
+            .await
+    }
+
     pub async fn generate_with_images(
         &self,
         prompt: &str,
         system: Option<&str>,
         images: Option<Vec<String>>,
     ) -> Result<String> {
+        self.generate_with_options(prompt, system, images, None)
+            .await
+    }
+
+    async fn generate_with_options(
+        &self,
+        prompt: &str,
+        system: Option<&str>,
+        images: Option<Vec<String>>,
+        think: Option<bool>,
+    ) -> Result<String> {
         log::debug!("=== Ollama Request ===");
         log::debug!("Primary model: {}", self.primary_model);
@@ -452,6 +474,7 @@ impl OllamaClient {
                 prompt,
                 system,
                 images.clone(),
+                think,
             )
             .await;
 
@@ -475,7 +498,14 @@ impl OllamaClient {
                 fallback_model
             );
             match self
-                .try_generate(fallback_url, fallback_model, prompt, system, images.clone())
+                .try_generate(
+                    fallback_url,
+                    fallback_model,
+                    prompt,
+                    system,
+                    images.clone(),
+                    think,
+                )
                 .await
             {
                 Ok(response) => {
@@ -1134,6 +1164,12 @@ struct OllamaRequest {
     options: Option<OllamaOptions>,
     #[serde(skip_serializing_if = "Option::is_none")]
     images: Option<Vec<String>>,
+    /// Ollama's top-level reasoning-mode toggle (~0.4+). `Some(false)`
+    /// asks the server to skip thinking on models that expose a toggle
+    /// (Qwen3, Ollama-integrated DeepSeek-R1 distills, GPT-OSS, etc).
+    /// Ignored by non-reasoning models. None = use the model's default.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    think: Option<bool>,
 }
 
 #[derive(Serialize)]