feat(ai): rerank timing + think:false + OpenRouter error detail

- search_rag's reranker now logs wall-clock time around the
  ollama.generate call, the candidate count / top-N going in, and the
  final reordering (timing sketch below). The "final indices" +
  swap-count line is info level so it's always visible; detailed
  before/after previews stay at debug for when you want to inspect
  reranker quality.
- New OllamaClient::generate_no_think convenience that sets Ollama's
  top-level think:false on the request, plumbed through try_generate via
  a new internal generate_with_options (call-site sketch below). Used
  only by the reranker today; it avoids the chain-of-thought tax on
  reasoning models (Qwen3/VL, DeepSeek-R1 distills, GPT-OSS) when the
  task has nothing to reason about, and is a server-side no-op on
  non-reasoning models.
- OpenRouter chat_with_tools "missing choices[0]" error now includes the
  actual response body: it extracts the structured {error: {code,
  message}} object when OpenRouter surfaces one (common for
  upstream-provider issues like rate limits and content moderation), and
  otherwise falls back to a truncated raw-JSON view (extraction sketch
  below).
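
The timing side is just a wall-clock stopwatch around the generate call.
A minimal sketch; the function and variable names here stand in for the
real reranker state, while OllamaClient and the Result alias are the
ones from the diff below:

    use std::time::Instant;

    async fn time_rerank_call(
        client: &OllamaClient,
        rerank_prompt: &str,
        candidates: &[String],
        top_n: usize,
    ) -> Result<String> {
        // Going-in stats: candidate count and how many we intend to keep.
        log::info!(
            "rerank: {} candidates in, keeping top {}",
            candidates.len(),
            top_n
        );

        let started = Instant::now();
        let response = client.generate_no_think(rerank_prompt, None).await?;

        // Always-visible summary; per-candidate previews stay at debug.
        log::info!("rerank generate took {}ms", started.elapsed().as_millis());
        log::debug!("rerank raw response: {}", response);
        Ok(response)
    }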
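
For callers the new surface is a one-liner. An illustrative call site
(the prompt and system strings are made up); the only wire difference
from generate is a top-level "think": false in the request JSON:

    // Hypothetical helper showing the intended usage pattern.
    async fn rank_passages(client: &OllamaClient) -> Result<String> {
        client
            .generate_no_think(
                "Order these passages by relevance; output indices only.",
                Some("You are a reranker."),
            )
            .await
    }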
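
The OpenRouter error-detail extraction follows this shape. A sketch
only, not the exact helper in the commit: the function name and the
truncation length are illustrative, but the {error: {code, message}}
body shape is what OpenRouter returns for upstream failures:

    use serde_json::Value;

    fn describe_bad_response(body: &Value) -> String {
        // OpenRouter reports upstream-provider failures (rate limits,
        // content moderation, ...) as {"error": {"code", "message"}}
        // in place of a choices array.
        if let Some(err) = body.get("error") {
            let code = err
                .get("code")
                .map(|c| c.to_string())
                .unwrap_or_default();
            let message = err
                .get("message")
                .and_then(|m| m.as_str())
                .unwrap_or("unknown error");
            return format!("OpenRouter error {code}: {message}");
        }
        // No structured error: fall back to a truncated raw-JSON view.
        let raw = body.to_string();
        let preview: String = raw.chars().take(200).collect();
        format!("missing choices[0]; body: {preview}")
    }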

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

@@ -381,6 +381,7 @@ impl OllamaClient {
         prompt: &str,
         system: Option<&str>,
         images: Option<Vec<String>>,
+        think: Option<bool>,
     ) -> Result<String> {
         let request = OllamaRequest {
             model: model.to_string(),
@@ -389,6 +390,7 @@ impl OllamaClient {
             system: system.map(|s| s.to_string()),
             options: self.build_options(),
             images,
+            think,
         };
 
         let response = self
@@ -422,11 +424,31 @@ impl OllamaClient {
         self.generate_with_images(prompt, system, None).await
     }
 
+    /// Variant of `generate` that sets Ollama's top-level `think: false`.
+    /// Used by latency-sensitive callers like the rerank pass, where the
+    /// task has nothing to reason about and chain-of-thought tokens are
+    /// wasted wall time. Server-side no-op on non-reasoning models.
+    pub async fn generate_no_think(&self, prompt: &str, system: Option<&str>) -> Result<String> {
+        self.generate_with_options(prompt, system, None, Some(false))
+            .await
+    }
+
     pub async fn generate_with_images(
         &self,
         prompt: &str,
         system: Option<&str>,
         images: Option<Vec<String>>,
     ) -> Result<String> {
+        self.generate_with_options(prompt, system, images, None)
+            .await
+    }
+
+    async fn generate_with_options(
+        &self,
+        prompt: &str,
+        system: Option<&str>,
+        images: Option<Vec<String>>,
+        think: Option<bool>,
+    ) -> Result<String> {
         log::debug!("=== Ollama Request ===");
         log::debug!("Primary model: {}", self.primary_model);
@@ -452,6 +474,7 @@ impl OllamaClient {
                 prompt,
                 system,
                 images.clone(),
+                think,
             )
             .await;
 
@@ -475,7 +498,14 @@ impl OllamaClient {
                 fallback_model
             );
             match self
-                .try_generate(fallback_url, fallback_model, prompt, system, images.clone())
+                .try_generate(
+                    fallback_url,
+                    fallback_model,
+                    prompt,
+                    system,
+                    images.clone(),
+                    think,
+                )
                 .await
             {
                 Ok(response) => {
@@ -1134,6 +1164,12 @@ struct OllamaRequest {
     options: Option<OllamaOptions>,
     #[serde(skip_serializing_if = "Option::is_none")]
     images: Option<Vec<String>>,
+    /// Ollama's top-level reasoning-mode toggle (~0.4+). `Some(false)`
+    /// asks the server to skip thinking on models that expose a toggle
+    /// (Qwen3, Ollama-integrated DeepSeek-R1 distills, GPT-OSS, etc).
+    /// Ignored by non-reasoning models. None = use the model's default.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    think: Option<bool>,
 }
 
 #[derive(Serialize)]