feat(ai): rerank timing + think:false + OpenRouter error detail
- search_rag reranker now logs wall-clock time around the ollama.generate
call, the candidate count / top-N going in, and the final reordering.
The "final indices" + swap-count line is info level so it's always
visible; detailed before/after previews stay at debug for when you want
to inspect reranker quality.
- New OllamaClient::generate_no_think convenience that sets Ollama's
top-level think:false on the request, plumbed through try_generate via
a new internal generate_with_options. Used only by the reranker today;
avoids the chain-of-thought tax on reasoning models (Qwen3/VL,
DeepSeek-R1 distills, GPT-OSS) when the task has nothing to reason
about. Server-side no-op on non-reasoning models.
- OpenRouter chat_with_tools "missing choices[0]" error now includes the
actual response body — extracts structured {error: {code, message}}
when OpenRouter surfaces it (common for upstream-provider issues like
rate limits and content moderation), otherwise falls back to a
truncated raw-JSON view.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -1547,6 +1547,14 @@ Return ONLY the summary, nothing else."#,
|
|||||||
limit: usize,
|
limit: usize,
|
||||||
ollama: &OllamaClient,
|
ollama: &OllamaClient,
|
||||||
) -> Result<Vec<String>> {
|
) -> Result<Vec<String>> {
|
||||||
|
let query_preview: String = query.chars().take(60).collect();
|
||||||
|
log::info!(
|
||||||
|
"rerank: {} candidates -> top {} (query=\"{}\")",
|
||||||
|
candidates.len(),
|
||||||
|
limit,
|
||||||
|
query_preview
|
||||||
|
);
|
||||||
|
|
||||||
// Build numbered list (1-based for readability). Cap each passage
|
// Build numbered list (1-based for readability). Cap each passage
|
||||||
// at ~1000 chars so very long summaries don't eat the prompt.
|
// at ~1000 chars so very long summaries don't eat the prompt.
|
||||||
let numbered: String = candidates
|
let numbered: String = candidates
|
||||||
@@ -1574,14 +1582,20 @@ Return ONLY the summary, nothing else."#,
|
|||||||
limit, query, numbered, limit
|
limit, query, numbered, limit
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let started = std::time::Instant::now();
|
||||||
let response = ollama
|
let response = ollama
|
||||||
.generate(
|
.generate_no_think(
|
||||||
&prompt,
|
&prompt,
|
||||||
Some(
|
Some(
|
||||||
"You are a terse relevance ranker. You output only numbers separated by commas.",
|
"You are a terse relevance ranker. You output only numbers separated by commas.",
|
||||||
),
|
),
|
||||||
)
|
)
|
||||||
.await?;
|
.await?;
|
||||||
|
log::info!(
|
||||||
|
"rerank: finished in {} ms (prompt={} chars)",
|
||||||
|
started.elapsed().as_millis(),
|
||||||
|
prompt.len()
|
||||||
|
);
|
||||||
|
|
||||||
// Extract indices from the response. Accept "3, 1, 7" and also
|
// Extract indices from the response. Accept "3, 1, 7" and also
|
||||||
// tolerate "[3, 1, 7]" or "3,1,7,..." with trailing junk.
|
// tolerate "[3, 1, 7]" or "3,1,7,..." with trailing junk.
|
||||||
@@ -1600,9 +1614,11 @@ Return ONLY the summary, nothing else."#,
|
|||||||
|
|
||||||
let mut seen = std::collections::HashSet::new();
|
let mut seen = std::collections::HashSet::new();
|
||||||
let mut reordered: Vec<String> = Vec::with_capacity(limit);
|
let mut reordered: Vec<String> = Vec::with_capacity(limit);
|
||||||
|
let mut final_indices: Vec<usize> = Vec::with_capacity(limit);
|
||||||
for n in picks {
|
for n in picks {
|
||||||
if seen.insert(n) {
|
if seen.insert(n) {
|
||||||
reordered.push(candidates[n - 1].clone());
|
reordered.push(candidates[n - 1].clone());
|
||||||
|
final_indices.push(n);
|
||||||
if reordered.len() >= limit {
|
if reordered.len() >= limit {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
@@ -1614,12 +1630,40 @@ Return ONLY the summary, nothing else."#,
|
|||||||
for (i, c) in candidates.iter().enumerate() {
|
for (i, c) in candidates.iter().enumerate() {
|
||||||
if !seen.contains(&(i + 1)) {
|
if !seen.contains(&(i + 1)) {
|
||||||
reordered.push(c.clone());
|
reordered.push(c.clone());
|
||||||
|
final_indices.push(i + 1);
|
||||||
if reordered.len() >= limit {
|
if reordered.len() >= limit {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Debug snapshot: show what the reranker changed. Position p holds
|
||||||
|
// the 1-based index of the candidate that now sits at position p.
|
||||||
|
// A value that equals its position means "no change at that slot".
|
||||||
|
let swapped = final_indices
|
||||||
|
.iter()
|
||||||
|
.enumerate()
|
||||||
|
.filter(|(pos, idx)| **idx != pos + 1)
|
||||||
|
.count();
|
||||||
|
log::info!(
|
||||||
|
"rerank: final indices (1-based): {:?} — {} of top {} swapped from vector order",
|
||||||
|
final_indices,
|
||||||
|
swapped,
|
||||||
|
final_indices.len()
|
||||||
|
);
|
||||||
|
let show = final_indices.len().min(5);
|
||||||
|
log::debug!("rerank: vector-order top {}:", show);
|
||||||
|
for (i, c) in candidates.iter().enumerate().take(show) {
|
||||||
|
let preview: String = c.chars().take(100).collect();
|
||||||
|
log::debug!("rerank: [{}] {}", i + 1, preview);
|
||||||
|
}
|
||||||
|
log::debug!("rerank: reranked top {}:", show);
|
||||||
|
for (pos, idx) in final_indices.iter().enumerate().take(show) {
|
||||||
|
let preview: String = candidates[*idx - 1].chars().take(100).collect();
|
||||||
|
log::debug!("rerank: [{}] (orig #{}) {}", pos + 1, idx, preview);
|
||||||
|
}
|
||||||
|
|
||||||
Ok(reordered)
|
Ok(reordered)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -381,6 +381,7 @@ impl OllamaClient {
|
|||||||
prompt: &str,
|
prompt: &str,
|
||||||
system: Option<&str>,
|
system: Option<&str>,
|
||||||
images: Option<Vec<String>>,
|
images: Option<Vec<String>>,
|
||||||
|
think: Option<bool>,
|
||||||
) -> Result<String> {
|
) -> Result<String> {
|
||||||
let request = OllamaRequest {
|
let request = OllamaRequest {
|
||||||
model: model.to_string(),
|
model: model.to_string(),
|
||||||
@@ -389,6 +390,7 @@ impl OllamaClient {
|
|||||||
system: system.map(|s| s.to_string()),
|
system: system.map(|s| s.to_string()),
|
||||||
options: self.build_options(),
|
options: self.build_options(),
|
||||||
images,
|
images,
|
||||||
|
think,
|
||||||
};
|
};
|
||||||
|
|
||||||
let response = self
|
let response = self
|
||||||
@@ -422,11 +424,31 @@ impl OllamaClient {
|
|||||||
self.generate_with_images(prompt, system, None).await
|
self.generate_with_images(prompt, system, None).await
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Variant of `generate` that sets Ollama's top-level `think: false`.
|
||||||
|
/// Used by latency-sensitive callers like the rerank pass, where the
|
||||||
|
/// task has nothing to reason about and chain-of-thought tokens are
|
||||||
|
/// wasted wall time. Server-side no-op on non-reasoning models.
|
||||||
|
pub async fn generate_no_think(&self, prompt: &str, system: Option<&str>) -> Result<String> {
|
||||||
|
self.generate_with_options(prompt, system, None, Some(false))
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
pub async fn generate_with_images(
|
pub async fn generate_with_images(
|
||||||
&self,
|
&self,
|
||||||
prompt: &str,
|
prompt: &str,
|
||||||
system: Option<&str>,
|
system: Option<&str>,
|
||||||
images: Option<Vec<String>>,
|
images: Option<Vec<String>>,
|
||||||
|
) -> Result<String> {
|
||||||
|
self.generate_with_options(prompt, system, images, None)
|
||||||
|
.await
|
||||||
|
}
|
||||||
|
|
||||||
|
async fn generate_with_options(
|
||||||
|
&self,
|
||||||
|
prompt: &str,
|
||||||
|
system: Option<&str>,
|
||||||
|
images: Option<Vec<String>>,
|
||||||
|
think: Option<bool>,
|
||||||
) -> Result<String> {
|
) -> Result<String> {
|
||||||
log::debug!("=== Ollama Request ===");
|
log::debug!("=== Ollama Request ===");
|
||||||
log::debug!("Primary model: {}", self.primary_model);
|
log::debug!("Primary model: {}", self.primary_model);
|
||||||
@@ -452,6 +474,7 @@ impl OllamaClient {
|
|||||||
prompt,
|
prompt,
|
||||||
system,
|
system,
|
||||||
images.clone(),
|
images.clone(),
|
||||||
|
think,
|
||||||
)
|
)
|
||||||
.await;
|
.await;
|
||||||
|
|
||||||
@@ -475,7 +498,14 @@ impl OllamaClient {
|
|||||||
fallback_model
|
fallback_model
|
||||||
);
|
);
|
||||||
match self
|
match self
|
||||||
.try_generate(fallback_url, fallback_model, prompt, system, images.clone())
|
.try_generate(
|
||||||
|
fallback_url,
|
||||||
|
fallback_model,
|
||||||
|
prompt,
|
||||||
|
system,
|
||||||
|
images.clone(),
|
||||||
|
think,
|
||||||
|
)
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
Ok(response) => {
|
Ok(response) => {
|
||||||
@@ -1134,6 +1164,12 @@ struct OllamaRequest {
|
|||||||
options: Option<OllamaOptions>,
|
options: Option<OllamaOptions>,
|
||||||
#[serde(skip_serializing_if = "Option::is_none")]
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
images: Option<Vec<String>>,
|
images: Option<Vec<String>>,
|
||||||
|
/// Ollama's top-level reasoning-mode toggle (~0.4+). `Some(false)`
|
||||||
|
/// asks the server to skip thinking on models that expose a toggle
|
||||||
|
/// (Qwen3, Ollama-integrated DeepSeek-R1 distills, GPT-OSS, etc).
|
||||||
|
/// Ignored by non-reasoning models. None = use the model's default.
|
||||||
|
#[serde(skip_serializing_if = "Option::is_none")]
|
||||||
|
think: Option<bool>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Serialize)]
|
#[derive(Serialize)]
|
||||||
|
|||||||
@@ -360,10 +360,18 @@ impl LlmClient for OpenRouterClient {
|
|||||||
.get("choices")
|
.get("choices")
|
||||||
.and_then(|v| v.as_array())
|
.and_then(|v| v.as_array())
|
||||||
.and_then(|a| a.first())
|
.and_then(|a| a.first())
|
||||||
.ok_or_else(|| anyhow!("response missing choices[0]"))?;
|
.ok_or_else(|| {
|
||||||
let msg = choice
|
anyhow!(
|
||||||
.get("message")
|
"response missing choices[0]: {}",
|
||||||
.ok_or_else(|| anyhow!("choices[0] missing message"))?;
|
extract_openrouter_error_detail(&parsed)
|
||||||
|
)
|
||||||
|
})?;
|
||||||
|
let msg = choice.get("message").ok_or_else(|| {
|
||||||
|
anyhow!(
|
||||||
|
"choices[0] missing message: {}",
|
||||||
|
extract_openrouter_error_detail(&parsed)
|
||||||
|
)
|
||||||
|
})?;
|
||||||
let chat_msg = Self::openai_message_to_chat(msg)?;
|
let chat_msg = Self::openai_message_to_chat(msg)?;
|
||||||
|
|
||||||
let usage = parsed.get("usage");
|
let usage = parsed.get("usage");
|
||||||
@@ -687,6 +695,33 @@ impl LlmClient for OpenRouterClient {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Extract a diagnostic fragment from an OpenRouter response body that
|
||||||
|
/// doesn't match the expected `{choices: [...]}` shape. OpenRouter will
|
||||||
|
/// sometimes return 200 OK with `{"error": {"message": "...", "code": ...}}`
|
||||||
|
/// when the upstream provider (Anthropic/OpenAI/Google/etc) errored out
|
||||||
|
/// — rate limits, content moderation, model overload, provider timeout.
|
||||||
|
/// Surface the structured error if present; otherwise fall back to a
|
||||||
|
/// truncated raw-JSON view so the log line is actionable.
|
||||||
|
fn extract_openrouter_error_detail(parsed: &Value) -> String {
|
||||||
|
if let Some(err) = parsed.get("error") {
|
||||||
|
let message = err
|
||||||
|
.get("message")
|
||||||
|
.and_then(|v| v.as_str())
|
||||||
|
.unwrap_or("(no message)");
|
||||||
|
let code = err
|
||||||
|
.get("code")
|
||||||
|
.map(|v| match v {
|
||||||
|
Value::String(s) => s.clone(),
|
||||||
|
other => other.to_string(),
|
||||||
|
})
|
||||||
|
.unwrap_or_else(|| "?".to_string());
|
||||||
|
let short_message: String = message.chars().take(240).collect();
|
||||||
|
return format!("error code={} message=\"{}\"", code, short_message);
|
||||||
|
}
|
||||||
|
let raw = parsed.to_string();
|
||||||
|
raw.chars().take(300).collect()
|
||||||
|
}
|
||||||
|
|
||||||
/// Find the byte offset of the first `\n\n` (end of an SSE frame) in `buf`.
|
/// Find the byte offset of the first `\n\n` (end of an SSE frame) in `buf`.
|
||||||
/// Returns the index of the first `\n` of the pair, so the full separator is
|
/// Returns the index of the first `\n` of the pair, so the full separator is
|
||||||
/// `buf[idx..=idx+1]`. Also handles `\r\n\r\n` since some servers emit it.
|
/// `buf[idx..=idx+1]`. Also handles `\r\n\r\n` since some servers emit it.
|
||||||
|
|||||||
Reference in New Issue
Block a user