feat(ai): rerank timing + think:false + OpenRouter error detail
- search_rag reranker now logs wall-clock time around the ollama.generate
call, the candidate count / top-N going in, and the final reordering.
The "final indices" + swap-count line is info level so it's always
visible; detailed before/after previews stay at debug for when you want
to inspect reranker quality.
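A rough sketch of the shape of that timing code (not from this commit's
diff: the reranker itself isn't shown, so `rerank`, `parse_indices`, and
the parameter names are hypothetical, and `Result` is assumed to be the
crate's existing alias):

    use std::time::Instant;

    async fn rerank(client: &OllamaClient, prompt: &str, n_in: usize, top_n: usize) -> Result<Vec<usize>> {
        log::debug!("rerank: {} candidates in, top-{} requested", n_in, top_n);
        let started = Instant::now();
        let response = client.generate_no_think(prompt, None).await?;
        let indices = parse_indices(&response, top_n)?; // hypothetical parser
        // Count positions that moved so the always-visible info line shows
        // how much the model actually reordered the incoming ranking.
        let swaps = indices.iter().enumerate().filter(|&(i, &ix)| i != ix).count();
        log::info!(
            "rerank: {:.2}s, final indices {:?} ({} swaps)",
            started.elapsed().as_secs_f64(),
            indices,
            swaps
        );
        Ok(indices)
    }
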
- New OllamaClient::generate_no_think convenience that sets Ollama's
top-level think:false on the request, plumbed through try_generate via
a new internal generate_with_options. Used only by the reranker today;
avoids the chain-of-thought tax on reasoning models (Qwen3/VL,
DeepSeek-R1 distills, GPT-OSS) when the task has nothing to reason
about. Server-side no-op on non-reasoning models.
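Call-site sketch (prompt text hypothetical): because `think` carries
skip_serializing_if on OllamaRequest, callers passing None serialize
exactly as before, while this path adds "think": false to the JSON:

    let ranking = client
        .generate_no_think("Order these passages by relevance: ...", None)
        .await?;
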
- OpenRouter chat_with_tools "missing choices[0]" error now includes the
actual response body — extracts structured {error: {code, message}}
when OpenRouter surfaces it (common for upstream-provider issues like
rate limits and content moderation), otherwise falls back to a
truncated raw-JSON view.
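Sketch of that fallback logic (the {error: {code, message}} shape is the
one described above; the helper name and the 300-char cap are
illustrative, not the committed code):

    fn describe_missing_choices(body: &serde_json::Value) -> String {
        match body.get("error") {
            // Structured upstream error: surface code + message directly.
            Some(err) => format!(
                "missing choices[0]; upstream error {}: {}",
                err.get("code").map(|c| c.to_string()).unwrap_or_else(|| "?".into()),
                err.get("message").and_then(|m| m.as_str()).unwrap_or("")
            ),
            // No structured error: fall back to a truncated raw-JSON view.
            None => {
                let raw = body.to_string();
                let cut = raw.char_indices().nth(300).map_or(raw.len(), |(i, _)| i);
                format!("missing choices[0]; body: {}", &raw[..cut])
            }
        }
    }
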
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
@@ -381,6 +381,7 @@ impl OllamaClient {
         prompt: &str,
         system: Option<&str>,
         images: Option<Vec<String>>,
+        think: Option<bool>,
     ) -> Result<String> {
         let request = OllamaRequest {
             model: model.to_string(),
@@ -389,6 +390,7 @@ impl OllamaClient {
             system: system.map(|s| s.to_string()),
             options: self.build_options(),
             images,
+            think,
         };

         let response = self
@@ -422,11 +424,31 @@ impl OllamaClient {
         self.generate_with_images(prompt, system, None).await
     }

+    /// Variant of `generate` that sets Ollama's top-level `think: false`.
+    /// Used by latency-sensitive callers like the rerank pass, where the
+    /// task has nothing to reason about and chain-of-thought tokens are
+    /// wasted wall time. Server-side no-op on non-reasoning models.
+    pub async fn generate_no_think(&self, prompt: &str, system: Option<&str>) -> Result<String> {
+        self.generate_with_options(prompt, system, None, Some(false))
+            .await
+    }
+
     pub async fn generate_with_images(
         &self,
         prompt: &str,
         system: Option<&str>,
         images: Option<Vec<String>>,
     ) -> Result<String> {
+        self.generate_with_options(prompt, system, images, None)
+            .await
+    }
+
+    async fn generate_with_options(
+        &self,
+        prompt: &str,
+        system: Option<&str>,
+        images: Option<Vec<String>>,
+        think: Option<bool>,
+    ) -> Result<String> {
         log::debug!("=== Ollama Request ===");
         log::debug!("Primary model: {}", self.primary_model);
@@ -452,6 +474,7 @@ impl OllamaClient {
                 prompt,
                 system,
                 images.clone(),
+                think,
             )
             .await;

@@ -475,7 +498,14 @@ impl OllamaClient {
                 fallback_model
             );
             match self
-                .try_generate(fallback_url, fallback_model, prompt, system, images.clone())
+                .try_generate(
+                    fallback_url,
+                    fallback_model,
+                    prompt,
+                    system,
+                    images.clone(),
+                    think,
+                )
                 .await
             {
                 Ok(response) => {
@@ -1134,6 +1164,12 @@ struct OllamaRequest {
     options: Option<OllamaOptions>,
     #[serde(skip_serializing_if = "Option::is_none")]
     images: Option<Vec<String>>,
+    /// Ollama's top-level reasoning-mode toggle (~0.4+). `Some(false)`
+    /// asks the server to skip thinking on models that expose a toggle
+    /// (Qwen3, Ollama-integrated DeepSeek-R1 distills, GPT-OSS, etc).
+    /// Ignored by non-reasoning models. None = use the model's default.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    think: Option<bool>,
 }

 #[derive(Serialize)]