AI: add enable_thinking reasoning toggle plumbed to llama.cpp
Adds an optional `enable_thinking` field to `SamplingOverrides`, forwarded to llama-server as `chat_template_kwargs.enable_thinking` (gates Qwen3-style reasoning blocks). `None` leaves the template default in place; other backends ignore the field. Wired through the agentic-insight and chat-turn request bodies/handlers. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -41,6 +41,10 @@ pub struct SamplingOverrides {
|
|||||||
pub top_p: Option<f32>,
|
pub top_p: Option<f32>,
|
||||||
pub top_k: Option<i32>,
|
pub top_k: Option<i32>,
|
||||||
pub min_p: Option<f32>,
|
pub min_p: Option<f32>,
|
||||||
|
/// Reasoning toggle. Only the llama.cpp backend honors it (forwarded as
|
||||||
|
/// `chat_template_kwargs.enable_thinking`); other backends ignore it.
|
||||||
|
/// `None` leaves the model/template default in place.
|
||||||
|
pub enable_thinking: Option<bool>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl SamplingOverrides {
|
impl SamplingOverrides {
|
||||||
@@ -124,6 +128,7 @@ mod tests {
|
|||||||
top_p: None,
|
top_p: None,
|
||||||
top_k: None,
|
top_k: None,
|
||||||
min_p: None,
|
min_p: None,
|
||||||
|
enable_thinking: None,
|
||||||
};
|
};
|
||||||
assert!(!empty.has_sampling());
|
assert!(!empty.has_sampling());
|
||||||
|
|
||||||
@@ -134,6 +139,7 @@ mod tests {
|
|||||||
top_p: None,
|
top_p: None,
|
||||||
top_k: None,
|
top_k: None,
|
||||||
min_p: None,
|
min_p: None,
|
||||||
|
enable_thinking: None,
|
||||||
};
|
};
|
||||||
assert!(with_temp.has_sampling());
|
assert!(with_temp.has_sampling());
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -40,6 +40,12 @@ pub struct GeneratePhotoInsightRequest {
|
|||||||
pub top_k: Option<i32>,
|
pub top_k: Option<i32>,
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub min_p: Option<f32>,
|
pub min_p: Option<f32>,
|
||||||
|
/// Reasoning toggle for thinking-capable models. Forwarded to the
|
||||||
|
/// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored
|
||||||
|
/// by other backends and the non-agentic (Ollama) path. Only the agentic
|
||||||
|
/// endpoint routes through llama.cpp. None defers to the template default.
|
||||||
|
#[serde(default)]
|
||||||
|
pub enable_thinking: Option<bool>,
|
||||||
/// `"local"` (default, Ollama with images) | `"hybrid"` (local vision +
|
/// `"local"` (default, Ollama with images) | `"hybrid"` (local vision +
|
||||||
/// OpenRouter chat). Only respected by the agentic endpoint.
|
/// OpenRouter chat). Only respected by the agentic endpoint.
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
@@ -868,6 +874,7 @@ pub async fn generate_agentic_insight_handler(
|
|||||||
request.top_p,
|
request.top_p,
|
||||||
request.top_k,
|
request.top_k,
|
||||||
request.min_p,
|
request.min_p,
|
||||||
|
request.enable_thinking,
|
||||||
max_iterations,
|
max_iterations,
|
||||||
request.backend.clone(),
|
request.backend.clone(),
|
||||||
fewshot_examples,
|
fewshot_examples,
|
||||||
@@ -1169,6 +1176,11 @@ pub struct ChatTurnHttpRequest {
|
|||||||
pub top_k: Option<i32>,
|
pub top_k: Option<i32>,
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub min_p: Option<f32>,
|
pub min_p: Option<f32>,
|
||||||
|
/// Reasoning toggle for thinking-capable models. Forwarded to the
|
||||||
|
/// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored
|
||||||
|
/// by other backends. None defers to the model/template default.
|
||||||
|
#[serde(default)]
|
||||||
|
pub enable_thinking: Option<bool>,
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub max_iterations: Option<usize>,
|
pub max_iterations: Option<usize>,
|
||||||
/// Per-turn system-prompt override. Ephemeral in append mode,
|
/// Per-turn system-prompt override. Ephemeral in append mode,
|
||||||
@@ -1247,6 +1259,7 @@ pub async fn chat_turn_handler(
|
|||||||
top_p: request.top_p,
|
top_p: request.top_p,
|
||||||
top_k: request.top_k,
|
top_k: request.top_k,
|
||||||
min_p: request.min_p,
|
min_p: request.min_p,
|
||||||
|
enable_thinking: request.enable_thinking,
|
||||||
max_iterations: request.max_iterations,
|
max_iterations: request.max_iterations,
|
||||||
system_prompt: request.system_prompt.clone(),
|
system_prompt: request.system_prompt.clone(),
|
||||||
persona_id: request.persona_id.clone(),
|
persona_id: request.persona_id.clone(),
|
||||||
@@ -1473,6 +1486,7 @@ pub async fn chat_stream_handler(
|
|||||||
top_p: request.top_p,
|
top_p: request.top_p,
|
||||||
top_k: request.top_k,
|
top_k: request.top_k,
|
||||||
min_p: request.min_p,
|
min_p: request.min_p,
|
||||||
|
enable_thinking: request.enable_thinking,
|
||||||
max_iterations: request.max_iterations,
|
max_iterations: request.max_iterations,
|
||||||
system_prompt: request.system_prompt.clone(),
|
system_prompt: request.system_prompt.clone(),
|
||||||
persona_id: request.persona_id.clone(),
|
persona_id: request.persona_id.clone(),
|
||||||
@@ -1618,6 +1632,7 @@ pub async fn turn_async_handler(
|
|||||||
top_p: request.top_p,
|
top_p: request.top_p,
|
||||||
top_k: request.top_k,
|
top_k: request.top_k,
|
||||||
min_p: request.min_p,
|
min_p: request.min_p,
|
||||||
|
enable_thinking: request.enable_thinking,
|
||||||
max_iterations: request.max_iterations,
|
max_iterations: request.max_iterations,
|
||||||
system_prompt: request.system_prompt.clone(),
|
system_prompt: request.system_prompt.clone(),
|
||||||
persona_id: request.persona_id.clone(),
|
persona_id: request.persona_id.clone(),
|
||||||
|
|||||||
@@ -70,6 +70,10 @@ pub struct ChatTurnRequest {
|
|||||||
pub top_p: Option<f32>,
|
pub top_p: Option<f32>,
|
||||||
pub top_k: Option<i32>,
|
pub top_k: Option<i32>,
|
||||||
pub min_p: Option<f32>,
|
pub min_p: Option<f32>,
|
||||||
|
/// Reasoning toggle for thinking-capable models. Forwarded to the
|
||||||
|
/// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored
|
||||||
|
/// by other backends. None defers to the model/template default.
|
||||||
|
pub enable_thinking: Option<bool>,
|
||||||
pub max_iterations: Option<usize>,
|
pub max_iterations: Option<usize>,
|
||||||
/// Per-turn system-prompt override. In append mode (default), applied
|
/// Per-turn system-prompt override. In append mode (default), applied
|
||||||
/// ephemerally — original system message restored before persistence.
|
/// ephemerally — original system message restored before persistence.
|
||||||
@@ -344,6 +348,7 @@ impl InsightChatService {
|
|||||||
top_p: req.top_p,
|
top_p: req.top_p,
|
||||||
top_k: req.top_k,
|
top_k: req.top_k,
|
||||||
min_p: req.min_p,
|
min_p: req.min_p,
|
||||||
|
enable_thinking: req.enable_thinking,
|
||||||
};
|
};
|
||||||
let backend = self.generator.resolve_backend(kind, &overrides).await?;
|
let backend = self.generator.resolve_backend(kind, &overrides).await?;
|
||||||
let model_used = backend.model().to_string();
|
let model_used = backend.model().to_string();
|
||||||
@@ -847,6 +852,7 @@ impl InsightChatService {
|
|||||||
top_p: req.top_p,
|
top_p: req.top_p,
|
||||||
top_k: req.top_k,
|
top_k: req.top_k,
|
||||||
min_p: req.min_p,
|
min_p: req.min_p,
|
||||||
|
enable_thinking: req.enable_thinking,
|
||||||
};
|
};
|
||||||
let backend = self.generator.resolve_backend(kind, &overrides).await?;
|
let backend = self.generator.resolve_backend(kind, &overrides).await?;
|
||||||
let model_used = backend.model().to_string();
|
let model_used = backend.model().to_string();
|
||||||
@@ -1017,6 +1023,7 @@ impl InsightChatService {
|
|||||||
top_p: req.top_p,
|
top_p: req.top_p,
|
||||||
top_k: req.top_k,
|
top_k: req.top_k,
|
||||||
min_p: req.min_p,
|
min_p: req.min_p,
|
||||||
|
enable_thinking: req.enable_thinking,
|
||||||
};
|
};
|
||||||
let backend = self.generator.resolve_backend(kind, &overrides).await?;
|
let backend = self.generator.resolve_backend(kind, &overrides).await?;
|
||||||
let model_used = backend.model().to_string();
|
let model_used = backend.model().to_string();
|
||||||
@@ -1425,6 +1432,7 @@ impl InsightChatService {
|
|||||||
top_p: req.top_p,
|
top_p: req.top_p,
|
||||||
top_k: req.top_k,
|
top_k: req.top_k,
|
||||||
min_p: req.min_p,
|
min_p: req.min_p,
|
||||||
|
enable_thinking: req.enable_thinking,
|
||||||
};
|
};
|
||||||
let backend = self.generator.resolve_backend(kind, &overrides).await?;
|
let backend = self.generator.resolve_backend(kind, &overrides).await?;
|
||||||
let model_used = backend.model().to_string();
|
let model_used = backend.model().to_string();
|
||||||
@@ -1607,6 +1615,7 @@ impl InsightChatService {
|
|||||||
top_p: req.top_p,
|
top_p: req.top_p,
|
||||||
top_k: req.top_k,
|
top_k: req.top_k,
|
||||||
min_p: req.min_p,
|
min_p: req.min_p,
|
||||||
|
enable_thinking: req.enable_thinking,
|
||||||
};
|
};
|
||||||
let backend = self.generator.resolve_backend(kind, &overrides).await?;
|
let backend = self.generator.resolve_backend(kind, &overrides).await?;
|
||||||
let model_used = backend.model().to_string();
|
let model_used = backend.model().to_string();
|
||||||
|
|||||||
@@ -3933,6 +3933,7 @@ Return ONLY the summary, nothing else."#,
|
|||||||
if let Some(ctx) = overrides.num_ctx {
|
if let Some(ctx) = overrides.num_ctx {
|
||||||
c.set_num_ctx(Some(ctx));
|
c.set_num_ctx(Some(ctx));
|
||||||
}
|
}
|
||||||
|
c.set_enable_thinking(overrides.enable_thinking);
|
||||||
Box::new(c)
|
Box::new(c)
|
||||||
} else {
|
} else {
|
||||||
// Pure Ollama local.
|
// Pure Ollama local.
|
||||||
@@ -4064,6 +4065,7 @@ Return ONLY the summary, nothing else."#,
|
|||||||
top_p: Option<f32>,
|
top_p: Option<f32>,
|
||||||
top_k: Option<i32>,
|
top_k: Option<i32>,
|
||||||
min_p: Option<f32>,
|
min_p: Option<f32>,
|
||||||
|
enable_thinking: Option<bool>,
|
||||||
max_iterations: usize,
|
max_iterations: usize,
|
||||||
backend: Option<String>,
|
backend: Option<String>,
|
||||||
fewshot_examples: Vec<Vec<ChatMessage>>,
|
fewshot_examples: Vec<Vec<ChatMessage>>,
|
||||||
@@ -4091,6 +4093,7 @@ Return ONLY the summary, nothing else."#,
|
|||||||
top_p,
|
top_p,
|
||||||
top_k,
|
top_k,
|
||||||
min_p,
|
min_p,
|
||||||
|
enable_thinking,
|
||||||
};
|
};
|
||||||
let backend = self.resolve_backend(kind, &overrides).await?;
|
let backend = self.resolve_backend(kind, &overrides).await?;
|
||||||
span.set_attribute(KeyValue::new("model", backend.model().to_string()));
|
span.set_attribute(KeyValue::new("model", backend.model().to_string()));
|
||||||
|
|||||||
@@ -64,6 +64,12 @@ pub struct LlamaCppClient {
|
|||||||
top_p: Option<f32>,
|
top_p: Option<f32>,
|
||||||
top_k: Option<i32>,
|
top_k: Option<i32>,
|
||||||
min_p: Option<f32>,
|
min_p: Option<f32>,
|
||||||
|
/// When `Some`, forwarded to llama-server as
|
||||||
|
/// `chat_template_kwargs: {"enable_thinking": <bool>}`. The Jinja chat
|
||||||
|
/// template (e.g. Qwen3) reads this to gate its reasoning block. `None`
|
||||||
|
/// omits the key entirely, leaving the template's own default. Templates
|
||||||
|
/// that don't reference the key ignore it, so sending it is harmless.
|
||||||
|
enable_thinking: Option<bool>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl LlamaCppClient {
|
impl LlamaCppClient {
|
||||||
@@ -89,6 +95,7 @@ impl LlamaCppClient {
|
|||||||
top_p: None,
|
top_p: None,
|
||||||
top_k: None,
|
top_k: None,
|
||||||
min_p: None,
|
min_p: None,
|
||||||
|
enable_thinking: None,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -104,6 +111,12 @@ impl LlamaCppClient {
|
|||||||
self.num_ctx = num_ctx;
|
self.num_ctx = num_ctx;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Set the reasoning toggle forwarded as `chat_template_kwargs.enable_thinking`.
|
||||||
|
/// `None` leaves the chat template's own default in place.
|
||||||
|
pub fn set_enable_thinking(&mut self, enable_thinking: Option<bool>) {
|
||||||
|
self.enable_thinking = enable_thinking;
|
||||||
|
}
|
||||||
|
|
||||||
pub fn set_sampling_params(
|
pub fn set_sampling_params(
|
||||||
&mut self,
|
&mut self,
|
||||||
temperature: Option<f32>,
|
temperature: Option<f32>,
|
||||||
@@ -458,6 +471,12 @@ impl LlamaCppClient {
|
|||||||
// via -c, so we silently drop the override here. The config.yaml
|
// via -c, so we silently drop the override here. The config.yaml
|
||||||
// entry is the source of truth for context size.
|
// entry is the source of truth for context size.
|
||||||
let _ = self.num_ctx;
|
let _ = self.num_ctx;
|
||||||
|
// Reasoning toggle for thinking-capable templates (Qwen3 et al.).
|
||||||
|
// llama-server forwards chat_template_kwargs into the Jinja render
|
||||||
|
// (requires --jinja); templates that ignore the key are unaffected.
|
||||||
|
if let Some(think) = self.enable_thinking {
|
||||||
|
v.push(("chat_template_kwargs", json!({ "enable_thinking": think })));
|
||||||
|
}
|
||||||
v
|
v
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -336,6 +336,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
args.top_p,
|
args.top_p,
|
||||||
args.top_k,
|
args.top_k,
|
||||||
args.min_p,
|
args.min_p,
|
||||||
|
None, // enable_thinking: leave model/template default
|
||||||
args.max_iterations,
|
args.max_iterations,
|
||||||
None,
|
None,
|
||||||
Vec::new(),
|
Vec::new(),
|
||||||
|
|||||||
@@ -309,6 +309,7 @@ pub async fn generate_script_agentic(
|
|||||||
top_p: None,
|
top_p: None,
|
||||||
top_k: None,
|
top_k: None,
|
||||||
min_p: None,
|
min_p: None,
|
||||||
|
enable_thinking: None,
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
|
|||||||
@@ -193,6 +193,7 @@ pub async fn unified_search<TagD: TagDao>(
|
|||||||
top_p: None,
|
top_p: None,
|
||||||
top_k: None,
|
top_k: None,
|
||||||
min_p: None,
|
min_p: None,
|
||||||
|
enable_thinking: None,
|
||||||
};
|
};
|
||||||
let backend = match state
|
let backend = match state
|
||||||
.insight_generator
|
.insight_generator
|
||||||
|
|||||||
Reference in New Issue
Block a user