AI: add enable_thinking reasoning toggle plumbed to llama.cpp
New optional SamplingOverride, `enable_thinking`, is forwarded to llama-server as `chat_template_kwargs.enable_thinking` (it gates Qwen3-style reasoning blocks). `None` leaves the template default in place; other backends ignore the field. It is wired through the agentic-insight and chat-turn request bodies and handlers.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -40,6 +40,12 @@ pub struct GeneratePhotoInsightRequest {
|
||||
pub top_k: Option<i32>,
|
||||
#[serde(default)]
|
||||
pub min_p: Option<f32>,
|
||||
/// Reasoning toggle for thinking-capable models. Forwarded to the
|
||||
/// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored
|
||||
/// by other backends and the non-agentic (Ollama) path. Only the agentic
|
||||
/// endpoint routes through llama.cpp. None defers to the template default.
|
||||
#[serde(default)]
|
||||
pub enable_thinking: Option<bool>,
|
||||
/// `"local"` (default, Ollama with images) | `"hybrid"` (local vision +
|
||||
/// OpenRouter chat). Only respected by the agentic endpoint.
|
||||
#[serde(default)]
|
||||
@@ -868,6 +874,7 @@ pub async fn generate_agentic_insight_handler(
|
||||
request.top_p,
|
||||
request.top_k,
|
||||
request.min_p,
|
||||
request.enable_thinking,
|
||||
max_iterations,
|
||||
request.backend.clone(),
|
||||
fewshot_examples,
|
||||
@@ -1169,6 +1176,11 @@ pub struct ChatTurnHttpRequest {
|
||||
pub top_k: Option<i32>,
|
||||
#[serde(default)]
|
||||
pub min_p: Option<f32>,
|
||||
/// Reasoning toggle for thinking-capable models. Forwarded to the
|
||||
/// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored
|
||||
/// by other backends. None defers to the model/template default.
|
||||
#[serde(default)]
|
||||
pub enable_thinking: Option<bool>,
|
||||
#[serde(default)]
|
||||
pub max_iterations: Option<usize>,
|
||||
/// Per-turn system-prompt override. Ephemeral in append mode,
|
||||
@@ -1247,6 +1259,7 @@ pub async fn chat_turn_handler(
|
||||
top_p: request.top_p,
|
||||
top_k: request.top_k,
|
||||
min_p: request.min_p,
|
||||
enable_thinking: request.enable_thinking,
|
||||
max_iterations: request.max_iterations,
|
||||
system_prompt: request.system_prompt.clone(),
|
||||
persona_id: request.persona_id.clone(),
|
||||
@@ -1473,6 +1486,7 @@ pub async fn chat_stream_handler(
|
||||
top_p: request.top_p,
|
||||
top_k: request.top_k,
|
||||
min_p: request.min_p,
|
||||
enable_thinking: request.enable_thinking,
|
||||
max_iterations: request.max_iterations,
|
||||
system_prompt: request.system_prompt.clone(),
|
||||
persona_id: request.persona_id.clone(),
|
||||
@@ -1618,6 +1632,7 @@ pub async fn turn_async_handler(
|
||||
top_p: request.top_p,
|
||||
top_k: request.top_k,
|
||||
min_p: request.min_p,
|
||||
enable_thinking: request.enable_thinking,
|
||||
max_iterations: request.max_iterations,
|
||||
system_prompt: request.system_prompt.clone(),
|
||||
persona_id: request.persona_id.clone(),
|
||||
|
||||
Reference in New Issue
Block a user