AI: add enable_thinking reasoning toggle plumbed to llama.cpp

New optional SamplingOverride forwarded to llama-server as
chat_template_kwargs.enable_thinking (gates Qwen3-style reasoning
blocks). None leaves the template default; other backends ignore it.
Wired through the agentic-insight and chat-turn request bodies/handlers.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Cameron Cordes
2026-06-17 18:14:44 -04:00
parent f2ab8d3740
commit 48a1b753f0
8 changed files with 55 additions and 0 deletions
+15
View File
@@ -40,6 +40,12 @@ pub struct GeneratePhotoInsightRequest {
pub top_k: Option<i32>,
#[serde(default)]
pub min_p: Option<f32>,
/// Reasoning toggle for thinking-capable models. Forwarded to the
/// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored
/// by other backends and the non-agentic (Ollama) path. Only the agentic
/// endpoint routes through llama.cpp. `None` defers to the template default.
#[serde(default)]
pub enable_thinking: Option<bool>,
/// `"local"` (default, Ollama with images) | `"hybrid"` (local vision +
/// OpenRouter chat). Only respected by the agentic endpoint.
#[serde(default)]
@@ -868,6 +874,7 @@ pub async fn generate_agentic_insight_handler(
request.top_p,
request.top_k,
request.min_p,
request.enable_thinking,
max_iterations,
request.backend.clone(),
fewshot_examples,
@@ -1169,6 +1176,11 @@ pub struct ChatTurnHttpRequest {
pub top_k: Option<i32>,
#[serde(default)]
pub min_p: Option<f32>,
/// Reasoning toggle for thinking-capable models. Forwarded to the
/// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored
/// by other backends. `None` defers to the model/template default.
#[serde(default)]
pub enable_thinking: Option<bool>,
#[serde(default)]
pub max_iterations: Option<usize>,
/// Per-turn system-prompt override. Ephemeral in append mode,
@@ -1247,6 +1259,7 @@ pub async fn chat_turn_handler(
top_p: request.top_p,
top_k: request.top_k,
min_p: request.min_p,
enable_thinking: request.enable_thinking,
max_iterations: request.max_iterations,
system_prompt: request.system_prompt.clone(),
persona_id: request.persona_id.clone(),
@@ -1473,6 +1486,7 @@ pub async fn chat_stream_handler(
top_p: request.top_p,
top_k: request.top_k,
min_p: request.min_p,
enable_thinking: request.enable_thinking,
max_iterations: request.max_iterations,
system_prompt: request.system_prompt.clone(),
persona_id: request.persona_id.clone(),
@@ -1618,6 +1632,7 @@ pub async fn turn_async_handler(
top_p: request.top_p,
top_k: request.top_k,
min_p: request.min_p,
enable_thinking: request.enable_thinking,
max_iterations: request.max_iterations,
system_prompt: request.system_prompt.clone(),
persona_id: request.persona_id.clone(),