AI: add enable_thinking reasoning toggle plumbed to llama.cpp

New optional SamplingOverride, `enable_thinking`, is forwarded to llama-server as `chat_template_kwargs.enable_thinking` (gates Qwen3-style reasoning blocks). `None` leaves the template default in place; other backends ignore it. Wired through the agentic-insight and chat-turn request bodies and handlers.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-17 18:14:44 -04:00
parent f2ab8d3740
commit 48a1b753f0
8 changed files with 55 additions and 0 deletions
@@ -40,6 +40,12 @@ pub struct GeneratePhotoInsightRequest {
    pub top_k: Option<i32>,
    #[serde(default)]
    pub min_p: Option<f32>,
+    /// Reasoning toggle for thinking-capable models. Forwarded to the
+    /// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored
+    /// by other backends and the non-agentic (Ollama) path. Only the agentic
+    /// endpoint routes through llama.cpp. None defers to the template default.
+    #[serde(default)]
+    pub enable_thinking: Option<bool>,
    /// `"local"` (default, Ollama with images) | `"hybrid"` (local vision +
    /// OpenRouter chat). Only respected by the agentic endpoint.
    #[serde(default)]
@@ -868,6 +874,7 @@ pub async fn generate_agentic_insight_handler(
                    request.top_p,
                    request.top_k,
                    request.min_p,
+                    request.enable_thinking,
                    max_iterations,
                    request.backend.clone(),
                    fewshot_examples,
@@ -1169,6 +1176,11 @@ pub struct ChatTurnHttpRequest {
    pub top_k: Option<i32>,
    #[serde(default)]
    pub min_p: Option<f32>,
+    /// Reasoning toggle for thinking-capable models. Forwarded to the
+    /// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored
+    /// by other backends. None defers to the model/template default.
+    #[serde(default)]
+    pub enable_thinking: Option<bool>,
    #[serde(default)]
    pub max_iterations: Option<usize>,
    /// Per-turn system-prompt override. Ephemeral in append mode,
@@ -1247,6 +1259,7 @@ pub async fn chat_turn_handler(
        top_p: request.top_p,
        top_k: request.top_k,
        min_p: request.min_p,
+        enable_thinking: request.enable_thinking,
        max_iterations: request.max_iterations,
        system_prompt: request.system_prompt.clone(),
        persona_id: request.persona_id.clone(),
@@ -1473,6 +1486,7 @@ pub async fn chat_stream_handler(
        top_p: request.top_p,
        top_k: request.top_k,
        min_p: request.min_p,
+        enable_thinking: request.enable_thinking,
        max_iterations: request.max_iterations,
        system_prompt: request.system_prompt.clone(),
        persona_id: request.persona_id.clone(),
@@ -1618,6 +1632,7 @@ pub async fn turn_async_handler(
        top_p: request.top_p,
        top_k: request.top_k,
        min_p: request.min_p,
+        enable_thinking: request.enable_thinking,
        max_iterations: request.max_iterations,
        system_prompt: request.system_prompt.clone(),
        persona_id: request.persona_id.clone(),