From 48a1b753f0ec706c6a388a91aeddbec583723f19 Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Wed, 17 Jun 2026 18:14:44 -0400
Subject: [PATCH] AI: add enable_thinking reasoning toggle plumbed to llama.cpp

Add an optional enable_thinking field to SamplingOverrides. The
llama.cpp backend forwards it to llama-server as
chat_template_kwargs.enable_thinking, which gates Qwen3-style reasoning
blocks (llama-server must run with --jinja). None omits the key, leaving
the template default; other backends ignore it. Wired through the
agentic-insight and chat-turn request bodies/handlers.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/ai/backend.rs             |  6 ++++++
 src/ai/handlers.rs            | 15 +++++++++++++++
 src/ai/insight_chat.rs        |  9 +++++++++
 src/ai/insight_generator.rs   |  3 +++
 src/ai/llamacpp.rs            | 19 +++++++++++++++++++
 src/bin/populate_knowledge.rs |  1 +
 src/reels/script.rs           |  1 +
 src/unified_search.rs         |  1 +
 8 files changed, 55 insertions(+)
diff --git a/src/ai/backend.rs b/src/ai/backend.rs
index 0515f1c..dfcdd03 100644
--- a/src/ai/backend.rs
+++ b/src/ai/backend.rs
@@ -41,6 +41,10 @@ pub struct SamplingOverrides {
     pub top_p: Option<f32>,
     pub top_k: Option<i32>,
     pub min_p: Option<f32>,
+    /// Reasoning toggle. Only the llama.cpp backend honors it (forwarded as
+    /// `chat_template_kwargs.enable_thinking`); other backends ignore it.
+    /// `None` leaves the model/template default in place.
+    pub enable_thinking: Option<bool>,
 }
 
 impl SamplingOverrides {
@@ -124,6 +128,7 @@ mod tests {
             top_p: None,
             top_k: None,
             min_p: None,
+            enable_thinking: None,
         };
         assert!(!empty.has_sampling());
 
@@ -134,6 +139,7 @@ mod tests {
             top_p: None,
             top_k: None,
             min_p: None,
+            enable_thinking: None,
         };
         assert!(with_temp.has_sampling());
     }
diff --git a/src/ai/handlers.rs b/src/ai/handlers.rs
index c6bc212..ae9f300 100644
--- a/src/ai/handlers.rs
+++ b/src/ai/handlers.rs
@@ -40,6 +40,12 @@ pub struct GeneratePhotoInsightRequest {
     pub top_k: Option<i32>,
     #[serde(default)]
     pub min_p: Option<f32>,
+    /// Reasoning toggle for thinking-capable models. Forwarded to the
+    /// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored
+    /// by other backends and the non-agentic (Ollama) path. Only the agentic
+    /// endpoint routes through llama.cpp. None defers to the template default.
+    #[serde(default)]
+    pub enable_thinking: Option<bool>,
     /// `"local"` (default, Ollama with images) | `"hybrid"` (local vision +
     /// OpenRouter chat). Only respected by the agentic endpoint.
     #[serde(default)]
@@ -868,6 +874,7 @@ pub async fn generate_agentic_insight_handler(
                     request.top_p,
                     request.top_k,
                     request.min_p,
+                    request.enable_thinking,
                     max_iterations,
                     request.backend.clone(),
                     fewshot_examples,
@@ -1169,6 +1176,11 @@ pub struct ChatTurnHttpRequest {
     pub top_k: Option<i32>,
     #[serde(default)]
     pub min_p: Option<f32>,
+    /// Reasoning toggle for thinking-capable models. Forwarded to the
+    /// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored
+    /// by other backends. None defers to the model/template default.
+    #[serde(default)]
+    pub enable_thinking: Option<bool>,
     #[serde(default)]
     pub max_iterations: Option<usize>,
     /// Per-turn system-prompt override. Ephemeral in append mode,
@@ -1247,6 +1259,7 @@ pub async fn chat_turn_handler(
         top_p: request.top_p,
         top_k: request.top_k,
         min_p: request.min_p,
+        enable_thinking: request.enable_thinking,
         max_iterations: request.max_iterations,
         system_prompt: request.system_prompt.clone(),
         persona_id: request.persona_id.clone(),
@@ -1473,6 +1486,7 @@ pub async fn chat_stream_handler(
         top_p: request.top_p,
         top_k: request.top_k,
         min_p: request.min_p,
+        enable_thinking: request.enable_thinking,
         max_iterations: request.max_iterations,
         system_prompt: request.system_prompt.clone(),
         persona_id: request.persona_id.clone(),
@@ -1618,6 +1632,7 @@ pub async fn turn_async_handler(
         top_p: request.top_p,
         top_k: request.top_k,
         min_p: request.min_p,
+        enable_thinking: request.enable_thinking,
         max_iterations: request.max_iterations,
         system_prompt: request.system_prompt.clone(),
         persona_id: request.persona_id.clone(),
diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs
index 84f2b32..af00731 100644
--- a/src/ai/insight_chat.rs
+++ b/src/ai/insight_chat.rs
@@ -70,6 +70,10 @@ pub struct ChatTurnRequest {
     pub top_p: Option<f32>,
     pub top_k: Option<i32>,
     pub min_p: Option<f32>,
+    /// Reasoning toggle for thinking-capable models. Forwarded to the
+    /// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored
+    /// by other backends. None defers to the model/template default.
+    pub enable_thinking: Option<bool>,
     pub max_iterations: Option<usize>,
     /// Per-turn system-prompt override. In append mode (default), applied
     /// ephemerally — original system message restored before persistence.
@@ -344,6 +348,7 @@ impl InsightChatService {
             top_p: req.top_p,
             top_k: req.top_k,
             min_p: req.min_p,
+            enable_thinking: req.enable_thinking,
         };
         let backend = self.generator.resolve_backend(kind, &overrides).await?;
         let model_used = backend.model().to_string();
@@ -847,6 +852,7 @@ impl InsightChatService {
             top_p: req.top_p,
             top_k: req.top_k,
             min_p: req.min_p,
+            enable_thinking: req.enable_thinking,
         };
         let backend = self.generator.resolve_backend(kind, &overrides).await?;
         let model_used = backend.model().to_string();
@@ -1017,6 +1023,7 @@ impl InsightChatService {
             top_p: req.top_p,
             top_k: req.top_k,
             min_p: req.min_p,
+            enable_thinking: req.enable_thinking,
         };
         let backend = self.generator.resolve_backend(kind, &overrides).await?;
         let model_used = backend.model().to_string();
@@ -1425,6 +1432,7 @@ impl InsightChatService {
             top_p: req.top_p,
             top_k: req.top_k,
             min_p: req.min_p,
+            enable_thinking: req.enable_thinking,
         };
         let backend = self.generator.resolve_backend(kind, &overrides).await?;
         let model_used = backend.model().to_string();
@@ -1607,6 +1615,7 @@ impl InsightChatService {
             top_p: req.top_p,
             top_k: req.top_k,
             min_p: req.min_p,
+            enable_thinking: req.enable_thinking,
         };
         let backend = self.generator.resolve_backend(kind, &overrides).await?;
         let model_used = backend.model().to_string();
diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs
index 4ff8494..d45fa55 100644
--- a/src/ai/insight_generator.rs
+++ b/src/ai/insight_generator.rs
@@ -3933,6 +3933,7 @@ Return ONLY the summary, nothing else."#,
             if let Some(ctx) = overrides.num_ctx {
                 c.set_num_ctx(Some(ctx));
             }
+            c.set_enable_thinking(overrides.enable_thinking);
             Box::new(c)
         } else {
             // Pure Ollama local.
@@ -4064,6 +4065,7 @@ Return ONLY the summary, nothing else."#,
         top_p: Option<f32>,
         top_k: Option<i32>,
         min_p: Option<f32>,
+        enable_thinking: Option<bool>,
         max_iterations: usize,
         backend: Option<String>,
         fewshot_examples: Vec<Vec<ChatMessage>>,
@@ -4091,6 +4093,7 @@ Return ONLY the summary, nothing else."#,
             top_p,
             top_k,
             min_p,
+            enable_thinking,
         };
         let backend = self.resolve_backend(kind, &overrides).await?;
         span.set_attribute(KeyValue::new("model", backend.model().to_string()));
diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs
index 8a7c898..77e7f63 100644
--- a/src/ai/llamacpp.rs
+++ b/src/ai/llamacpp.rs
@@ -64,6 +64,12 @@ pub struct LlamaCppClient {
     top_p: Option<f32>,
     top_k: Option<i32>,
     min_p: Option<f32>,
+    /// When `Some`, forwarded to llama-server as
+    /// `chat_template_kwargs: {"enable_thinking": <bool>}`. The Jinja chat
+    /// template (e.g. Qwen3) reads this to gate its reasoning block. `None`
+    /// omits the key entirely, leaving the template's own default. Templates
+    /// that don't reference the key ignore it, so sending it is harmless.
+    enable_thinking: Option<bool>,
 }
 
 impl LlamaCppClient {
@@ -89,6 +95,7 @@ impl LlamaCppClient {
             top_p: None,
             top_k: None,
             min_p: None,
+            enable_thinking: None,
         }
     }
 
@@ -104,6 +111,12 @@ impl LlamaCppClient {
         self.num_ctx = num_ctx;
     }
 
+    /// Store the reasoning toggle sent as `chat_template_kwargs.enable_thinking`.
+    /// `None` omits the key entirely, so the chat template's own default applies.
+    pub fn set_enable_thinking(&mut self, enable_thinking: Option<bool>) {
+        self.enable_thinking = enable_thinking;
+    }
+
     pub fn set_sampling_params(
         &mut self,
         temperature: Option<f32>,
@@ -458,6 +471,12 @@ impl LlamaCppClient {
         // via -c, so we silently drop the override here. The config.yaml
         // entry is the source of truth for context size.
         let _ = self.num_ctx;
+        // Reasoning toggle for thinking-capable templates (Qwen3 et al.).
+        // llama-server forwards chat_template_kwargs into the Jinja render
+        // (requires --jinja); templates that ignore the key are unaffected.
+        if let Some(think) = self.enable_thinking {
+            v.push(("chat_template_kwargs", json!({ "enable_thinking": think })));
+        }
         v
     }
 
diff --git a/src/bin/populate_knowledge.rs b/src/bin/populate_knowledge.rs
index 71f2f8a..396eddc 100644
--- a/src/bin/populate_knowledge.rs
+++ b/src/bin/populate_knowledge.rs
@@ -336,6 +336,7 @@ async fn main() -> anyhow::Result<()> {
                 args.top_p,
                 args.top_k,
                 args.min_p,
+                None, // enable_thinking: leave model/template default
                 args.max_iterations,
                 None,
                 Vec::new(),
diff --git a/src/reels/script.rs b/src/reels/script.rs
index 858efd1..38ef9cc 100644
--- a/src/reels/script.rs
+++ b/src/reels/script.rs
@@ -309,6 +309,7 @@ pub async fn generate_script_agentic(
                 top_p: None,
                 top_k: None,
                 min_p: None,
+                enable_thinking: None,
             },
         )
         .await
diff --git a/src/unified_search.rs b/src/unified_search.rs
index 555773c..0940a92 100644
--- a/src/unified_search.rs
+++ b/src/unified_search.rs
@@ -193,6 +193,7 @@ pub async fn unified_search<TagD: TagDao>(
         top_p: None,
         top_k: None,
         min_p: None,
+        enable_thinking: None,
     };
     let backend = match state
         .insight_generator