From 48a1b753f0ec706c6a388a91aeddbec583723f19 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Wed, 17 Jun 2026 18:14:44 -0400 Subject: [PATCH] AI: add enable_thinking reasoning toggle plumbed to llama.cpp New optional SamplingOverride forwarded to llama-server as chat_template_kwargs.enable_thinking (gates Qwen3-style reasoning blocks). None leaves the template default; other backends ignore it. Wired through the agentic-insight and chat-turn request bodies/handlers. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/ai/backend.rs | 6 ++++++ src/ai/handlers.rs | 15 +++++++++++++++ src/ai/insight_chat.rs | 9 +++++++++ src/ai/insight_generator.rs | 3 +++ src/ai/llamacpp.rs | 19 +++++++++++++++++++ src/bin/populate_knowledge.rs | 1 + src/reels/script.rs | 1 + src/unified_search.rs | 1 + 8 files changed, 55 insertions(+) diff --git a/src/ai/backend.rs b/src/ai/backend.rs index 0515f1c..dfcdd03 100644 --- a/src/ai/backend.rs +++ b/src/ai/backend.rs @@ -41,6 +41,10 @@ pub struct SamplingOverrides { pub top_p: Option, pub top_k: Option, pub min_p: Option, + /// Reasoning toggle. Only the llama.cpp backend honors it (forwarded as + /// `chat_template_kwargs.enable_thinking`); other backends ignore it. + /// `None` leaves the model/template default in place. + pub enable_thinking: Option, } impl SamplingOverrides { @@ -124,6 +128,7 @@ mod tests { top_p: None, top_k: None, min_p: None, + enable_thinking: None, }; assert!(!empty.has_sampling()); @@ -134,6 +139,7 @@ mod tests { top_p: None, top_k: None, min_p: None, + enable_thinking: None, }; assert!(with_temp.has_sampling()); } diff --git a/src/ai/handlers.rs b/src/ai/handlers.rs index c6bc212..ae9f300 100644 --- a/src/ai/handlers.rs +++ b/src/ai/handlers.rs @@ -40,6 +40,12 @@ pub struct GeneratePhotoInsightRequest { pub top_k: Option, #[serde(default)] pub min_p: Option, + /// Reasoning toggle for thinking-capable models. 
Forwarded to the + /// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored + /// by other backends and the non-agentic (Ollama) path. Only the agentic + /// endpoint routes through llama.cpp. None defers to the template default. + #[serde(default)] + pub enable_thinking: Option, /// `"local"` (default, Ollama with images) | `"hybrid"` (local vision + /// OpenRouter chat). Only respected by the agentic endpoint. #[serde(default)] @@ -868,6 +874,7 @@ pub async fn generate_agentic_insight_handler( request.top_p, request.top_k, request.min_p, + request.enable_thinking, max_iterations, request.backend.clone(), fewshot_examples, @@ -1169,6 +1176,11 @@ pub struct ChatTurnHttpRequest { pub top_k: Option, #[serde(default)] pub min_p: Option, + /// Reasoning toggle for thinking-capable models. Forwarded to the + /// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored + /// by other backends. None defers to the model/template default. + #[serde(default)] + pub enable_thinking: Option, #[serde(default)] pub max_iterations: Option, /// Per-turn system-prompt override. 
Ephemeral in append mode, @@ -1247,6 +1259,7 @@ pub async fn chat_turn_handler( top_p: request.top_p, top_k: request.top_k, min_p: request.min_p, + enable_thinking: request.enable_thinking, max_iterations: request.max_iterations, system_prompt: request.system_prompt.clone(), persona_id: request.persona_id.clone(), @@ -1473,6 +1486,7 @@ pub async fn chat_stream_handler( top_p: request.top_p, top_k: request.top_k, min_p: request.min_p, + enable_thinking: request.enable_thinking, max_iterations: request.max_iterations, system_prompt: request.system_prompt.clone(), persona_id: request.persona_id.clone(), @@ -1618,6 +1632,7 @@ pub async fn turn_async_handler( top_p: request.top_p, top_k: request.top_k, min_p: request.min_p, + enable_thinking: request.enable_thinking, max_iterations: request.max_iterations, system_prompt: request.system_prompt.clone(), persona_id: request.persona_id.clone(), diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index 84f2b32..af00731 100644 --- a/src/ai/insight_chat.rs +++ b/src/ai/insight_chat.rs @@ -70,6 +70,10 @@ pub struct ChatTurnRequest { pub top_p: Option, pub top_k: Option, pub min_p: Option, + /// Reasoning toggle for thinking-capable models. Forwarded to the + /// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored + /// by other backends. None defers to the model/template default. + pub enable_thinking: Option, pub max_iterations: Option, /// Per-turn system-prompt override. In append mode (default), applied /// ephemerally — original system message restored before persistence. 
@@ -344,6 +348,7 @@ impl InsightChatService { top_p: req.top_p, top_k: req.top_k, min_p: req.min_p, + enable_thinking: req.enable_thinking, }; let backend = self.generator.resolve_backend(kind, &overrides).await?; let model_used = backend.model().to_string(); @@ -847,6 +852,7 @@ impl InsightChatService { top_p: req.top_p, top_k: req.top_k, min_p: req.min_p, + enable_thinking: req.enable_thinking, }; let backend = self.generator.resolve_backend(kind, &overrides).await?; let model_used = backend.model().to_string(); @@ -1017,6 +1023,7 @@ impl InsightChatService { top_p: req.top_p, top_k: req.top_k, min_p: req.min_p, + enable_thinking: req.enable_thinking, }; let backend = self.generator.resolve_backend(kind, &overrides).await?; let model_used = backend.model().to_string(); @@ -1425,6 +1432,7 @@ impl InsightChatService { top_p: req.top_p, top_k: req.top_k, min_p: req.min_p, + enable_thinking: req.enable_thinking, }; let backend = self.generator.resolve_backend(kind, &overrides).await?; let model_used = backend.model().to_string(); @@ -1607,6 +1615,7 @@ impl InsightChatService { top_p: req.top_p, top_k: req.top_k, min_p: req.min_p, + enable_thinking: req.enable_thinking, }; let backend = self.generator.resolve_backend(kind, &overrides).await?; let model_used = backend.model().to_string(); diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index 4ff8494..d45fa55 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -3933,6 +3933,7 @@ Return ONLY the summary, nothing else."#, if let Some(ctx) = overrides.num_ctx { c.set_num_ctx(Some(ctx)); } + c.set_enable_thinking(overrides.enable_thinking); Box::new(c) } else { // Pure Ollama local. 
@@ -4064,6 +4065,7 @@ Return ONLY the summary, nothing else."#, top_p: Option, top_k: Option, min_p: Option, + enable_thinking: Option, max_iterations: usize, backend: Option, fewshot_examples: Vec>, @@ -4091,6 +4093,7 @@ Return ONLY the summary, nothing else."#, top_p, top_k, min_p, + enable_thinking, }; let backend = self.resolve_backend(kind, &overrides).await?; span.set_attribute(KeyValue::new("model", backend.model().to_string())); diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs index 8a7c898..77e7f63 100644 --- a/src/ai/llamacpp.rs +++ b/src/ai/llamacpp.rs @@ -64,6 +64,12 @@ pub struct LlamaCppClient { top_p: Option, top_k: Option, min_p: Option, + /// When `Some`, forwarded to llama-server as + /// `chat_template_kwargs: {"enable_thinking": <bool>}`. The Jinja chat + /// template (e.g. Qwen3) reads this to gate its reasoning block. `None` + /// omits the key entirely, leaving the template's own default. Templates + /// that don't reference the key ignore it, so sending it is harmless. + enable_thinking: Option, } impl LlamaCppClient { @@ -89,6 +95,7 @@ impl LlamaCppClient { top_p: None, top_k: None, min_p: None, + enable_thinking: None, } } @@ -104,6 +111,12 @@ impl LlamaCppClient { self.num_ctx = num_ctx; } + /// Set the reasoning toggle forwarded as `chat_template_kwargs.enable_thinking`. + /// `None` leaves the chat template's own default in place. + pub fn set_enable_thinking(&mut self, enable_thinking: Option) { + self.enable_thinking = enable_thinking; + } + pub fn set_sampling_params( &mut self, temperature: Option, @@ -458,6 +471,12 @@ impl LlamaCppClient { // via -c, so we silently drop the override here. The config.yaml // entry is the source of truth for context size. let _ = self.num_ctx; + // Reasoning toggle for thinking-capable templates (Qwen3 et al.). + // llama-server forwards chat_template_kwargs into the Jinja render + // (requires --jinja); templates that ignore the key are unaffected. 
+ if let Some(think) = self.enable_thinking { + v.push(("chat_template_kwargs", json!({ "enable_thinking": think }))); + } v } diff --git a/src/bin/populate_knowledge.rs b/src/bin/populate_knowledge.rs index 71f2f8a..396eddc 100644 --- a/src/bin/populate_knowledge.rs +++ b/src/bin/populate_knowledge.rs @@ -336,6 +336,7 @@ async fn main() -> anyhow::Result<()> { args.top_p, args.top_k, args.min_p, + None, // enable_thinking: leave model/template default args.max_iterations, None, Vec::new(), diff --git a/src/reels/script.rs b/src/reels/script.rs index 858efd1..38ef9cc 100644 --- a/src/reels/script.rs +++ b/src/reels/script.rs @@ -309,6 +309,7 @@ pub async fn generate_script_agentic( top_p: None, top_k: None, min_p: None, + enable_thinking: None, }, ) .await diff --git a/src/unified_search.rs b/src/unified_search.rs index 555773c..0940a92 100644 --- a/src/unified_search.rs +++ b/src/unified_search.rs @@ -193,6 +193,7 @@ pub async fn unified_search( top_p: None, top_k: None, min_p: None, + enable_thinking: None, }; let backend = match state .insight_generator