AI: add enable_thinking reasoning toggle plumbed to llama.cpp

New optional `enable_thinking` field on `SamplingOverrides`, forwarded to llama-server as `chat_template_kwargs.enable_thinking` (gates Qwen3-style reasoning blocks). `None` leaves the template default in place; other backends ignore it. Wired through the agentic-insight and chat-turn request bodies/handlers.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-17 18:14:44 -04:00
parent f2ab8d3740
commit 48a1b753f0
8 changed files with 55 additions and 0 deletions
@@ -41,6 +41,10 @@ pub struct SamplingOverrides {
    pub top_p: Option<f32>,
    pub top_k: Option<i32>,
    pub min_p: Option<f32>,
    /// Reasoning toggle. Only the llama.cpp backend honors it (forwarded as
    /// `chat_template_kwargs.enable_thinking`); other backends ignore it.
    /// `None` leaves the model/template default in place.
    pub enable_thinking: Option<bool>,
 }
 impl SamplingOverrides {
@@ -124,6 +128,7 @@ mod tests {
            top_p: None,
            top_k: None,
            min_p: None,
            enable_thinking: None,
        };
        assert!(!empty.has_sampling());
@@ -134,6 +139,7 @@ mod tests {
            top_p: None,
            top_k: None,
            min_p: None,
            enable_thinking: None,
        };
        assert!(with_temp.has_sampling());
    }
@@ -40,6 +40,12 @@ pub struct GeneratePhotoInsightRequest {
    pub top_k: Option<i32>,
    #[serde(default)]
    pub min_p: Option<f32>,
    /// Reasoning toggle for thinking-capable models. Forwarded to the
    /// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored
    /// by other backends and the non-agentic (Ollama) path. Only the agentic
    /// endpoint routes through llama.cpp. None defers to the template default.
    #[serde(default)]
    pub enable_thinking: Option<bool>,
    /// `"local"` (default, Ollama with images) | `"hybrid"` (local vision +
    /// OpenRouter chat). Only respected by the agentic endpoint.
    #[serde(default)]
@@ -868,6 +874,7 @@ pub async fn generate_agentic_insight_handler(
                    request.top_p,
                    request.top_k,
                    request.min_p,
                    request.enable_thinking,
                    max_iterations,
                    request.backend.clone(),
                    fewshot_examples,
@@ -1169,6 +1176,11 @@ pub struct ChatTurnHttpRequest {
    pub top_k: Option<i32>,
    #[serde(default)]
    pub min_p: Option<f32>,
    /// Reasoning toggle for thinking-capable models. Forwarded to the
    /// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored
    /// by other backends. None defers to the model/template default.
    #[serde(default)]
    pub enable_thinking: Option<bool>,
    #[serde(default)]
    pub max_iterations: Option<usize>,
    /// Per-turn system-prompt override. Ephemeral in append mode,
@@ -1247,6 +1259,7 @@ pub async fn chat_turn_handler(
        top_p: request.top_p,
        top_k: request.top_k,
        min_p: request.min_p,
        enable_thinking: request.enable_thinking,
        max_iterations: request.max_iterations,
        system_prompt: request.system_prompt.clone(),
        persona_id: request.persona_id.clone(),
@@ -1473,6 +1486,7 @@ pub async fn chat_stream_handler(
        top_p: request.top_p,
        top_k: request.top_k,
        min_p: request.min_p,
        enable_thinking: request.enable_thinking,
        max_iterations: request.max_iterations,
        system_prompt: request.system_prompt.clone(),
        persona_id: request.persona_id.clone(),
@@ -1618,6 +1632,7 @@ pub async fn turn_async_handler(
        top_p: request.top_p,
        top_k: request.top_k,
        min_p: request.min_p,
        enable_thinking: request.enable_thinking,
        max_iterations: request.max_iterations,
        system_prompt: request.system_prompt.clone(),
        persona_id: request.persona_id.clone(),
@@ -70,6 +70,10 @@ pub struct ChatTurnRequest {
    pub top_p: Option<f32>,
    pub top_k: Option<i32>,
    pub min_p: Option<f32>,
    /// Reasoning toggle for thinking-capable models. Forwarded to the
    /// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored
    /// by other backends. None defers to the model/template default.
    pub enable_thinking: Option<bool>,
    pub max_iterations: Option<usize>,
    /// Per-turn system-prompt override. In append mode (default), applied
    /// ephemerally — original system message restored before persistence.
@@ -344,6 +348,7 @@ impl InsightChatService {
            top_p: req.top_p,
            top_k: req.top_k,
            min_p: req.min_p,
            enable_thinking: req.enable_thinking,
        };
        let backend = self.generator.resolve_backend(kind, &overrides).await?;
        let model_used = backend.model().to_string();
@@ -847,6 +852,7 @@ impl InsightChatService {
            top_p: req.top_p,
            top_k: req.top_k,
            min_p: req.min_p,
            enable_thinking: req.enable_thinking,
        };
        let backend = self.generator.resolve_backend(kind, &overrides).await?;
        let model_used = backend.model().to_string();
@@ -1017,6 +1023,7 @@ impl InsightChatService {
            top_p: req.top_p,
            top_k: req.top_k,
            min_p: req.min_p,
            enable_thinking: req.enable_thinking,
        };
        let backend = self.generator.resolve_backend(kind, &overrides).await?;
        let model_used = backend.model().to_string();
@@ -1425,6 +1432,7 @@ impl InsightChatService {
            top_p: req.top_p,
            top_k: req.top_k,
            min_p: req.min_p,
            enable_thinking: req.enable_thinking,
        };
        let backend = self.generator.resolve_backend(kind, &overrides).await?;
        let model_used = backend.model().to_string();
@@ -1607,6 +1615,7 @@ impl InsightChatService {
            top_p: req.top_p,
            top_k: req.top_k,
            min_p: req.min_p,
            enable_thinking: req.enable_thinking,
        };
        let backend = self.generator.resolve_backend(kind, &overrides).await?;
        let model_used = backend.model().to_string();
@@ -3933,6 +3933,7 @@ Return ONLY the summary, nothing else."#,
            if let Some(ctx) = overrides.num_ctx {
                c.set_num_ctx(Some(ctx));
            }
            c.set_enable_thinking(overrides.enable_thinking);
            Box::new(c)
        } else {
            // Pure Ollama local.
@@ -4064,6 +4065,7 @@ Return ONLY the summary, nothing else."#,
        top_p: Option<f32>,
        top_k: Option<i32>,
        min_p: Option<f32>,
        enable_thinking: Option<bool>,
        max_iterations: usize,
        backend: Option<String>,
        fewshot_examples: Vec<Vec<ChatMessage>>,
@@ -4091,6 +4093,7 @@ Return ONLY the summary, nothing else."#,
            top_p,
            top_k,
            min_p,
            enable_thinking,
        };
        let backend = self.resolve_backend(kind, &overrides).await?;
        span.set_attribute(KeyValue::new("model", backend.model().to_string()));
@@ -64,6 +64,12 @@ pub struct LlamaCppClient {
    top_p: Option<f32>,
    top_k: Option<i32>,
    min_p: Option<f32>,
    /// When `Some`, forwarded to llama-server as
    /// `chat_template_kwargs: {"enable_thinking": <bool>}`. The Jinja chat
    /// template (e.g. Qwen3) reads this to gate its reasoning block. `None`
    /// omits the key entirely, leaving the template's own default. Templates
    /// that don't reference the key ignore it, so sending it is harmless.
    enable_thinking: Option<bool>,
 }
 impl LlamaCppClient {
@@ -89,6 +95,7 @@ impl LlamaCppClient {
            top_p: None,
            top_k: None,
            min_p: None,
            enable_thinking: None,
        }
    }
@@ -104,6 +111,12 @@ impl LlamaCppClient {
        self.num_ctx = num_ctx;
    }
    /// Set the reasoning toggle forwarded as `chat_template_kwargs.enable_thinking`.
    /// `None` leaves the chat template's own default in place.
    ///
    /// `Some(flag)` is stored on the client and later emitted to llama-server
    /// as `chat_template_kwargs: {"enable_thinking": flag}` when the request
    /// parameters are assembled; `None` omits the key from the request.
    /// Calling this again overwrites any previously stored value.
    pub fn set_enable_thinking(&mut self, enable_thinking: Option<bool>) {
        self.enable_thinking = enable_thinking;
    }
    pub fn set_sampling_params(
        &mut self,
        temperature: Option<f32>,
@@ -458,6 +471,12 @@ impl LlamaCppClient {
        // via -c, so we silently drop the override here. The config.yaml
        // entry is the source of truth for context size.
        let _ = self.num_ctx;
        // Reasoning toggle for thinking-capable templates (Qwen3 et al.).
        // llama-server forwards chat_template_kwargs into the Jinja render
        // (requires --jinja); templates that ignore the key are unaffected.
        if let Some(think) = self.enable_thinking {
            v.push(("chat_template_kwargs", json!({ "enable_thinking": think })));
        }
        v
    }
@@ -336,6 +336,7 @@ async fn main() -> anyhow::Result<()> {
                args.top_p,
                args.top_k,
                args.min_p,
                None, // enable_thinking: leave model/template default
                args.max_iterations,
                None,
                Vec::new(),
@@ -309,6 +309,7 @@ pub async fn generate_script_agentic(
                top_p: None,
                top_k: None,
                min_p: None,
                enable_thinking: None,
            },
        )
        .await
@@ -193,6 +193,7 @@ pub async fn unified_search<TagD: TagDao>(
        top_p: None,
        top_k: None,
        min_p: None,
        enable_thinking: None,
    };
    let backend = match state
        .insight_generator