Raise chat truncation default num_ctx to 32k, env-overridable

The history-truncation budget assumed an 8192-token context whenever a chat request omitted num_ctx, while the llama-swap chat slots serve 20k-131k. Replayed transcripts past ~6k tokens were silently gutted every turn — losing conversation history and destroying llama.cpp KV-cache prefix reuse (full SWA re-prefill per turn). The default is now 32768 (real conversations top out around 16k), and AGENTIC_CHAT_DEFAULT_NUM_CTX overrides it per deploy; the resulting value is floored at RESPONSE_HEADROOM_TOKENS + 1024 (3072 tokens). An explicit per-request num_ctx still wins.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-09 19:14:02 -04:00
parent 13f3635db2
commit 31904fef80
2 changed files with 33 additions and 8 deletions
@@ -19,7 +19,13 @@ use futures::stream::{BoxStream, StreamExt};
 use uuid::Uuid;

 const DEFAULT_MAX_ITERATIONS: usize = 6;
-const DEFAULT_NUM_CTX: i32 = 8192;
+/// Context window assumed when a request omits `num_ctx`. The llama-swap chat
+/// slots serve 20k-131k contexts and real conversations rarely pass ~16k
+/// tokens, so 32k keeps the truncation pass from gutting history the server
+/// could comfortably hold (which also destroys its KV-cache prefix reuse).
+/// Override per deploy with AGENTIC_CHAT_DEFAULT_NUM_CTX (floored at
+/// RESPONSE_HEADROOM_TOKENS + 1024); an explicit request `num_ctx` still wins.
+const DEFAULT_NUM_CTX: i32 = 32768;
 /// Headroom reserved for the model's response, deducted from the context
 /// budget when deciding whether to truncate the replayed history.
 const RESPONSE_HEADROOM_TOKENS: usize = 2048;
@@ -367,7 +373,7 @@ impl InsightChatService {

        // 6. Apply truncation budget. Drops oldest tool_call+tool pairs
        //    (preserves system + first user including any images).
-        let budget_tokens = (req.num_ctx.unwrap_or(DEFAULT_NUM_CTX) as usize)
+        let budget_tokens = (req.num_ctx.unwrap_or_else(env_default_num_ctx) as usize)
            .saturating_sub(RESPONSE_HEADROOM_TOKENS);
        let budget_bytes = budget_tokens.saturating_mul(BYTES_PER_TOKEN);
        let truncated = apply_context_budget(&mut messages, budget_bytes);
@@ -864,7 +870,7 @@ impl InsightChatService {
            None
        };

-        let budget_tokens = (req.num_ctx.unwrap_or(DEFAULT_NUM_CTX) as usize)
+        let budget_tokens = (req.num_ctx.unwrap_or_else(env_default_num_ctx) as usize)
            .saturating_sub(RESPONSE_HEADROOM_TOKENS);
        let budget_bytes = budget_tokens.saturating_mul(BYTES_PER_TOKEN);
        let truncated = apply_context_budget(&mut messages, budget_bytes);
@@ -1446,7 +1452,7 @@ impl InsightChatService {
        };

        // Truncate before appending the new user turn.
-        let budget_tokens = (req.num_ctx.unwrap_or(DEFAULT_NUM_CTX) as usize)
+        let budget_tokens = (req.num_ctx.unwrap_or_else(env_default_num_ctx) as usize)
            .saturating_sub(RESPONSE_HEADROOM_TOKENS);
        let budget_bytes = budget_tokens.saturating_mul(BYTES_PER_TOKEN);
        let truncated = apply_context_budget(&mut messages, budget_bytes);
@@ -2191,6 +2197,17 @@ fn env_max_iterations() -> usize {
        .max(1)
 }

+/// Context window assumed by the truncation budget when a request omits
+/// `num_ctx`: AGENTIC_CHAT_DEFAULT_NUM_CTX (trimmed, parsed as i32), else
+/// DEFAULT_NUM_CTX, floored at headroom + 1024. Re-read on every call, not cached.
+fn env_default_num_ctx() -> i32 {
+    std::env::var("AGENTIC_CHAT_DEFAULT_NUM_CTX")
+        .ok()
+        .and_then(|s| s.trim().parse::<i32>().ok()) // tolerate stray whitespace/newline
+        .unwrap_or(DEFAULT_NUM_CTX)
+        .max(RESPONSE_HEADROOM_TOKENS as i32 + 1024)
+}
+
 /// Append a per-turn iteration-budget reminder to the replayed system
 /// message so the model knows how many tool-calling rounds this turn gets.
 /// Returns the original `content` so the caller can restore it before
@@ -2516,8 +2533,8 @@ mod tests {
            assistant_text("here is the answer"),
        ];

-        // Default budget: (8192 - 2048) * 4 bytes ≈ 24KB. The text easily fits;
-        // only the (excluded) image bytes could blow it.
+        // Default budget: (32768 - 2048) * 4 bytes ≈ 120KB. The text easily
+        // fits; only the (excluded) image bytes could blow it.
        let budget_bytes = (DEFAULT_NUM_CTX as usize - RESPONSE_HEADROOM_TOKENS) * BYTES_PER_TOKEN;
        let original_len = msgs.len();
        let dropped = apply_context_budget(&mut msgs, budget_bytes);