diff --git a/CLAUDE.md b/CLAUDE.md index fba33e0..b63ed4c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -671,6 +671,11 @@ LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS=600 # Per-request synth timeout (long # Insight Chat Continuation AGENTIC_CHAT_MAX_ITERATIONS=6 # Cap on tool-calling iterations per chat turn (default 6) +AGENTIC_CHAT_DEFAULT_NUM_CTX=32768 # Assumed context window for the history-truncation budget + # when a chat request omits num_ctx (default 32768). Size to + # the smallest context among the chat models actually served; + # too small silently guts replayed history every turn (and + # destroys llama.cpp KV-cache prefix reuse). ``` **AI Insights Fallback Behavior:** @@ -794,14 +799,17 @@ Per-`(library_id, file_path)` async mutex (`AppState.insight_chat.chat_locks`) serialises concurrent turns on the same insight so the JSON blob doesn't race. Context management is a soft bound: if the serialized history exceeds -`num_ctx - 2048` tokens (cheap 4-byte/token heuristic), the oldest -assistant-tool_call + tool_result pairs are dropped until under budget. The +`num_ctx - 2048` tokens (cheap 4-byte/token heuristic; `num_ctx` defaults +to `AGENTIC_CHAT_DEFAULT_NUM_CTX`, 32768, when the request omits it), the +oldest assistant-tool_call + tool_result pairs are dropped until under budget. The initial user message (with any images) and system prompt are always preserved. The `truncated` event / flag is surfaced to the client when a drop occurred. Configurable env: - `AGENTIC_CHAT_MAX_ITERATIONS` — cap on tool-calling iterations per turn (default 6). Per-request `max_iterations` is clamped to this cap. +- `AGENTIC_CHAT_DEFAULT_NUM_CTX` — assumed context window for the truncation + budget when the request omits `num_ctx` (default 32768). 
**Apollo Places integration (optional):** diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index 7298296..84f2b32 100644 --- a/src/ai/insight_chat.rs +++ b/src/ai/insight_chat.rs @@ -19,7 +19,13 @@ use futures::stream::{BoxStream, StreamExt}; use uuid::Uuid; const DEFAULT_MAX_ITERATIONS: usize = 6; -const DEFAULT_NUM_CTX: i32 = 8192; +/// Assumed context window when the request doesn't specify `num_ctx`. +/// The llama-swap chat slots serve 20k-131k contexts and real conversations +/// rarely pass ~16k tokens, so 32k keeps the truncation pass from gutting +/// history that the server could comfortably hold (which also destroys the +/// server's KV-cache prefix reuse). Override per-deploy with +/// AGENTIC_CHAT_DEFAULT_NUM_CTX if the serving models change shape. +const DEFAULT_NUM_CTX: i32 = 32768; /// Headroom reserved for the model's response, deducted from the context /// budget when deciding whether to truncate the replayed history. const RESPONSE_HEADROOM_TOKENS: usize = 2048; @@ -367,7 +373,7 @@ impl InsightChatService { // 6. Apply truncation budget. Drops oldest tool_call+tool pairs // (preserves system + first user including any images). - let budget_tokens = (req.num_ctx.unwrap_or(DEFAULT_NUM_CTX) as usize) + let budget_tokens = (req.num_ctx.unwrap_or_else(env_default_num_ctx) as usize) .saturating_sub(RESPONSE_HEADROOM_TOKENS); let budget_bytes = budget_tokens.saturating_mul(BYTES_PER_TOKEN); let truncated = apply_context_budget(&mut messages, budget_bytes); @@ -864,7 +870,7 @@ impl InsightChatService { None }; - let budget_tokens = (req.num_ctx.unwrap_or(DEFAULT_NUM_CTX) as usize) + let budget_tokens = (req.num_ctx.unwrap_or_else(env_default_num_ctx) as usize) .saturating_sub(RESPONSE_HEADROOM_TOKENS); let budget_bytes = budget_tokens.saturating_mul(BYTES_PER_TOKEN); let truncated = apply_context_budget(&mut messages, budget_bytes); @@ -1446,7 +1452,7 @@ impl InsightChatService { }; // Truncate before appending the new user turn. 
- let budget_tokens = (req.num_ctx.unwrap_or(DEFAULT_NUM_CTX) as usize) + let budget_tokens = (req.num_ctx.unwrap_or_else(env_default_num_ctx) as usize) .saturating_sub(RESPONSE_HEADROOM_TOKENS); let budget_bytes = budget_tokens.saturating_mul(BYTES_PER_TOKEN); let truncated = apply_context_budget(&mut messages, budget_bytes); @@ -2191,6 +2197,17 @@ fn env_max_iterations() -> usize { .max(1) } +/// Read AGENTIC_CHAT_DEFAULT_NUM_CTX once per call — the assumed context +/// window for the truncation budget when the request omits `num_ctx`. Same +/// no-static-global rationale as `env_max_iterations` above. +fn env_default_num_ctx() -> i32 { + std::env::var("AGENTIC_CHAT_DEFAULT_NUM_CTX") + .ok() + .and_then(|s| s.parse::<i32>().ok()) + .unwrap_or(DEFAULT_NUM_CTX) + .max(RESPONSE_HEADROOM_TOKENS as i32 + 1024) +} + /// Append a per-turn iteration-budget reminder to the replayed system /// message so the model knows how many tool-calling rounds this turn gets. /// Returns the original `content` so the caller can restore it before @@ -2516,8 +2533,8 @@ mod tests { assistant_text("here is the answer"), ]; - // Default budget: (8192 - 2048) * 4 bytes ≈ 24KB. The text easily fits; - // only the (excluded) image bytes could blow it. + // Default budget: (32768 - 2048) * 4 bytes ≈ 120KB. The text easily + // fits; only the (excluded) image bytes could blow it. let budget_bytes = (DEFAULT_NUM_CTX as usize - RESPONSE_HEADROOM_TOKENS) * BYTES_PER_TOKEN; let original_len = msgs.len(); let dropped = apply_context_budget(&mut msgs, budget_bytes);