diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index 3622972..f4d478b 100644 --- a/src/ai/insight_chat.rs +++ b/src/ai/insight_chat.rs @@ -27,6 +27,12 @@ const RESPONSE_HEADROOM_TOKENS: usize = 2048; /// tokenization is model-specific; this avoids carrying tiktoken just for a /// soft bound. const BYTES_PER_TOKEN: usize = 4; +/// Flat token cost charged per inlined image in the truncation budget. A +/// 1024px-longest-edge JPEG (see `load_image_as_base64`) costs vision models on +/// the order of ~1.3K tokens. Crucially, the raw base64 (hundreds of KB of +/// characters) must NOT be counted as text bytes — doing so dwarfs the entire +/// text budget and forces spurious truncation on every turn. +const IMAGE_TOKENS_EACH: usize = 1300; pub type ChatLockMap = Arc>>>>; @@ -2359,10 +2365,32 @@ pub(crate) fn apply_context_budget(messages: &mut Vec<ChatMessage>, budget_bytes dropped_any } +/// Estimate the serialized byte size of `messages` for the truncation budget, +/// EXCLUDING inlined base64 image payloads. Images are charged a flat +/// `IMAGE_TOKENS_EACH` instead: their base64 is hundreds of KB of characters +/// that have no relation to the text token pressure we're budgeting against, +/// and counting them verbatim makes a single photo exceed the entire budget, +/// spuriously trimming all history on every turn. fn estimate_bytes(messages: &[ChatMessage]) -> usize { - serde_json::to_string(messages) + let mut image_count = 0usize; + // Clone with image payloads stripped so they don't inflate the byte count. + // We still account for the (small) non-image fields verbatim. 
+ let stripped: Vec<ChatMessage> = messages + .iter() + .map(|m| { + if let Some(imgs) = m.images.as_ref() { + image_count += imgs.len(); + } + ChatMessage { + images: None, + ..m.clone() + } + }) + .collect(); + let text_bytes = serde_json::to_string(&stripped) .map(|s| s.len()) - .unwrap_or(0) + .unwrap_or(0); + text_bytes + image_count * IMAGE_TOKENS_EACH * BYTES_PER_TOKEN } #[cfg(test)] @@ -2422,6 +2450,35 @@ mod tests { assert_eq!(msgs.len(), 2); } + #[test] + fn image_payload_excluded_from_budget() { + // First user message carries a ~400KB base64 image but only a little + // text. Counting the base64 verbatim (old behavior) dwarfs the budget + // and forces all tool history to be dropped on every turn. The image + // must instead be charged a flat per-image cost so a short + // conversation comfortably fits. + let mut user = ChatMessage::user("describe this"); + user.images = Some(vec!["A".repeat(400_000)]); + let mut msgs = vec![ + ChatMessage::system("sys"), + user, + assistant_with_tool_call("get_x"), + ChatMessage::tool_result("small x result"), + assistant_text("here is the answer"), + ]; + + // Default budget: (8192 - 2048) * 4 bytes ≈ 24KB. The text easily fits; + // only the (excluded) image bytes could blow it. + let budget_bytes = (DEFAULT_NUM_CTX as usize - RESPONSE_HEADROOM_TOKENS) * BYTES_PER_TOKEN; + let original_len = msgs.len(); + let dropped = apply_context_budget(&mut msgs, budget_bytes); + + assert!(!dropped, "short conversation with one image must not truncate"); + assert_eq!(msgs.len(), original_len, "no messages should be dropped"); + // Sanity: the flat image charge is accounted for but stays well under budget. + assert!(estimate_bytes(&msgs) <= budget_bytes); + } + #[test] fn truncation_returns_false_with_no_droppable_pairs() { // Only system + user, no tool-call turns to drop.