From a542ea411b820100a793cd85e6f751e0ce9709e6 Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Sat, 30 May 2026 11:51:57 -0400
Subject: [PATCH] Exclude inlined image bytes from chat context budget
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The truncation budget estimated message size by serializing the full
ChatMessage array, including the base64 image persisted in the first
user message. A 1024px JPEG is hundreds of KB of base64 characters —
8-19x the entire ~24KB text budget at the default num_ctx — and the
image lives in the protected prefix that's never dropped. The budget
check was therefore almost always over budget, dropping all tool history
and firing the "trimmed context" banner on every turn for vision
backends that inline images.

estimate_bytes now strips image payloads before counting and charges a
flat IMAGE_TOKENS_EACH per image instead, so the budget reflects real
text token pressure. Adds a regression test covering a short
conversation with one large image.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 src/ai/insight_chat.rs | 61 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 59 insertions(+), 2 deletions(-)
diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs
index 3622972..f4d478b 100644
--- a/src/ai/insight_chat.rs
+++ b/src/ai/insight_chat.rs
@@ -27,6 +27,12 @@ const RESPONSE_HEADROOM_TOKENS: usize = 2048;
 /// tokenization is model-specific; this avoids carrying tiktoken just for a
 /// soft bound.
 const BYTES_PER_TOKEN: usize = 4;
+/// Flat token cost charged per inlined image in the truncation budget. A
+/// 1024px-longest-edge JPEG (see `load_image_as_base64`) costs vision models on
+/// the order of ~1.3K tokens. Crucially, the raw base64 (hundreds of KB of
+/// characters) must NOT be counted as text bytes — doing so dwarfs the entire
+/// text budget and forces spurious truncation on every turn.
+const IMAGE_TOKENS_EACH: usize = 1300;
 
 pub type ChatLockMap = Arc<TokioMutex<HashMap<(i32, String), Arc<TokioMutex<()>>>>>;
 
@@ -2359,10 +2365,32 @@ pub(crate) fn apply_context_budget(messages: &mut Vec<ChatMessage>, budget_bytes
     dropped_any
 }
 
+/// Estimate the serialized byte size of `messages` for the truncation budget,
+/// EXCLUDING inlined base64 image payloads. Images are charged a flat
+/// `IMAGE_TOKENS_EACH` instead: their base64 is hundreds of KB of characters
+/// that have no relation to the text token pressure we're budgeting against,
+/// and counting them verbatim makes a single photo exceed the entire budget,
+/// spuriously trimming all history on every turn.
 fn estimate_bytes(messages: &[ChatMessage]) -> usize {
-    serde_json::to_string(messages)
+    let mut image_count = 0usize;
+    // Rebuild each message with `images: None` so payloads don't inflate the count.
+    // Note `m.clone()` still copies them first; non-image fields count verbatim.
+    let stripped: Vec<ChatMessage> = messages
+        .iter()
+        .map(|m| {
+            if let Some(imgs) = m.images.as_ref() {
+                image_count += imgs.len();
+            }
+            ChatMessage {
+                images: None,
+                ..m.clone()
+            }
+        })
+        .collect();
+    let text_bytes = serde_json::to_string(&stripped)
         .map(|s| s.len())
-        .unwrap_or(0)
+        .unwrap_or(0);
+    text_bytes + image_count * IMAGE_TOKENS_EACH * BYTES_PER_TOKEN
 }
 
 #[cfg(test)]
@@ -2422,6 +2450,35 @@ mod tests {
         assert_eq!(msgs.len(), 2);
     }
 
+    #[test]
+    fn image_payload_excluded_from_budget() {
+        // First user message carries a ~400KB base64 image but only a little
+        // text. Counting the base64 verbatim (old behavior) dwarfs the budget
+        // and forces all tool history to be dropped on every turn. The image
+        // must instead be charged a flat per-image cost so a short
+        // conversation comfortably fits.
+        let mut user = ChatMessage::user("describe this");
+        user.images = Some(vec!["A".repeat(400_000)]);
+        let mut msgs = vec![
+            ChatMessage::system("sys"),
+            user,
+            assistant_with_tool_call("get_x"),
+            ChatMessage::tool_result("small x result"),
+            assistant_text("here is the answer"),
+        ];
+
+        // Default budget: (8192 - 2048) * 4 bytes ≈ 24KB. The text easily fits;
+        // only the (excluded) image bytes could blow it.
+        let budget_bytes = (DEFAULT_NUM_CTX as usize - RESPONSE_HEADROOM_TOKENS) * BYTES_PER_TOKEN;
+        let original_len = msgs.len();
+        let dropped = apply_context_budget(&mut msgs, budget_bytes);
+
+        assert!(!dropped, "short conversation with one image must not truncate");
+        assert_eq!(msgs.len(), original_len, "no messages should be dropped");
+        // Sanity: the flat image charge is accounted for but stays well under budget.
+        assert!(estimate_bytes(&msgs) <= budget_bytes);
+    }
+
     #[test]
     fn truncation_returns_false_with_no_droppable_pairs() {
         // Only system + user, no tool-call turns to drop.