2026-06-02 23:41:37 +00:00
1 changed files with 59 additions and 2 deletions
@@ -27,6 +27,12 @@ const RESPONSE_HEADROOM_TOKENS: usize = 2048;
 /// tokenization is model-specific; this avoids carrying tiktoken just for a
 /// soft bound.
 const BYTES_PER_TOKEN: usize = 4;
 /// Flat token cost charged per inlined image in the truncation budget. A
 /// 1024px-longest-edge JPEG (see `load_image_as_base64`) costs vision models
 /// roughly 1.3K tokens. Crucially, the raw base64 (hundreds of KB of
 /// characters) must NOT be counted as text bytes — doing so dwarfs the entire
 /// text budget and forces spurious truncation on every turn.
 ///
 /// `estimate_bytes` converts this charge into bytes by multiplying it by
 /// `BYTES_PER_TOKEN`, once per image.
 const IMAGE_TOKENS_EACH: usize = 1300;
 /// Shared, mutex-guarded map of locks keyed by an `(i32, String)` pair. The
 /// outer mutex protects the map itself; each value is a separately shareable
 /// `Arc<TokioMutex<()>>` that carries no data and so acts purely as a lock.
 /// NOTE(review): judging by the name, the key presumably identifies a chat —
 /// confirm what the `i32` and `String` components mean at the call sites.
 pub type ChatLockMap = Arc<TokioMutex<HashMap<(i32, String), Arc<TokioMutex<()>>>>>;
@@ -2359,10 +2365,32 @@ pub(crate) fn apply_context_budget(messages: &mut Vec<ChatMessage>, budget_bytes
    dropped_any
 }
 /// Estimate the serialized byte size of `messages` for the truncation budget,
 /// EXCLUDING inlined base64 image payloads. Images are charged a flat
 /// `IMAGE_TOKENS_EACH` instead: their base64 is hundreds of KB of characters
 /// that have no relation to the text token pressure we're budgeting against,
 /// and counting them verbatim makes a single photo exceed the entire budget,
 /// spuriously trimming all history on every turn.
 ///
 /// Returns the JSON length of the messages with `images` cleared, plus
 /// `IMAGE_TOKENS_EACH * BYTES_PER_TOKEN` for every image found. If JSON
 /// serialization fails, the text portion is counted as 0 bytes rather than
 /// reported as an error.
 fn estimate_bytes(messages: &[ChatMessage]) -> usize {
-    serde_json::to_string(messages)
+    let mut image_count = 0usize;
    // Clone with image payloads stripped so they don't inflate the byte count.
    // We still account for the (small) non-image fields verbatim.
    // NOTE(review): `..m.clone()` builds a full copy of each message before
    // `images: None` replaces that field, so (assuming a derived `Clone`) every
    // base64 payload is duplicated and immediately dropped on each call.
    // Consider cloning only the non-image fields if this shows up in profiles.
    let stripped: Vec<ChatMessage> = messages
        .iter()
        .map(|m| {
            // Tally images across all messages; a message with `images: None`
            // contributes nothing.
            if let Some(imgs) = m.images.as_ref() {
                image_count += imgs.len();
            }
            ChatMessage {
                images: None,
                ..m.clone()
            }
        })
        .collect();
    // A serialization error is deliberately swallowed: the text size falls
    // back to 0 so the estimate stays infallible.
    let text_bytes = serde_json::to_string(&stripped)
        .map(|s| s.len())
-        .unwrap_or(0)
+        .unwrap_or(0);
    // Convert the flat per-image token charge into bytes so it is expressed in
    // the same unit as `text_bytes`.
    text_bytes + image_count * IMAGE_TOKENS_EACH * BYTES_PER_TOKEN
 }
 #[cfg(test)]
@@ -2422,6 +2450,35 @@ mod tests {
        assert_eq!(msgs.len(), 2);
    }
    #[test]
    fn image_payload_excluded_from_budget() {
        // Regression guard for image accounting. The user turn below holds a
        // ~400KB base64 payload next to a few words of text. If that payload
        // were measured byte-for-byte it would swamp the budget and evict the
        // tool history on every turn; with the flat per-image charge this
        // short exchange must fit untouched.
        let image_turn = {
            let mut turn = ChatMessage::user("describe this");
            turn.images = Some(vec!["A".repeat(400_000)]);
            turn
        };
        let mut conversation = vec![
            ChatMessage::system("sys"),
            image_turn,
            assistant_with_tool_call("get_x"),
            ChatMessage::tool_result("small x result"),
            assistant_text("here is the answer"),
        ];
        let expected_len = conversation.len();
        // Budget = default context window minus the response headroom,
        // expressed in bytes (about 24KB if DEFAULT_NUM_CTX is 8192). The text
        // alone is far below that; only raw image bytes could exceed it.
        let usable_tokens = DEFAULT_NUM_CTX as usize - RESPONSE_HEADROOM_TOKENS;
        let budget_bytes = usable_tokens * BYTES_PER_TOKEN;
        let truncated = apply_context_budget(&mut conversation, budget_bytes);
        assert!(!truncated, "short conversation with one image must not truncate");
        assert_eq!(conversation.len(), expected_len, "no messages should be dropped");
        // The estimate still includes the flat image charge, yet remains
        // within the budget.
        assert!(estimate_bytes(&conversation) <= budget_bytes);
    }
    #[test]
    fn truncation_returns_false_with_no_droppable_pairs() {
        // Only system + user, no tool-call turns to drop.