feature/insight-jobs #102
+59
-2
@@ -27,6 +27,12 @@ const RESPONSE_HEADROOM_TOKENS: usize = 2048;
|
|||||||
/// tokenization is model-specific; this avoids carrying tiktoken just for a
|
/// tokenization is model-specific; this avoids carrying tiktoken just for a
|
||||||
/// soft bound.
|
/// soft bound.
|
||||||
const BYTES_PER_TOKEN: usize = 4;
|
const BYTES_PER_TOKEN: usize = 4;
|
||||||
|
/// Fixed number of tokens billed against the truncation budget for every
/// image inlined into a message.
///
/// Images are downscaled to a 1024px longest edge JPEG before being inlined
/// (see `load_image_as_base64`), which vision models bill at roughly 1.3K
/// tokens. The base64 text itself runs to hundreds of KB of characters and
/// must NOT be treated as text bytes: doing that swamps the whole text budget
/// and triggers spurious truncation on every turn.
const IMAGE_TOKENS_EACH: usize = 1_300;
|
||||||
|
|
||||||
pub type ChatLockMap = Arc<TokioMutex<HashMap<(i32, String), Arc<TokioMutex<()>>>>>;
|
pub type ChatLockMap = Arc<TokioMutex<HashMap<(i32, String), Arc<TokioMutex<()>>>>>;
|
||||||
|
|
||||||
@@ -2359,10 +2365,32 @@ pub(crate) fn apply_context_budget(messages: &mut Vec<ChatMessage>, budget_bytes
|
|||||||
dropped_any
|
dropped_any
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Estimate the serialized byte size of `messages` for the truncation budget,
|
||||||
|
/// EXCLUDING inlined base64 image payloads. Images are charged a flat
|
||||||
|
/// `IMAGE_TOKENS_EACH` instead: their base64 is hundreds of KB of characters
|
||||||
|
/// that have no relation to the text token pressure we're budgeting against,
|
||||||
|
/// and counting them verbatim makes a single photo exceed the entire budget,
|
||||||
|
/// spuriously trimming all history on every turn.
|
||||||
fn estimate_bytes(messages: &[ChatMessage]) -> usize {
|
fn estimate_bytes(messages: &[ChatMessage]) -> usize {
|
||||||
serde_json::to_string(messages)
|
let mut image_count = 0usize;
|
||||||
|
// Clone with image payloads stripped so they don't inflate the byte count.
|
||||||
|
// We still account for the (small) non-image fields verbatim.
|
||||||
|
let stripped: Vec<ChatMessage> = messages
|
||||||
|
.iter()
|
||||||
|
.map(|m| {
|
||||||
|
if let Some(imgs) = m.images.as_ref() {
|
||||||
|
image_count += imgs.len();
|
||||||
|
}
|
||||||
|
ChatMessage {
|
||||||
|
images: None,
|
||||||
|
..m.clone()
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.collect();
|
||||||
|
let text_bytes = serde_json::to_string(&stripped)
|
||||||
.map(|s| s.len())
|
.map(|s| s.len())
|
||||||
.unwrap_or(0)
|
.unwrap_or(0);
|
||||||
|
text_bytes + image_count * IMAGE_TOKENS_EACH * BYTES_PER_TOKEN
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -2422,6 +2450,35 @@ mod tests {
|
|||||||
assert_eq!(msgs.len(), 2);
|
assert_eq!(msgs.len(), 2);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn image_payload_excluded_from_budget() {
    // Scenario: the first user turn has very little text but carries a
    // ~400KB base64 image. If the base64 were counted verbatim (the old
    // behavior) it would dwarf the budget and force every tool turn to be
    // dropped on each request. Charged at the flat per-image rate instead,
    // this short conversation fits comfortably.
    let image_b64 = "A".repeat(400_000);
    let mut user_turn = ChatMessage::user("describe this");
    user_turn.images = Some(vec![image_b64]);

    let mut conversation = vec![
        ChatMessage::system("sys"),
        user_turn,
        assistant_with_tool_call("get_x"),
        ChatMessage::tool_result("small x result"),
        assistant_text("here is the answer"),
    ];
    let len_before = conversation.len();

    // Default budget: (8192 - 2048) * 4 bytes ≈ 24KB. The text is far below
    // that; only the (excluded) image bytes could exceed it.
    let usable_tokens = DEFAULT_NUM_CTX as usize - RESPONSE_HEADROOM_TOKENS;
    let budget_bytes = usable_tokens * BYTES_PER_TOKEN;

    let dropped = apply_context_budget(&mut conversation, budget_bytes);

    assert!(!dropped, "short conversation with one image must not truncate");
    assert_eq!(conversation.len(), len_before, "no messages should be dropped");
    // Sanity: the flat image charge is included yet the total stays within budget.
    assert!(estimate_bytes(&conversation) <= budget_bytes);
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn truncation_returns_false_with_no_droppable_pairs() {
|
fn truncation_returns_false_with_no_droppable_pairs() {
|
||||||
// Only system + user, no tool-call turns to drop.
|
// Only system + user, no tool-call turns to drop.
|
||||||
|
|||||||
Reference in New Issue
Block a user