insight-chat: photo context belongs in system msg, not user turn
After refresh, the rendered transcript was showing two unwanted
artifacts in the initial user bubble:
Photo file path: pics/DSC_5171.jpg
please tell me about this photo and what was going on around it
Please write your final answer now without calling any more tools.
Two distinct bugs:
1. Bootstrap was prepending `Photo file path: <path>` (and, in
hybrid mode, the visual description block) into the user-turn
content. The model needed it to call file_path-keyed tools, but
the user could see it in their own bubble on replay.
2. The no-tools fallback ("Please write your final answer now…")
was a synthetic user message we never stripped from history,
so it persisted into training_messages, rendered as a second
user bubble, AND wiped the prior tool-call accumulator inside
load_history (user-turn handler clears pending_tools), which
is why the tool invocations disappeared from the assistant
bubble after refresh.
Fixes:
- New `build_bootstrap_system_message` helper composes the persona
with a `--- PHOTO CONTEXT ---` block (path + optional visual
description). Lives in the system message, not the user turn.
The user's bubble shows only what they typed.
- Streaming agentic loop's no-tools fallback now records its
insertion index and removes the synthetic user prompt from
`messages` after the model responds. Final assistant content
stays — it reads coherently on replay without the synthetic
prompt above it. Applies to both bootstrap and continuation.
3 new tests cover the system-message composer (path-only, with
visual block, persona-trim). Total insight_chat unit tests: 27.
This commit is contained in:
@@ -934,25 +934,16 @@ impl InsightChatService {
|
|||||||
// discusses metadata-only is still useful.
|
// discusses metadata-only is still useful.
|
||||||
let image_base64: Option<String> = self.generator.load_image_as_base64(&normalized).ok();
|
let image_base64: Option<String> = self.generator.load_image_as_base64(&normalized).ok();
|
||||||
|
|
||||||
// Photo path block. Several agentic tools (recall_facts_for_photo,
|
|
||||||
// get_file_tags, get_faces_in_photo, etc.) take a `file_path` arg
|
|
||||||
// that the model has no way to know unless we put it in the user
|
|
||||||
// turn. Without this block small models invent placeholders like
|
|
||||||
// "input_file_0.png" or refuse and ask the user. Mirrors the
|
|
||||||
// user_content layout `generate_agentic_insight_for_photo` uses.
|
|
||||||
let path_block = format!("Photo file path: {}\n\n", normalized);
|
|
||||||
|
|
||||||
// Hybrid backend: pre-describe the image via local Ollama vision
|
// Hybrid backend: pre-describe the image via local Ollama vision
|
||||||
// and inline the description into the user turn. OpenRouter chat
|
// so OpenRouter chat models (which can't see images directly) get
|
||||||
// models don't see images directly. Mirrors the same pre-describe
|
// the visual description as text. Mirrors the same pre-describe
|
||||||
// pass that `generate_agentic_insight_for_photo` does for hybrid.
|
// pass that `generate_agentic_insight_for_photo` does for hybrid.
|
||||||
let visual_block = if is_hybrid {
|
let visual_block = if is_hybrid {
|
||||||
match image_base64.as_deref() {
|
match image_base64.as_deref() {
|
||||||
Some(b64) => match self.ollama.describe_image(b64).await {
|
Some(b64) => match self.ollama.describe_image(b64).await {
|
||||||
Ok(desc) => format!(
|
Ok(desc) => {
|
||||||
"Visual description (from local vision model):\n{}\n\n",
|
format!("Visual description (from local vision model):\n{}\n", desc)
|
||||||
desc
|
}
|
||||||
),
|
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
log::warn!("hybrid bootstrap: local describe_image failed: {}", e);
|
log::warn!("hybrid bootstrap: local describe_image failed: {}", e);
|
||||||
String::new()
|
String::new()
|
||||||
@@ -964,8 +955,6 @@ impl InsightChatService {
|
|||||||
String::new()
|
String::new()
|
||||||
};
|
};
|
||||||
|
|
||||||
let user_content = format!("{}{}{}", path_block, visual_block, req.user_message);
|
|
||||||
|
|
||||||
// Tool gates. Local + image present → expose describe_photo so
|
// Tool gates. Local + image present → expose describe_photo so
|
||||||
// the chat model can re-look at the photo on demand. Hybrid:
|
// the chat model can re-look at the photo on demand. Hybrid:
|
||||||
// already inlined, no tool needed.
|
// already inlined, no tool needed.
|
||||||
@@ -973,10 +962,17 @@ impl InsightChatService {
|
|||||||
let gate_opts = self.generator.current_gate_opts(offer_describe_tool);
|
let gate_opts = self.generator.current_gate_opts(offer_describe_tool);
|
||||||
let tools = InsightGenerator::build_tool_definitions(gate_opts);
|
let tools = InsightGenerator::build_tool_definitions(gate_opts);
|
||||||
|
|
||||||
// Build initial messages: persona system + user-turn-with-image.
|
// System message = persona + photo context block. Photo context
|
||||||
let system_content = resolve_bootstrap_system_prompt(req.system_prompt.as_deref());
|
// is in the system message — not the user turn — so the user's
|
||||||
|
// bubble in the rendered transcript shows only what they typed.
|
||||||
|
// Several agentic tools (recall_facts_for_photo, get_file_tags,
|
||||||
|
// get_faces_in_photo, etc.) take a `file_path` arg the model
|
||||||
|
// can't know without being told; in hybrid mode the visual
|
||||||
|
// description belongs here for the same reason.
|
||||||
|
let persona = resolve_bootstrap_system_prompt(req.system_prompt.as_deref());
|
||||||
|
let system_content = build_bootstrap_system_message(&persona, &normalized, &visual_block);
|
||||||
let system_msg = ChatMessage::system(system_content);
|
let system_msg = ChatMessage::system(system_content);
|
||||||
let mut user_msg = ChatMessage::user(user_content);
|
let mut user_msg = ChatMessage::user(req.user_message.clone());
|
||||||
if !is_hybrid && let Some(ref img) = image_base64 {
|
if !is_hybrid && let Some(ref img) = image_base64 {
|
||||||
user_msg.images = Some(vec![img.clone()]);
|
user_msg.images = Some(vec![img.clone()]);
|
||||||
}
|
}
|
||||||
@@ -1241,6 +1237,13 @@ impl InsightChatService {
|
|||||||
// No-tools fallback: loop exhausted iterations without producing a
|
// No-tools fallback: loop exhausted iterations without producing a
|
||||||
// tool-free reply. Ask once more, with no tools attached.
|
// tool-free reply. Ask once more, with no tools attached.
|
||||||
if final_content.is_empty() {
|
if final_content.is_empty() {
|
||||||
|
// Index of the synthetic "please write your final answer"
|
||||||
|
// user message — we strip it from history after the model
|
||||||
|
// responds, so it never appears in the rendered transcript
|
||||||
|
// and load_history's user-turn handler doesn't reset
|
||||||
|
// pending_tools at this position (wiping the prior tool
|
||||||
|
// calls from the final assistant render).
|
||||||
|
let synthetic_idx = messages.len();
|
||||||
messages.push(ChatMessage::user(
|
messages.push(ChatMessage::user(
|
||||||
"Please write your final answer now without calling any more tools.",
|
"Please write your final answer now without calling any more tools.",
|
||||||
));
|
));
|
||||||
@@ -1270,6 +1273,10 @@ impl InsightChatService {
|
|||||||
final_message.ok_or_else(|| anyhow!("final stream ended without a Done event"))?;
|
final_message.ok_or_else(|| anyhow!("final stream ended without a Done event"))?;
|
||||||
final_content = final_response.content.clone();
|
final_content = final_response.content.clone();
|
||||||
messages.push(final_response);
|
messages.push(final_response);
|
||||||
|
// Drop the synthetic prompt — internal scaffolding only. The
|
||||||
|
// model's final_response (now at the end) was generated with
|
||||||
|
// it in context and reads coherently without it on replay.
|
||||||
|
messages.remove(synthetic_idx);
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(AgenticLoopOutcome {
|
Ok(AgenticLoopOutcome {
|
||||||
@@ -1301,6 +1308,27 @@ fn resolve_bootstrap_system_prompt(supplied: Option<&str>) -> String {
|
|||||||
.unwrap_or_else(|| BOOTSTRAP_DEFAULT_SYSTEM_PROMPT.to_string())
|
.unwrap_or_else(|| BOOTSTRAP_DEFAULT_SYSTEM_PROMPT.to_string())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Build the system message used to bootstrap an insight chat: the
/// persona text first, then a `--- PHOTO CONTEXT ---` block carrying
/// the photo's file path and, when supplied, the pre-computed visual
/// description. Keeping this context in the system message — rather
/// than the user turn — means the rendered transcript's user bubble
/// contains only what the user actually typed.
fn build_bootstrap_system_message(
    persona: &str,
    normalized_path: &str,
    visual_block: &str,
) -> String {
    // Collapse any trailing whitespace on the persona so the divider
    // always sits exactly one blank line below it.
    let mut composed = format!(
        "{}\n\n--- PHOTO CONTEXT ---\nPhoto file path: {}\n",
        persona.trim_end(),
        normalized_path
    );
    // The visual block, when present, already ends with its own
    // newline — append it verbatim, no extra separator needed.
    if !visual_block.is_empty() {
        composed.push_str(visual_block);
    }
    composed
}
|
||||||
|
|
||||||
/// Pick the backend label for bootstrap. Bootstrap has no stored insight
|
/// Pick the backend label for bootstrap. Bootstrap has no stored insight
|
||||||
/// to defer to (that's continuation's behaviour), so the default is
|
/// to defer to (that's continuation's behaviour), so the default is
|
||||||
/// `"local"`. Returns an error if the supplied label is non-empty but
|
/// `"local"`. Returns an error if the supplied label is non-empty but
|
||||||
@@ -1929,4 +1957,35 @@ mod tests {
|
|||||||
assert!(msg.contains("unknown backend"));
|
assert!(msg.contains("unknown backend"));
|
||||||
assert!(msg.contains("openrouter"));
|
assert!(msg.contains("openrouter"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn bootstrap_system_message_includes_path_and_persona() {
    let composed = build_bootstrap_system_message("you are helpful", "pics/IMG.jpg", "");
    // Persona leads; the photo-context block follows it.
    assert!(composed.starts_with("you are helpful"));
    assert!(composed.contains("--- PHOTO CONTEXT ---"));
    assert!(composed.contains("Photo file path: pics/IMG.jpg"));
    // No visual block — should not introduce a stray "Visual" line.
    assert!(!composed.contains("Visual description"));
}
|
||||||
|
|
||||||
|
#[test]
fn bootstrap_system_message_includes_visual_block_when_supplied() {
    let visual = "Visual description (from local vision model):\nA dog in a park.\n";
    let composed = build_bootstrap_system_message("voice", "p.jpg", visual);
    assert!(composed.contains("Photo file path: p.jpg"));
    assert!(composed.contains("A dog in a park"));
    // Path appears before visual.
    let path_pos = composed.find("Photo file path:").unwrap();
    let visual_pos = composed.find("A dog in a park").unwrap();
    assert!(path_pos < visual_pos);
}
|
||||||
|
|
||||||
|
#[test]
fn bootstrap_system_message_trims_persona_trailing_whitespace() {
    // Two consecutive newlines before the photo-context divider —
    // any trailing whitespace from the persona must be collapsed
    // so we don't end up with `\n\n\n\n--- PHOTO CONTEXT ---`.
    let composed = build_bootstrap_system_message("voice \n\n\n", "p.jpg", "");
    assert!(composed.contains("voice\n\n--- PHOTO CONTEXT ---"));
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user