insight-chat: photo context belongs in system msg, not user turn

After refresh, the rendered transcript was showing two unwanted
artifacts in the initial user bubble:

  Photo file path: pics/DSC_5171.jpg
  please tell me about this photo and what was going on around it

  Please write your final answer now without calling any more tools.

Two distinct bugs:

1. Bootstrap was prepending `Photo file path: <path>` (and, in
   hybrid mode, the visual description block) to the user-turn
   content. The model needed it to call file_path-keyed tools, but
   the user saw it in their own bubble on replay.

2. The no-tools fallback ("Please write your final answer now…")
   was a synthetic user message we never stripped from history.
   It persisted into training_messages and rendered as a second
   user bubble, and it also wiped the prior tool-call accumulator
   inside load_history (the user-turn handler clears pending_tools),
   which is why the tool invocations disappeared from the assistant
   bubble after refresh. A sketch of that failure mode follows.
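
For context: the replay path accumulates tool calls across assistant
turns and attaches them to the final tool-free reply, and any user
turn in between resets the accumulator. A minimal sketch of that
shape (all type, field, and function names here are illustrative
assumptions, not the actual load_history code):

  // Illustrative sketch only: approximates how a replay loop like
  // load_history loses tool calls when a synthetic user turn sits
  // between them and the final assistant reply. Names are assumed.
  #[derive(Clone, Copy)]
  enum Role { User, Assistant, Tool }

  struct ChatMessage {
      role: Role,
      content: String,
      tool_calls: Vec<String>, // tools this assistant turn invoked
  }

  struct Bubble {
      speaker: &'static str,
      text: String,
      tool_calls: Vec<String>, // rendered inside the assistant bubble
  }

  fn replay(messages: &[ChatMessage]) -> Vec<Bubble> {
      let mut bubbles = Vec::new();
      // Accumulates tool calls until a tool-free assistant reply lands.
      let mut pending_tools: Vec<String> = Vec::new();
      for msg in messages {
          match msg.role {
              Role::Assistant if !msg.tool_calls.is_empty() => {
                  // Intermediate turn: remember its tool calls.
                  pending_tools.extend(msg.tool_calls.iter().cloned());
              }
              Role::Assistant => {
                  // Final reply: attach everything accumulated so far.
                  bubbles.push(Bubble {
                      speaker: "assistant",
                      text: msg.content.clone(),
                      tool_calls: std::mem::take(&mut pending_tools),
                  });
              }
              Role::User => {
                  // A user turn starts a new exchange, so the
                  // accumulator resets. The synthetic "final answer"
                  // prompt hit this arm and wiped the tool calls
                  // before the final assistant bubble rendered.
                  pending_tools.clear();
                  bubbles.push(Bubble {
                      speaker: "user",
                      text: msg.content.clone(),
                      tool_calls: Vec::new(),
                  });
              }
              Role::Tool => {} // tool results aren't rendered directly
          }
      }
      bubbles
  }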

Fixes:

- New `build_bootstrap_system_message` helper composes the persona
  with a `--- PHOTO CONTEXT ---` block (path + optional visual
  description). It lives in the system message, not the user turn,
  so the user's bubble shows only what they typed; the composed
  shape is sketched after this list.
- The streaming agentic loop's no-tools fallback now records its
  insertion index and removes the synthetic user prompt from
  `messages` after the model responds. The final assistant content
  stays and reads coherently on replay without the synthetic
  prompt above it. Applies to both bootstrap and continuation.
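
For reference, the composed system message has this shape (the
persona line and description text are placeholders; the visual
block is present only in hybrid mode):

  <persona text>

  --- PHOTO CONTEXT ---
  Photo file path: pics/DSC_5171.jpg
  Visual description (from local vision model):
  <local vision model output>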

Three new tests cover the system-message composer (path-only,
with-visual-block, persona-trim). Total insight_chat unit tests: 27.
commit a0ec1a5080
parent 24ecf2abd4
Author: Cameron Cordes
Date:   2026-05-08 11:07:03 -04:00


@@ -934,25 +934,16 @@ impl InsightChatService {
         // discusses metadata-only is still useful.
         let image_base64: Option<String> = self.generator.load_image_as_base64(&normalized).ok();
-        // Photo path block. Several agentic tools (recall_facts_for_photo,
-        // get_file_tags, get_faces_in_photo, etc.) take a `file_path` arg
-        // that the model has no way to know unless we put it in the user
-        // turn. Without this block small models invent placeholders like
-        // "input_file_0.png" or refuse and ask the user. Mirrors the
-        // user_content layout `generate_agentic_insight_for_photo` uses.
-        let path_block = format!("Photo file path: {}\n\n", normalized);
         // Hybrid backend: pre-describe the image via local Ollama vision
-        // and inline the description into the user turn. OpenRouter chat
-        // models don't see images directly. Mirrors the same pre-describe
+        // so OpenRouter chat models (which can't see images directly) get
+        // the visual description as text. Mirrors the same pre-describe
         // pass that `generate_agentic_insight_for_photo` does for hybrid.
         let visual_block = if is_hybrid {
             match image_base64.as_deref() {
                 Some(b64) => match self.ollama.describe_image(b64).await {
-                    Ok(desc) => format!(
-                        "Visual description (from local vision model):\n{}\n\n",
-                        desc
-                    ),
+                    Ok(desc) => {
+                        format!("Visual description (from local vision model):\n{}\n", desc)
+                    }
                     Err(e) => {
                         log::warn!("hybrid bootstrap: local describe_image failed: {}", e);
                         String::new()
@@ -964,8 +955,6 @@ impl InsightChatService {
             String::new()
         };
-        let user_content = format!("{}{}{}", path_block, visual_block, req.user_message);
         // Tool gates. Local + image present → expose describe_photo so
         // the chat model can re-look at the photo on demand. Hybrid:
         // already inlined, no tool needed.
@@ -973,10 +962,17 @@ impl InsightChatService {
         let gate_opts = self.generator.current_gate_opts(offer_describe_tool);
         let tools = InsightGenerator::build_tool_definitions(gate_opts);
-        // Build initial messages: persona system + user-turn-with-image.
-        let system_content = resolve_bootstrap_system_prompt(req.system_prompt.as_deref());
+        // System message = persona + photo context block. Photo context
+        // is in the system message — not the user turn — so the user's
+        // bubble in the rendered transcript shows only what they typed.
+        // Several agentic tools (recall_facts_for_photo, get_file_tags,
+        // get_faces_in_photo, etc.) take a `file_path` arg the model
+        // can't know without being told; in hybrid mode the visual
+        // description belongs here for the same reason.
+        let persona = resolve_bootstrap_system_prompt(req.system_prompt.as_deref());
+        let system_content = build_bootstrap_system_message(&persona, &normalized, &visual_block);
         let system_msg = ChatMessage::system(system_content);
-        let mut user_msg = ChatMessage::user(user_content);
+        let mut user_msg = ChatMessage::user(req.user_message.clone());
         if !is_hybrid && let Some(ref img) = image_base64 {
             user_msg.images = Some(vec![img.clone()]);
         }
@@ -1241,6 +1237,13 @@ impl InsightChatService {
         // No-tools fallback: loop exhausted iterations without producing a
         // tool-free reply. Ask once more, with no tools attached.
         if final_content.is_empty() {
+            // Index of the synthetic "please write your final answer"
+            // user message — we strip it from history after the model
+            // responds, so it never appears in the rendered transcript
+            // and load_history's user-turn handler doesn't reset
+            // pending_tools at this position (wiping the prior tool
+            // calls from the final assistant render).
+            let synthetic_idx = messages.len();
             messages.push(ChatMessage::user(
                 "Please write your final answer now without calling any more tools.",
             ));
@@ -1270,6 +1273,10 @@ impl InsightChatService {
                 final_message.ok_or_else(|| anyhow!("final stream ended without a Done event"))?;
             final_content = final_response.content.clone();
             messages.push(final_response);
+            // Drop the synthetic prompt — internal scaffolding only. The
+            // model's final_response (now at the end) was generated with
+            // it in context and reads coherently without it on replay.
+            messages.remove(synthetic_idx);
         }
         Ok(AgenticLoopOutcome {
@@ -1301,6 +1308,27 @@ fn resolve_bootstrap_system_prompt(supplied: Option<&str>) -> String {
         .unwrap_or_else(|| BOOTSTRAP_DEFAULT_SYSTEM_PROMPT.to_string())
 }
+
+/// Compose the bootstrap system message: the persona on top, followed
+/// by a photo-context block carrying the file path (and, in hybrid
+/// mode, the local-vision visual description). Lives in the system
+/// message — not the user turn — so the rendered transcript shows
+/// only what the user typed.
+fn build_bootstrap_system_message(
+    persona: &str,
+    normalized_path: &str,
+    visual_block: &str,
+) -> String {
+    let mut out = persona.trim_end().to_string();
+    out.push_str("\n\n--- PHOTO CONTEXT ---\n");
+    out.push_str(&format!("Photo file path: {}\n", normalized_path));
+    if !visual_block.is_empty() {
+        // visual_block already ends with a newline; no extra separator
+        // needed.
+        out.push_str(visual_block);
+    }
+    out
+}
+
 /// Pick the backend label for bootstrap. Bootstrap has no stored insight
 /// to defer to (that's continuation's behaviour), so the default is
 /// `"local"`. Returns an error if the supplied label is non-empty but
@@ -1929,4 +1957,35 @@ mod tests {
         assert!(msg.contains("unknown backend"));
         assert!(msg.contains("openrouter"));
     }
+
+    #[test]
+    fn bootstrap_system_message_includes_path_and_persona() {
+        let out = build_bootstrap_system_message("you are helpful", "pics/IMG.jpg", "");
+        assert!(out.starts_with("you are helpful"));
+        assert!(out.contains("--- PHOTO CONTEXT ---"));
+        assert!(out.contains("Photo file path: pics/IMG.jpg"));
+        // No visual block — should not introduce a stray "Visual" line.
+        assert!(!out.contains("Visual description"));
+    }
+
+    #[test]
+    fn bootstrap_system_message_includes_visual_block_when_supplied() {
+        let visual = "Visual description (from local vision model):\nA dog in a park.\n";
+        let out = build_bootstrap_system_message("voice", "p.jpg", visual);
+        assert!(out.contains("Photo file path: p.jpg"));
+        assert!(out.contains("A dog in a park"));
+        // Path appears before visual.
+        let path_pos = out.find("Photo file path:").unwrap();
+        let visual_pos = out.find("A dog in a park").unwrap();
+        assert!(path_pos < visual_pos);
+    }
+
+    #[test]
+    fn bootstrap_system_message_trims_persona_trailing_whitespace() {
+        // Two consecutive newlines before the photo-context divider —
+        // any trailing whitespace from the persona must be collapsed
+        // so we don't end up with `\n\n\n\n--- PHOTO CONTEXT ---`.
+        let out = build_bootstrap_system_message("voice \n\n\n", "p.jpg", "");
+        assert!(out.contains("voice\n\n--- PHOTO CONTEXT ---"));
+    }
 }