diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index f8a023f..c58dc4d 100644 --- a/src/ai/insight_chat.rs +++ b/src/ai/insight_chat.rs @@ -934,25 +934,16 @@ impl InsightChatService { // discusses metadata-only is still useful. let image_base64: Option<String> = self.generator.load_image_as_base64(&normalized).ok(); - // Photo path block. Several agentic tools (recall_facts_for_photo, - // get_file_tags, get_faces_in_photo, etc.) take a `file_path` arg - // that the model has no way to know unless we put it in the user - // turn. Without this block small models invent placeholders like - // "input_file_0.png" or refuse and ask the user. Mirrors the - // user_content layout `generate_agentic_insight_for_photo` uses. - let path_block = format!("Photo file path: {}\n\n", normalized); - // Hybrid backend: pre-describe the image via local Ollama vision - // and inline the description into the user turn. OpenRouter chat - // models don't see images directly. Mirrors the same pre-describe + // so OpenRouter chat models (which can't see images directly) get + // the visual description as text. Mirrors the same pre-describe // pass that `generate_agentic_insight_for_photo` does for hybrid. let visual_block = if is_hybrid { match image_base64.as_deref() { Some(b64) => match self.ollama.describe_image(b64).await { - Ok(desc) => format!( - "Visual description (from local vision model):\n{}\n\n", - desc - ), + Ok(desc) => { + format!("Visual description (from local vision model):\n{}\n", desc) + } Err(e) => { log::warn!("hybrid bootstrap: local describe_image failed: {}", e); String::new() @@ -964,8 +955,6 @@ impl InsightChatService { String::new() }; - let user_content = format!("{}{}{}", path_block, visual_block, req.user_message); - // Tool gates. Local + image present → expose describe_photo so // the chat model can re-look at the photo on demand. Hybrid: // already inlined, no tool needed. 
@@ -973,10 +962,17 @@ impl InsightChatService { let gate_opts = self.generator.current_gate_opts(offer_describe_tool); let tools = InsightGenerator::build_tool_definitions(gate_opts); - // Build initial messages: persona system + user-turn-with-image. - let system_content = resolve_bootstrap_system_prompt(req.system_prompt.as_deref()); + // System message = persona + photo context block. Photo context + // is in the system message — not the user turn — so the user's + // bubble in the rendered transcript shows only what they typed. + // Several agentic tools (recall_facts_for_photo, get_file_tags, + // get_faces_in_photo, etc.) take a `file_path` arg the model + // can't know without being told; in hybrid mode the visual + // description belongs here for the same reason. + let persona = resolve_bootstrap_system_prompt(req.system_prompt.as_deref()); + let system_content = build_bootstrap_system_message(&persona, &normalized, &visual_block); let system_msg = ChatMessage::system(system_content); - let mut user_msg = ChatMessage::user(user_content); + let mut user_msg = ChatMessage::user(req.user_message.clone()); if !is_hybrid && let Some(ref img) = image_base64 { user_msg.images = Some(vec![img.clone()]); } @@ -1241,6 +1237,13 @@ impl InsightChatService { // No-tools fallback: loop exhausted iterations without producing a // tool-free reply. Ask once more, with no tools attached. if final_content.is_empty() { + // Index of the synthetic "please write your final answer" + // user message — we strip it from history after the model + // responds, so it never appears in the rendered transcript + // and load_history's user-turn handler doesn't reset + // pending_tools at this position (wiping the prior tool + // calls from the final assistant render). 
+ let synthetic_idx = messages.len(); messages.push(ChatMessage::user( "Please write your final answer now without calling any more tools.", )); @@ -1270,6 +1273,10 @@ impl InsightChatService { final_message.ok_or_else(|| anyhow!("final stream ended without a Done event"))?; final_content = final_response.content.clone(); messages.push(final_response); + // Drop the synthetic prompt — internal scaffolding only. The + // model's final_response (now at the end) was generated with + // it in context and reads coherently without it on replay. + messages.remove(synthetic_idx); } Ok(AgenticLoopOutcome { @@ -1301,6 +1308,27 @@ fn resolve_bootstrap_system_prompt(supplied: Option<&str>) -> String { .unwrap_or_else(|| BOOTSTRAP_DEFAULT_SYSTEM_PROMPT.to_string()) } +/// Compose the bootstrap system message: the persona on top, followed +/// by a photo-context block carrying the file path (and, in hybrid +/// mode, the local-vision visual description). Lives in the system +/// message — not the user turn — so the rendered transcript shows +/// only what the user typed. +fn build_bootstrap_system_message( + persona: &str, + normalized_path: &str, + visual_block: &str, +) -> String { + let mut out = persona.trim_end().to_string(); + out.push_str("\n\n--- PHOTO CONTEXT ---\n"); + out.push_str(&format!("Photo file path: {}\n", normalized_path)); + if !visual_block.is_empty() { + // visual_block already ends with a newline; no extra separator + // needed. + out.push_str(visual_block); + } + out +} + /// Pick the backend label for bootstrap. Bootstrap has no stored insight /// to defer to (that's continuation's behaviour), so the default is /// `"local"`. 
Returns an error if the supplied label is non-empty but @@ -1929,4 +1957,35 @@ mod tests { assert!(msg.contains("unknown backend")); assert!(msg.contains("openrouter")); } + + #[test] + fn bootstrap_system_message_includes_path_and_persona() { + let out = build_bootstrap_system_message("you are helpful", "pics/IMG.jpg", ""); + assert!(out.starts_with("you are helpful")); + assert!(out.contains("--- PHOTO CONTEXT ---")); + assert!(out.contains("Photo file path: pics/IMG.jpg")); + // No visual block — should not introduce a stray "Visual" line. + assert!(!out.contains("Visual description")); + } + + #[test] + fn bootstrap_system_message_includes_visual_block_when_supplied() { + let visual = "Visual description (from local vision model):\nA dog in a park.\n"; + let out = build_bootstrap_system_message("voice", "p.jpg", visual); + assert!(out.contains("Photo file path: p.jpg")); + assert!(out.contains("A dog in a park")); + // Path appears before visual. + let path_pos = out.find("Photo file path:").unwrap(); + let visual_pos = out.find("A dog in a park").unwrap(); + assert!(path_pos < visual_pos); + } + + #[test] + fn bootstrap_system_message_trims_persona_trailing_whitespace() { + // Two consecutive newlines before the photo-context divider — + // any trailing whitespace from the persona must be collapsed + // so we don't end up with `\n\n\n\n--- PHOTO CONTEXT ---`. + let out = build_bootstrap_system_message("voice \n\n\n", "p.jpg", ""); + assert!(out.contains("voice\n\n--- PHOTO CONTEXT ---")); + } }