From 24ecf2abd462868d302e02f4869b53eac6b43f79 Mon Sep 17 00:00:00 2001
From: Cameron Cordes
Date: Fri, 8 May 2026 10:59:35 -0400
Subject: [PATCH] insight-chat: prepend "Photo file path:" to bootstrap user turn
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Bug: the bootstrap user_content was just the user's typed message
(plus the hybrid visual description). Tools that take a file_path arg
(recall_facts_for_photo, get_file_tags, get_faces_in_photo) had no way
to learn the canonical path. Small models would invent placeholders
like "input_file_0.png" or call the tool with a name guessed from a
hidden multimodal input handle, neither of which matched any real
photo.

Fix: prepend a single-line "Photo file path: <path>\n\n" block to
user_content. This is the same shape generate_agentic_insight_for_photo
already uses for non-chat callers. The bootstrap stays minimal (no
date / GPS / tags pre-stuffing; the agentic loop can fetch those via
tools when needed). Hybrid mode still injects the visual description
block between the path block and the user message; local mode just
gets path + user text.
---
 src/ai/insight_chat.rs | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs
index b3126f4..f8a023f 100644
--- a/src/ai/insight_chat.rs
+++ b/src/ai/insight_chat.rs
@@ -934,12 +934,20 @@ impl InsightChatService {
         // discusses metadata-only is still useful.
         let image_base64: Option<String> = self.generator.load_image_as_base64(&normalized).ok();
 
+        // Photo path block. Several agentic tools (recall_facts_for_photo,
+        // get_file_tags, get_faces_in_photo, etc.) take a `file_path` arg
+        // that the model has no way to know unless we put it in the user
+        // turn. Without this block small models invent placeholders like
+        // "input_file_0.png" or refuse and ask the user. Mirrors the
+        // user_content layout `generate_agentic_insight_for_photo` uses.
+        let path_block = format!("Photo file path: {}\n\n", normalized);
+
         // Hybrid backend: pre-describe the image via local Ollama vision
         // and inline the description into the user turn. OpenRouter chat
         // models don't see images directly. Mirrors the same pre-describe
         // pass that `generate_agentic_insight_for_photo` does for hybrid.
-        let user_content = if is_hybrid {
-            let visual = match image_base64.as_deref() {
+        let visual_block = if is_hybrid {
+            match image_base64.as_deref() {
                 Some(b64) => match self.ollama.describe_image(b64).await {
                     Ok(desc) => format!(
                         "Visual description (from local vision model):\n{}\n\n",
                         desc
                     ),
                     Err(_) => {
                         String::new()
                     }
                 },
                 None => String::new(),
-            };
-            format!("{}{}", visual, req.user_message)
+            }
         } else {
-            req.user_message.clone()
+            String::new()
         };
 
+        let user_content = format!("{}{}{}", path_block, visual_block, req.user_message);
+
         // Tool gates. Local + image present → expose describe_photo so
         // the chat model can re-look at the photo on demand. Hybrid:
         // already inlined, no tool needed.
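
For a concrete sense of the resulting bootstrap turn, here is a minimal
standalone sketch of the assembly (the path and message literals are
made-up stand-ins; in insight_chat.rs the real values come from
`normalized`, the optional Ollama describe pass, and `req.user_message`):

    fn main() {
        // Stand-in for `normalized`, the canonical photo path.
        let path_block = format!("Photo file path: {}\n\n", "/photos/2026/beach.jpg");
        // Local mode leaves this empty; hybrid mode would carry the
        // "Visual description (from local vision model):\n...\n\n" text.
        let visual_block = String::new();
        // Stand-in for `req.user_message`, the user's typed text.
        let user_message = "who is in this photo?";

        // Same assembly order as the hunk above: path, optional visual, message.
        let user_content = format!("{}{}{}", path_block, visual_block, user_message);
        assert_eq!(
            user_content,
            "Photo file path: /photos/2026/beach.jpg\n\nwho is in this photo?"
        );
    }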