ai: restructure agentic user message — facts up top + forcing gate

Small models (~8B) were producing generic responses regardless of persona, and bailing out of the agentic loop on iteration 1. Two underlying causes:

1. Photo facts (date, location, contact, tags, visual) were buried between the "Please analyze this photo" preamble and the "Use the available tools" outro. Small models skim and miss them, which is why outputs weren't anchoring to the actual photo.
2. The user message ended with "write a detailed insight" — small models took the path of least resistance and just wrote, ignoring the soft "aim to use 5 tools" floor in the system prompt.

Restructured the user message:

- Leads with a "## This photo" bulleted block so the metadata is visible top-down. File path, date+source, contact, location+GPS, tags, and (in hybrid) the visual description are all bullets the model can't skim past.
- Replaces the prose body with a numbered "## What to do" recipe: (1) recall_facts_for_photo + recall_entities, (2) ≥3 of the time-window tools, (3) write only after tool results, referencing specific facts. "Generic narration is not acceptable" is explicit.
- Ends with a hard forcing line: "YOUR FIRST RESPONSE MUST BE A TOOL CALL. Do not output any final answer text until you have called at least 5 tools." Replaces the soft "aim to" floor with a directive small models actually follow.

Tradeoff: big models also follow the recipe literally and may call 5 tools when 3 would do. Optimizing for the small-model floor first; soften once that's working.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-07 10:59:39 -04:00
parent 66ea8490ab
commit 2ff06413c6
1 changed file with 62 additions and 43 deletions
--- a/src/ai/insight_generator.rs
+++ b/src/ai/insight_generator.rs
@@ -3732,13 +3732,28 @@ Return ONLY the summary, nothing else."#,
        };

        // 9. Build user message
-        // Compose a single Location: line with both the resolved name and the
-        // raw coordinates. Falls back to bare GPS when the geocoders failed,
-        // and to "Location: unknown" when there are no coordinates at all.
-        let location_info = match (resolved_location.as_deref(), exif.as_ref()) {
+        // The user message is restructured to lead with photo facts as a
+        // bulleted "## This photo" block (so small models can't skim past
+        // them), followed by an imperative "## What to do" recipe and a
+        // forcing line. Small models bail out of the agentic loop when the
+        // user message ends with "write a detailed insight" — they just
+        // write. The forcing line replaces the soft "aim to use 5 tools"
+        // floor with a hard "do not output text yet" gate.
+
+        // Date with weekday + canonical-date source so the model can hedge
+        // on filename- or fs_time-derived dates.
+        let date_bullet = format!(
+            "- Date: {} (source: {})",
+            date_taken.format("%A, %B %d, %Y"),
+            date_taken_source
+        );
+
+        // Location: resolved name + raw GPS when both exist; bare GPS when the
+        // geocoder gave no name; "unknown" when coordinates are missing.
+        let location_bullet = match (resolved_location.as_deref(), exif.as_ref()) {
            (Some(name), Some(e)) if e.gps_latitude.is_some() && e.gps_longitude.is_some() => {
                format!(
-                    "Location: {} (GPS {:.4}, {:.4})",
+                    "- Location: {} (GPS {:.4}, {:.4})",
                    name,
                    e.gps_latitude.unwrap(),
                    e.gps_longitude.unwrap()
@@ -3746,61 +3761,65 @@ Return ONLY the summary, nothing else."#,
            }
            (None, Some(e)) if e.gps_latitude.is_some() && e.gps_longitude.is_some() => {
                format!(
-                    "Location: GPS {:.4}, {:.4} (geocoder unavailable)",
+                    "- Location: GPS {:.4}, {:.4} (geocoder unavailable)",
                    e.gps_latitude.unwrap(),
                    e.gps_longitude.unwrap()
                )
            }
-            _ => "Location: unknown".to_string(),
+            _ => "- Location: unknown".to_string(),
        };

-        let tags_info = if tag_names.is_empty() {
-            "Tags: none".to_string()
-        } else {
-            format!("Tags: {}", tag_names.join(", "))
-        };
-
-        let contact_info = contact
+        let contact_bullet = contact
            .as_ref()
-            .map(|c| format!("Contact/Person: {}", c))
-            .unwrap_or_else(|| "Contact/Person: unknown".to_string());
+            .map(|c| format!("- Contact/Person: {}", c))
+            .unwrap_or_else(|| "- Contact/Person: unknown".to_string());

-        // Hybrid mode: the chat model never receives the image bytes, so we
-        // inline the visual description as text and explicitly tell the model
-        // not to call describe_photo (the tool is gated off in hybrid anyway).
-        let visual_block = hybrid_visual_description
+        let tags_bullet = if tag_names.is_empty() {
+            "- Tags: none".to_string()
+        } else {
+            format!("- Tags: {}", tag_names.join(", "))
+        };
+
+        let path_bullet = format!("- File path: {}", file_path);
+
+        // Hybrid: visual description is inlined as a bullet (no image bytes
+        // reach the chat model). Local: the image is attached to this
+        // message, no inline description bullet — describe_photo is the tool.
+        let visual_bullet = hybrid_visual_description
            .as_deref()
            .map(|d| {
                format!(
-                    "Visual description (already generated for you — do not call describe_photo):\n{}\n\n",
-                    d
+                    "- Visual description (already generated — do not call describe_photo):\n  {}",
+                    d.lines().collect::<Vec<_>>().join("\n  ")
                )
            })
            .unwrap_or_default();

-        // Format date with weekday + the canonical-date source so the model
-        // can temper claims when the date is filename- or fs_time-derived.
-        let date_line = format!(
-            "Date taken: {} (source: {})",
-            date_taken.format("%A, %B %d, %Y"),
-            date_taken_source
-        );
+        // Compose the photo block (omit empty visual bullet to avoid stray newline).
+        let photo_block = if visual_bullet.is_empty() {
+            format!(
+                "## This photo\n\n{}\n{}\n{}\n{}\n{}",
+                path_bullet, date_bullet, contact_bullet, location_bullet, tags_bullet
+            )
+        } else {
+            format!(
+                "## This photo\n\n{}\n{}\n{}\n{}\n{}\n{}",
+                path_bullet,
+                date_bullet,
+                contact_bullet,
+                location_bullet,
+                tags_bullet,
+                visual_bullet
+            )
+        };

        let user_content = format!(
-            "{visual_block}Please analyze this photo and gather any relevant context from the surrounding weeks.\n\n\
-             Photo file path: {}\n\
-             {}\n\
-             {}\n\
-             {}\n\
-             {}\n\n\
-             Use the available tools to gather more context about this moment (messages, calendar events, location history, etc.), \
-             then write a detailed insight with a title and summary.",
-            file_path,
-            date_line,
-            contact_info,
-            location_info,
-            tags_info,
-            visual_block = visual_block,
+            "{photo_block}\n\n\
+             ## What to do\n\n\
+             1. First, call recall_facts_for_photo and recall_entities to load any prior knowledge about subjects in this photo.\n\
+             2. Then call at least 3 of: search_rag, get_sms_messages (try once with the contact filter and once without), get_calendar_events, get_location_history — pick the ones most relevant to this photo's date and context.\n\
+             3. Only after you have tool results, write the final insight with a title and a detailed summary that references specific facts from the metadata above and from your tool results. Generic narration is not acceptable.\n\n\
+             YOUR FIRST RESPONSE MUST BE A TOOL CALL. Do not output any final answer text until you have called at least 5 tools."
        );

        // 10. Define tools. Hybrid mode omits `describe_photo` since the