//! Narration scripting for memory reels. //! //! One LLM call turns the planned beats (each carrying its date and, where //! available, its cached insight) into a short first-person narration line per //! beat plus a title for the reel. A beat may show several photos in a quick //! burst, so a line narrates the *moment*, not a single frame. We reuse the //! cached insight summary as the richest signal rather than re-running vision //! at reel time — that keeps reel generation off the GPU's vision slot. //! //! The prompt builder and response parser are pure so the contract is //! unit-testable; `generate_script` wires them to the LLM client. //! //! The agentic scripter (pre-generation) resolves the backend through the //! InsightGenerator, builds a read-only tool set, and runs a tool loop to //! ground the narration in retrieved context before asking for the final JSON. use anyhow::{Context, Result}; use std::sync::Arc; use super::{PlannedBeat, ReelMeta}; use crate::ai::backend::{BackendKind, SamplingOverrides}; use crate::ai::insight_generator::InsightGenerator; use crate::ai::llamacpp::LlamaCppClient; use crate::ai::llm_client::{LlmClient, Tool}; use crate::ai::ollama::ChatMessage; /// The narration for a whole reel: a title and one line per beat, in order. #[derive(Debug, Clone, PartialEq)] pub struct ReelScript { pub title: String, pub lines: Vec, } const SYSTEM_PROMPT: &str = "You are narrating a personal memory reel — a short \ slideshow of someone's own photos set to a spoken voiceover. Write warm, \ specific, first-person narration as if the person is gently looking back on \ their own memories. Each line plays over one moment, which may be a quick burst \ of several photos, so narrate the moment as a whole rather than a single frame. \ Be concrete and grounded in the details given; never invent names, places, or \ events that aren't supported. Keep each line to one or two short sentences that \ can be read aloud in a few seconds. Avoid generic filler like \"what a \ wonderful day\" — if you have little to go on, simply describe the moment \ plainly."; /// Agentic scripter system prompt: richer version that tells the model it may /// call read-only tools to ground each line. const AGENTIC_SYSTEM_PROMPT: &str = "You are narrating a personal memory reel — a short \ slideshow of someone's own photos set to a spoken voiceover. Write warm, \ specific, first-person narration as if the person is gently looking back on \ their own memories. Each line plays over one moment, which may be a quick burst \ of several photos, so narrate the moment as a whole rather than a single frame. \ Be concrete and grounded in the details given; never invent names, places, or \ events that aren't supported. Keep each line to one or two short sentences that \ can be read aloud in a few seconds. Avoid generic filler like \"what a \ wonderful day\" — if you have little to go on, simply describe the moment \ plainly.\n\nYou may call read-only tools (search_messages, get_file_tags, \ reverse_geocode, get_current_datetime, recall_entities, recall_facts_for_photo, \ recall_facts_for_entity) to ground each line in real context. Never invent \ details. Return ONLY the JSON object, no prose or code fences."; /// Maximum agentic tool iterations for pre-generation. Tunable via /// `REEL_PREGEN_MAX_TOOL_ITERS` (default 8). fn reel_pregen_max_tool_iters() -> usize { std::env::var("REEL_PREGEN_MAX_TOOL_ITERS") .ok() .and_then(|s| s.trim().parse::().ok()) .filter(|x| *x > 0) .unwrap_or(8) } /// Build the (system, user) prompt pair for the scripter. The user message /// describes each beat in order and asks for strict JSON back. pub fn build_script_messages(meta: &ReelMeta, beats: &[PlannedBeat]) -> (String, String) { let mut user = String::new(); user.push_str(&format!( "This reel has {} moments surfaced as memories {}.\n\n", beats.len(), meta.span_phrase() )); if !meta.years.is_empty() { let years: Vec = meta.years.iter().map(|y| y.to_string()).collect(); user.push_str(&format!("They span the years: {}.\n\n", years.join(", "))); } user.push_str("Moments, in the order they will appear:\n"); for (i, beat) in beats.iter().enumerate() { user.push_str(&format!("\n[{}]", i + 1)); if let Some(date) = beat.date_label() { user.push_str(&format!(" {date}")); } if beat.is_clip() { user.push_str(" (a video clip)"); } else if beat.media.len() > 1 { user.push_str(&format!(" (a burst of {} photos)", beat.media.len())); } user.push('\n'); match (&beat.insight_title, &beat.insight_summary) { (Some(t), Some(s)) if !s.trim().is_empty() => { user.push_str(&format!(" Known context: {t} — {s}\n")); } (Some(t), _) => user.push_str(&format!(" Known context: {t}\n")), (_, Some(s)) if !s.trim().is_empty() => { user.push_str(&format!(" Known context: {s}\n")); } _ => user.push_str(" (no extra context — narrate plainly from the date)\n"), } } user.push_str(&format!( "\nReturn ONLY a JSON object, no prose or code fences, shaped exactly:\n\ {{\"title\": \"\", \"segments\": [\"\", \ \"\", ... ]}}\n\ The \"segments\" array MUST have exactly {} items, one per moment in order.", beats.len() )); (SYSTEM_PROMPT.to_string(), user) } /// Build a richer (system, user) prompt pair for the agentic scripter. The /// system prompt tells the model it may call read-only tools to ground each /// line. The user message uses the same per-beat enumeration as /// `build_script_messages` plus a GPS line per beat when available. pub fn build_agentic_script_messages(meta: &ReelMeta, beats: &[PlannedBeat]) -> Vec { let mut user = String::new(); user.push_str(&format!( "This reel has {} moments surfaced as memories {}.\n\n", beats.len(), meta.span_phrase() )); if !meta.years.is_empty() { let years: Vec = meta.years.iter().map(|y| y.to_string()).collect(); user.push_str(&format!("They span the years: {}.\n\n", years.join(", "))); } user.push_str("Moments, in the order they will appear:\n"); for (i, beat) in beats.iter().enumerate() { user.push_str(&format!("\n[{}]", i + 1)); if let Some(date) = beat.date_label() { user.push_str(&format!(" {date}")); } if beat.is_clip() { user.push_str(" (a video clip)"); } else if beat.media.len() > 1 { user.push_str(&format!(" (a burst of {} photos)", beat.media.len())); } if let Some((lat, lon)) = beat.gps { user.push_str(&format!("\n GPS: {:.4}, {:.4}", lat, lon)); } user.push('\n'); match (&beat.insight_title, &beat.insight_summary) { (Some(t), Some(s)) if !s.trim().is_empty() => { user.push_str(&format!(" Known context: {t} — {s}\n")); } (Some(t), _) => user.push_str(&format!(" Known context: {t}\n")), (_, Some(s)) if !s.trim().is_empty() => { user.push_str(&format!(" Known context: {s}\n")); } _ => user.push_str(" (no extra context — narrate plainly from the date)\n"), } } user.push_str(&format!( "\nReturn ONLY a JSON object, no prose or code fences, shaped exactly:\n\ {{\"title\": \"\", \"segments\": [\"\", \ \"\", ... ]}}\n\ The \"segments\" array MUST have exactly {} items, one per moment in order.", beats.len() )); vec![ ChatMessage::system(AGENTIC_SYSTEM_PROMPT.to_string()), ChatMessage::user(user), ] } /// Parse the model's response into a script with exactly `n` lines. Tolerant of /// code fences and surrounding prose, and of both `segments: [".."]` and /// `segments: [{"narration": ".."}]` shapes. Missing/extra lines are padded or /// truncated so the caller always gets `n` aligned to the segments. pub fn parse_script_response(raw: &str, n: usize) -> ReelScript { let fallback_line = "A moment worth remembering."; let value = extract_json_object(raw); let title = value .as_ref() .and_then(|v| v.get("title")) .and_then(|t| t.as_str()) .map(clean_text) .filter(|s| !s.is_empty()) .unwrap_or_else(|| "Memories".to_string()); let mut lines: Vec = value .as_ref() .and_then(|v| v.get("segments")) .and_then(|s| s.as_array()) .map(|arr| { arr.iter() .map(|item| { let text = item .as_str() .map(|s| s.to_string()) .or_else(|| { item.get("narration") .and_then(|n| n.as_str()) .map(|s| s.to_string()) }) .unwrap_or_default(); clean_text(&text) }) .collect() }) .unwrap_or_default(); // Align to exactly n: drop extras, pad shortfalls with a neutral line so // every photo still gets spoken audio. lines.truncate(n); while lines.len() < n { lines.push(fallback_line.to_string()); } for line in lines.iter_mut() { if line.is_empty() { *line = fallback_line.to_string(); } } ReelScript { title, lines } } /// Pull the first balanced top-level JSON object out of a possibly-noisy model /// response (code fences, leading prose). Returns None if nothing parses. fn extract_json_object(raw: &str) -> Option { // Fast path: the whole thing is valid JSON. if let Ok(v) = serde_json::from_str::(raw.trim()) { return Some(v); } // Otherwise scan for the first '{' ... matching '}' span, ignoring braces // inside strings. let bytes = raw.as_bytes(); let start = raw.find('{')?; let mut depth = 0i32; let mut in_str = false; let mut escaped = false; for i in start..bytes.len() { let c = bytes[i] as char; if in_str { if escaped { escaped = false; } else if c == '\\' { escaped = true; } else if c == '"' { in_str = false; } continue; } match c { '"' => in_str = true, '{' => depth += 1, '}' => { depth -= 1; if depth == 0 { return serde_json::from_str(&raw[start..=i]).ok(); } } _ => {} } } None } /// Collapse whitespace and strip stray markdown/quote decorations a model /// sometimes leaves around a line. fn clean_text(s: &str) -> String { let trimmed = s.trim().trim_matches('"').trim(); trimmed.split_whitespace().collect::>().join(" ") } /// Generate the reel script via the LLM. Text-only (no images) — the per-beat /// context comes from cached insights. The call takes the GPU read lease /// internally (see `LlamaCppClient::generate`). pub async fn generate_script( client: &Arc, meta: &ReelMeta, beats: &[PlannedBeat], ) -> Result { let (system, user) = build_script_messages(meta, beats); let raw = client .generate(&user, Some(&system), None) .await .context("LLM script generation failed")?; Ok(parse_script_response(&raw, beats.len())) } /// Agentic version of script generation: resolves the backend via the /// InsightGenerator (honouring LLM_BACKEND, model overrides, etc.), builds /// a read-only tool set, runs the tool loop, then parses the JSON response. /// Returns the same ReelScript shape. On failure the caller may fall back to /// `generate_script`. pub async fn generate_script_agentic( generator: &InsightGenerator, meta: &ReelMeta, beats: &[PlannedBeat], ) -> Result { // 1. Resolve the backend. Bail if the local model lacks tool-calling. let backend = generator .resolve_backend( BackendKind::Local, &SamplingOverrides { model: None, num_ctx: None, temperature: None, top_p: None, top_k: None, min_p: None, }, ) .await .context("resolving backend for agentic script")?; // 2. Build the read-only tool set. Start from the persona gate (no // persona context, so corrections are closed), force has_vision=false, // then filter out write tools. let gate = generator.current_gate_opts_for_persona(false, None); let all_tools = InsightGenerator::build_tool_definitions(gate); let read_only_names: std::collections::HashSet<&str> = [ "search_rag", "search_messages", "get_sms_messages", "get_calendar_events", "get_location_history", "get_file_tags", "get_faces_in_photo", "reverse_geocode", "get_personal_place_at", "recall_entities", "recall_facts_for_photo", "recall_facts_for_entity", "get_current_datetime", ] .into_iter() .collect(); let tools: Vec = all_tools .into_iter() .filter(|t| read_only_names.contains(t.function.name.as_str())) .collect(); // 3. Build the agentic prompt messages. let messages = build_agentic_script_messages(meta, beats); // 4. Run the tool loop. let max_iter = reel_pregen_max_tool_iters(); let raw = generator .run_readonly_tool_loop(&backend, messages, tools, max_iter) .await .context("agentic tool loop failed")?; // 5. Strip any think-blocks the model may have emitted, then parse. let raw = crate::ai::llm_client::strip_think_blocks(&raw); Ok(parse_script_response(&raw, beats.len())) } #[cfg(test)] mod tests { use super::*; use crate::memories::MemoriesSpan; fn meta() -> ReelMeta { ReelMeta { span: MemoriesSpan::Day, years: vec![2019, 2021], } } fn planned(n: usize) -> Vec { (0..n) .map(|i| PlannedBeat { media: vec![super::super::SegmentMedia::Photo { rel_path: format!("p{i}.jpg"), library_id: 1, }], date: Some(1_560_000_000 + i as i64 * 86_400), insight_title: None, insight_summary: None, gps: None, }) .collect() } #[test] fn prompt_states_exact_moment_count_and_span() { let (sys, user) = build_script_messages(&meta(), &planned(3)); assert!(sys.contains("memory reel")); assert!(user.contains("3 moments")); assert!(user.contains("on this day")); assert!(user.contains("exactly 3 items")); // Each moment gets an indexed entry. assert!(user.contains("[1]") && user.contains("[2]") && user.contains("[3]")); } #[test] fn prompt_notes_burst_photo_count() { let mut p = planned(1); p[0].media = vec![ super::super::SegmentMedia::Photo { rel_path: "a.jpg".into(), library_id: 1, }, super::super::SegmentMedia::Photo { rel_path: "b.jpg".into(), library_id: 1, }, super::super::SegmentMedia::Photo { rel_path: "c.jpg".into(), library_id: 1, }, ]; let (_sys, user) = build_script_messages(&meta(), &p); assert!(user.contains("a burst of 3 photos")); } #[test] fn prompt_marks_clip_beats() { let mut p = planned(1); p[0].media = vec![super::super::SegmentMedia::Clip { rel_path: "v.mp4".into(), library_id: 1, }]; let (_sys, user) = build_script_messages(&meta(), &p); assert!(user.contains("a video clip")); } #[test] fn prompt_includes_insight_context_when_present() { let mut p = planned(1); p[0].insight_title = Some("Lake house weekend".into()); p[0].insight_summary = Some("Swimming with the dogs.".into()); let (_sys, user) = build_script_messages(&meta(), &p); assert!(user.contains("Lake house weekend — Swimming with the dogs.")); } #[test] fn parse_plain_json_object() { let raw = r#"{"title":"Summer Days","segments":["First line.","Second line."]}"#; let script = parse_script_response(raw, 2); assert_eq!(script.title, "Summer Days"); assert_eq!(script.lines, vec!["First line.", "Second line."]); } #[test] fn parse_tolerates_code_fences_and_prose() { let raw = "Sure! Here's your reel:\n```json\n{\"title\": \"Trip\", \"segments\": [\"A.\", \"B.\"]}\n```\nEnjoy!"; let script = parse_script_response(raw, 2); assert_eq!(script.title, "Trip"); assert_eq!(script.lines, vec!["A.", "B."]); } #[test] fn parse_accepts_object_segment_shape() { let raw = r#"{"title":"T","segments":[{"narration":"One."},{"narration":"Two."}]}"#; let script = parse_script_response(raw, 2); assert_eq!(script.lines, vec!["One.", "Two."]); } #[test] fn parse_pads_short_and_truncates_long_to_n() { // Model returned 1 line but we have 3 segments → pad with neutral lines. let short = parse_script_response(r#"{"title":"T","segments":["Only one."]}"#, 3); assert_eq!(short.lines.len(), 3); assert_eq!(short.lines[0], "Only one."); assert!(!short.lines[1].is_empty()); // Model returned 3 but we have 2 → truncate. let long = parse_script_response(r#"{"title":"T","segments":["a","b","c"]}"#, 2); assert_eq!(long.lines, vec!["a", "b"]); } #[test] fn parse_falls_back_on_garbage() { let script = parse_script_response("the model said no", 2); assert_eq!(script.title, "Memories"); assert_eq!(script.lines.len(), 2); assert!(script.lines.iter().all(|l| !l.is_empty())); } #[test] fn parse_blank_line_replaced_with_fallback() { let script = parse_script_response(r#"{"title":"T","segments":[" ","Real."]}"#, 2); assert!(!script.lines[0].is_empty()); assert_eq!(script.lines[1], "Real."); } }