ImageApi/src/reels/script.rs

//! Narration scripting for memory reels.
//!
//! One LLM call turns the planned beats (each carrying its date and, where
//! available, its cached insight) into a short first-person narration line per
//! beat plus a title for the reel. A beat may show several photos in a quick
//! burst, so a line narrates the *moment*, not a single frame. We reuse the
//! cached insight summary as the richest signal rather than re-running vision
//! at reel time — that keeps reel generation off the GPU's vision slot.
//!
//! The prompt builder and response parser are pure so the contract is
//! unit-testable; `generate_script` wires them to the LLM client.
//!
//! The agentic scripter (pre-generation) resolves the backend through the
//! InsightGenerator, builds a read-only tool set, and runs a tool loop to
//! ground the narration in retrieved context before asking for the final JSON.

use anyhow::{Context, Result};
use std::sync::Arc;

use super::{PlannedBeat, ReelMeta};
use crate::ai::backend::{BackendKind, SamplingOverrides};
use crate::ai::insight_generator::InsightGenerator;
use crate::ai::llamacpp::LlamaCppClient;
use crate::ai::llm_client::{LlmClient, Tool};
use crate::ai::ollama::ChatMessage;

/// The narration for a whole reel: a title and one line per beat, in order.
///
/// `Eq` is derived alongside `PartialEq` because both fields are plain
/// strings, so equality is total and the type can be used wherever `Eq` is
/// required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReelScript {
    /// Title of the reel.
    pub title: String,
    /// Narration lines, one per beat, in playback order.
    pub lines: Vec<String>,
}

/// System prompt for the single-call scripter; `build_script_messages` returns
/// it as the system half of its prompt pair. It sets the first-person voice,
/// asks for one short line per moment, and forbids invented details. The text
/// is sent to the model verbatim, so any edit here changes model behaviour.
const SYSTEM_PROMPT: &str = "You are narrating a personal memory reel — a short \
slideshow of someone's own photos set to a spoken voiceover. Write warm, \
specific, first-person narration as if the person is gently looking back on \
their own memories. Each line plays over one moment, which may be a quick burst \
of several photos, so narrate the moment as a whole rather than a single frame. \
Be concrete and grounded in the details given; never invent names, places, or \
events that aren't supported. Keep each line to one or two short sentences that \
can be read aloud in a few seconds. Avoid generic filler like \"what a \
wonderful day\" — if you have little to go on, simply describe the moment \
plainly.";

/// Agentic scripter system prompt: richer version that tells the model it may
/// call read-only tools to ground each line.
///
/// The narration guidance repeats the text of `SYSTEM_PROMPT`; the trailing
/// paragraph names some read-only tools and restates the JSON-only output
/// rule.
///
/// NOTE(review): the tool names listed in this prompt are a subset of the
/// allow-list in `generate_script_agentic` — keep the two in sync if either
/// one changes.
const AGENTIC_SYSTEM_PROMPT: &str = "You are narrating a personal memory reel — a short \
slideshow of someone's own photos set to a spoken voiceover. Write warm, \
specific, first-person narration as if the person is gently looking back on \
their own memories. Each line plays over one moment, which may be a quick burst \
of several photos, so narrate the moment as a whole rather than a single frame. \
Be concrete and grounded in the details given; never invent names, places, or \
events that aren't supported. Keep each line to one or two short sentences that \
can be read aloud in a few seconds. Avoid generic filler like \"what a \
wonderful day\" — if you have little to go on, simply describe the moment \
plainly.\n\nYou may call read-only tools (search_messages, get_file_tags, \
reverse_geocode, get_current_datetime, recall_entities, recall_facts_for_photo, \
recall_facts_for_entity) to ground each line in real context. Never invent \
details. Return ONLY the JSON object, no prose or code fences.";

/// Upper bound on agentic tool-loop iterations during pre-generation.
///
/// Reads `REEL_PREGEN_MAX_TOOL_ITERS`; the value is trimmed and must parse as
/// a positive `usize`. An unset, unparsable, or zero value yields the default
/// of 8.
fn reel_pregen_max_tool_iters() -> usize {
    const DEFAULT_ITERS: usize = 8;

    match std::env::var("REEL_PREGEN_MAX_TOOL_ITERS") {
        Ok(configured) => match configured.trim().parse::<usize>() {
            Ok(iters) if iters > 0 => iters,
            _ => DEFAULT_ITERS,
        },
        Err(_) => DEFAULT_ITERS,
    }
}

/// Build the (system, user) prompt pair for the single-call scripter.
///
/// The system half is `SYSTEM_PROMPT`. The user message describes each beat
/// in order and asks for strict JSON back, with exactly one segment per beat.
pub fn build_script_messages(meta: &ReelMeta, beats: &[PlannedBeat]) -> (String, String) {
    (
        SYSTEM_PROMPT.to_string(),
        build_moments_prompt(meta, beats, false),
    )
}

/// Build the chat messages for the agentic scripter.
///
/// Returns a system message carrying `AGENTIC_SYSTEM_PROMPT` (which tells the
/// model it may call read-only tools) followed by a user message. The user
/// message uses the same per-beat enumeration as `build_script_messages`,
/// plus a GPS line for each beat that has coordinates.
pub fn build_agentic_script_messages(meta: &ReelMeta, beats: &[PlannedBeat]) -> Vec<ChatMessage> {
    vec![
        ChatMessage::system(AGENTIC_SYSTEM_PROMPT.to_string()),
        ChatMessage::user(build_moments_prompt(meta, beats, true)),
    ]
}

/// Render the user message shared by both scripters.
///
/// Layout: a header with the moment count and span, the years covered (when
/// any), one indexed entry per beat, and the JSON output contract. When
/// `include_gps` is true, beats with coordinates get an extra `GPS:` line.
///
/// Insight title and summary are trimmed, and a blank value is treated as
/// absent, so a beat never produces an empty "Known context:" line.
fn build_moments_prompt(meta: &ReelMeta, beats: &[PlannedBeat], include_gps: bool) -> String {
    let mut user = format!(
        "This reel has {} moments surfaced as memories {}.\n\n",
        beats.len(),
        meta.span_phrase()
    );
    if !meta.years.is_empty() {
        let years: Vec<String> = meta.years.iter().map(|y| y.to_string()).collect();
        user.push_str(&format!("They span the years: {}.\n\n", years.join(", ")));
    }
    user.push_str("Moments, in the order they will appear:\n");

    for (i, beat) in beats.iter().enumerate() {
        // Header line: 1-based index, optional date, optional media hint.
        user.push_str(&format!("\n[{}]", i + 1));
        if let Some(date) = beat.date_label() {
            user.push_str(&format!(" {date}"));
        }
        if beat.is_clip() {
            user.push_str(" (a video clip)");
        } else if beat.media.len() > 1 {
            user.push_str(&format!(" (a burst of {} photos)", beat.media.len()));
        }
        if include_gps {
            if let Some((lat, lon)) = beat.gps {
                user.push_str(&format!("\n  GPS: {:.4}, {:.4}", lat, lon));
            }
        }
        user.push('\n');

        // Normalise both insight fields the same way: whitespace-only counts
        // as missing.
        let title = beat
            .insight_title
            .as_deref()
            .map(str::trim)
            .filter(|t| !t.is_empty());
        let summary = beat
            .insight_summary
            .as_deref()
            .map(str::trim)
            .filter(|s| !s.is_empty());
        match (title, summary) {
            (Some(t), Some(s)) => {
                user.push_str(&format!("  Known context: {t} — {s}\n"));
            }
            (Some(only), None) | (None, Some(only)) => {
                user.push_str(&format!("  Known context: {only}\n"));
            }
            (None, None) => {
                user.push_str("  (no extra context — narrate plainly from the date)\n");
            }
        }
    }

    user.push_str(&format!(
        "\nReturn ONLY a JSON object, no prose or code fences, shaped exactly:\n\
         {{\"title\": \"<short reel title>\", \"segments\": [\"<line for moment 1>\", \
         \"<line for moment 2>\", ... ]}}\n\
         The \"segments\" array MUST have exactly {} items, one per moment in order.",
        beats.len()
    ));
    user
}

/// Turn the model's raw reply into a script with exactly `n` narration lines.
///
/// The reply may be wrapped in code fences or prose. Segments may be plain
/// strings (`segments: [".."]`) or objects with a `narration` field
/// (`segments: [{"narration": ".."}]`). Extra segments are dropped; missing,
/// blank, or unreadable ones become a neutral fallback line so every beat
/// still gets spoken audio. A missing or blank title becomes "Memories".
pub fn parse_script_response(raw: &str, n: usize) -> ReelScript {
    const FALLBACK_LINE: &str = "A moment worth remembering.";
    const FALLBACK_TITLE: &str = "Memories";

    // Substitute the fallback whenever cleaning leaves nothing behind.
    let non_empty_or = |text: String, fallback: &str| {
        if text.is_empty() {
            fallback.to_string()
        } else {
            text
        }
    };

    let parsed = extract_json_object(raw);
    let root = parsed.as_ref();

    let title = match root.and_then(|v| v.get("title")).and_then(|t| t.as_str()) {
        Some(text) => non_empty_or(clean_text(text), FALLBACK_TITLE),
        None => FALLBACK_TITLE.to_string(),
    };

    let mut lines: Vec<String> = Vec::new();
    if let Some(items) = root
        .and_then(|v| v.get("segments"))
        .and_then(|s| s.as_array())
    {
        // Only the first n segments can ever be used.
        for item in items.iter().take(n) {
            let text = item
                .as_str()
                .or_else(|| item.get("narration").and_then(|v| v.as_str()))
                .unwrap_or("");
            lines.push(non_empty_or(clean_text(text), FALLBACK_LINE));
        }
    }
    // Pad any shortfall so the result lines up one-to-one with the beats.
    lines.resize(n, FALLBACK_LINE.to_string());

    ReelScript { title, lines }
}

/// Pull the first parsable JSON object out of a possibly-noisy model response
/// (code fences, leading or trailing prose). Returns `None` if nothing parses.
///
/// If the whole trimmed response is valid JSON, that value is returned as-is.
/// Otherwise every `{` is tried in order as the start of a JSON value, and
/// the first one that parses wins. Trying later braces matters when the model
/// writes brace-delimited prose (e.g. "in the format {title, segments}")
/// before the real object; a single attempt at the first `{` would give up
/// there.
fn extract_json_object(raw: &str) -> Option<serde_json::Value> {
    // Fast path: the whole thing is valid JSON.
    if let Ok(v) = serde_json::from_str::<serde_json::Value>(raw.trim()) {
        return Some(v);
    }
    // The streaming deserializer reads one complete value from the front of
    // its input and ignores whatever follows, so it finds the end of the
    // object itself, including braces and escaped quotes inside strings.
    raw.match_indices('{').find_map(|(start, _)| {
        serde_json::Deserializer::from_str(&raw[start..])
            .into_iter::<serde_json::Value>()
            .next()
            .and_then(Result::ok)
    })
}

/// Normalise one line of model output: trim it, strip any run of `"`
/// characters from both ends, and collapse each internal run of whitespace
/// into a single space.
fn clean_text(s: &str) -> String {
    let unquoted = s.trim().trim_matches('"');

    let mut cleaned = String::with_capacity(unquoted.len());
    for word in unquoted.split_whitespace() {
        if !cleaned.is_empty() {
            cleaned.push(' ');
        }
        cleaned.push_str(word);
    }
    cleaned
}

/// Generate the reel script via the LLM. Text-only (no images) — the per-beat
/// context comes from cached insights. The call takes the GPU read lease
/// internally (see `LlamaCppClient::generate`).
pub async fn generate_script(
    client: &Arc<LlamaCppClient>,
    meta: &ReelMeta,
    beats: &[PlannedBeat],
) -> Result<ReelScript> {
    let (system, user) = build_script_messages(meta, beats);
    let raw = client
        .generate(&user, Some(&system), None)
        .await
        .context("LLM script generation failed")?;
    Ok(parse_script_response(&raw, beats.len()))
}

/// Agentic script generation: resolve a local backend through the
/// `InsightGenerator`, offer the model an allow-listed set of read-only
/// tools, run the tool loop, then parse the final JSON reply.
///
/// Returns the same `ReelScript` shape as `generate_script`, which the caller
/// may fall back to on failure.
///
/// # Errors
///
/// Returns an error if backend resolution or the tool loop fails. A malformed
/// final reply is not an error: the parser substitutes fallback text.
pub async fn generate_script_agentic(
    generator: &InsightGenerator,
    meta: &ReelMeta,
    beats: &[PlannedBeat],
) -> Result<ReelScript> {
    // Tools the scripter may call. Anything the generator defines that is not
    // named here is withheld from the model.
    const READ_ONLY_TOOLS: [&str; 13] = [
        "search_rag",
        "search_messages",
        "get_sms_messages",
        "get_calendar_events",
        "get_location_history",
        "get_file_tags",
        "get_faces_in_photo",
        "reverse_geocode",
        "get_personal_place_at",
        "recall_entities",
        "recall_facts_for_photo",
        "recall_facts_for_entity",
        "get_current_datetime",
    ];

    // Resolve the local backend with no sampling overrides. Any resolution
    // failure propagates to the caller.
    // NOTE(review): an earlier comment said this bails when the local model
    // lacks tool-calling; that check is not visible here — confirm it lives in
    // `resolve_backend`.
    let no_overrides = SamplingOverrides {
        model: None,
        num_ctx: None,
        temperature: None,
        top_p: None,
        top_k: None,
        min_p: None,
    };
    let backend = generator
        .resolve_backend(BackendKind::Local, &no_overrides)
        .await
        .context("resolving backend for agentic script")?;

    // Build the full tool set from the gate, then keep only allow-listed
    // tools.
    // NOTE(review): the `(false, None)` arguments presumably mean "no vision,
    // no persona context" — confirm against `current_gate_opts_for_persona`.
    let gate = generator.current_gate_opts_for_persona(false, None);
    let tools: Vec<Tool> = InsightGenerator::build_tool_definitions(gate)
        .into_iter()
        .filter(|tool| READ_ONLY_TOOLS.contains(&tool.function.name.as_str()))
        .collect();

    // Run the tool loop over the agentic prompt, bounded by the configured
    // iteration cap.
    let messages = build_agentic_script_messages(meta, beats);
    let max_iter = reel_pregen_max_tool_iters();
    let reply = generator
        .run_readonly_tool_loop(&backend, messages, tools, max_iter)
        .await
        .context("agentic tool loop failed")?;

    // Strip any think-blocks the model emitted before parsing.
    let reply = crate::ai::llm_client::strip_think_blocks(&reply);
    Ok(parse_script_response(&reply, beats.len()))
}

#[cfg(test)]
mod tests {
    use super::super::SegmentMedia;
    use super::*;
    use crate::memories::MemoriesSpan;

    /// Reel metadata shared by the prompt tests: a day span over two years.
    fn meta() -> ReelMeta {
        ReelMeta {
            span: MemoriesSpan::Day,
            years: vec![2019, 2021],
        }
    }

    /// A single photo in library 1.
    fn photo(rel_path: &str) -> SegmentMedia {
        SegmentMedia::Photo {
            rel_path: rel_path.to_string(),
            library_id: 1,
        }
    }

    /// `n` single-photo beats on consecutive days, with no insight or GPS.
    fn planned(n: usize) -> Vec<PlannedBeat> {
        let mut beats = Vec::with_capacity(n);
        for i in 0..n {
            beats.push(PlannedBeat {
                media: vec![photo(&format!("p{i}.jpg"))],
                date: Some(1_560_000_000 + i as i64 * 86_400),
                insight_title: None,
                insight_summary: None,
                gps: None,
            });
        }
        beats
    }

    #[test]
    fn prompt_states_exact_moment_count_and_span() {
        let beats = planned(3);
        let (system, user) = build_script_messages(&meta(), &beats);
        assert!(system.contains("memory reel"));
        assert!(user.contains("3 moments"));
        assert!(user.contains("on this day"));
        assert!(user.contains("exactly 3 items"));
        // Each moment gets an indexed entry.
        for marker in ["[1]", "[2]", "[3]"] {
            assert!(user.contains(marker));
        }
    }

    #[test]
    fn prompt_notes_burst_photo_count() {
        let mut beats = planned(1);
        beats[0].media = vec![photo("a.jpg"), photo("b.jpg"), photo("c.jpg")];
        let (_, user) = build_script_messages(&meta(), &beats);
        assert!(user.contains("a burst of 3 photos"));
    }

    #[test]
    fn prompt_marks_clip_beats() {
        let mut beats = planned(1);
        beats[0].media = vec![SegmentMedia::Clip {
            rel_path: "v.mp4".into(),
            library_id: 1,
        }];
        let (_, user) = build_script_messages(&meta(), &beats);
        assert!(user.contains("a video clip"));
    }

    #[test]
    fn prompt_includes_insight_context_when_present() {
        let mut beats = planned(1);
        beats[0].insight_title = Some("Lake house weekend".into());
        beats[0].insight_summary = Some("Swimming with the dogs.".into());
        let (_, user) = build_script_messages(&meta(), &beats);
        assert!(user.contains("Lake house weekend — Swimming with the dogs."));
    }

    #[test]
    fn parse_plain_json_object() {
        let script = parse_script_response(
            r#"{"title":"Summer Days","segments":["First line.","Second line."]}"#,
            2,
        );
        assert_eq!(script.title, "Summer Days");
        assert_eq!(script.lines, ["First line.", "Second line."]);
    }

    #[test]
    fn parse_tolerates_code_fences_and_prose() {
        let reply = "Sure! Here's your reel:\n```json\n{\"title\": \"Trip\", \"segments\": [\"A.\", \"B.\"]}\n```\nEnjoy!";
        let script = parse_script_response(reply, 2);
        assert_eq!(script.title, "Trip");
        assert_eq!(script.lines, ["A.", "B."]);
    }

    #[test]
    fn parse_accepts_object_segment_shape() {
        let script = parse_script_response(
            r#"{"title":"T","segments":[{"narration":"One."},{"narration":"Two."}]}"#,
            2,
        );
        assert_eq!(script.lines, ["One.", "Two."]);
    }

    #[test]
    fn parse_pads_short_and_truncates_long_to_n() {
        // One line returned for three beats: the rest are padded.
        let padded = parse_script_response(r#"{"title":"T","segments":["Only one."]}"#, 3);
        assert_eq!(padded.lines.len(), 3);
        assert_eq!(padded.lines[0], "Only one.");
        assert!(!padded.lines[1].is_empty());

        // Three lines returned for two beats: the extra one is dropped.
        let trimmed = parse_script_response(r#"{"title":"T","segments":["a","b","c"]}"#, 2);
        assert_eq!(trimmed.lines, ["a", "b"]);
    }

    #[test]
    fn parse_falls_back_on_garbage() {
        let script = parse_script_response("the model said no", 2);
        assert_eq!(script.title, "Memories");
        assert_eq!(script.lines.len(), 2);
        assert!(script.lines.iter().all(|line| !line.is_empty()));
    }

    #[test]
    fn parse_blank_line_replaced_with_fallback() {
        let script = parse_script_response(r#"{"title":"T","segments":["  ","Real."]}"#, 2);
        assert!(!script.lines[0].is_empty());
        assert_eq!(script.lines[1], "Real.");
    }
}