Add memory-reel backend: on-demand narrated photo slideshow

New POST /reels + GET /reels/{id} (+ /video) build an MP4 slideshow of a memory span (day/week/month), narrated by the LLM in a cloned voice. Pipeline (src/reels/): a selector resolves which photos + reel metadata, the scripter writes one narration line per photo via a single LLM call (reusing each photo's cached insight as context — no fresh vision calls, so reel generation stays off the GPU's vision slot), each line is synthesized to speech, and the renderer assembles stills + narration via ffmpeg. Jobs run in the background (mirroring the TTS speech-job registry) since a reel takes minutes; the finished MP4 is cached on disk keyed by the selection so a repeat request is instant. The segment model is media-typed (Photo today) so a video-clip segment (phase 2) and a nightly pre-render (phase 3) slot in without reworking the pipeline. Ken Burns motion is implemented but defaulted off pending a visual check on the GPU box. Supporting changes: - memories: extract gather_memory_items() so the reel selector reuses the exact window/exclusion/tz/sort logic behind /memories. - ai::tts: add synthesize_serialized() so reel narration honors the same single-GPU permit + write lease as user TTS requests. - video::ffmpeg: make get_duration_seconds() pub for narration timing. - AppState: reels_path (REELS_DIRECTORY, defaults beside preview clips). Pure logic (cache key, script parsing, ffmpeg arg/filter construction, even sampling, segment timing) is unit-tested (26 tests). The runtime path (ffmpeg render, TTS, LLM) needs a real run on the GPU host to verify end-to-end — not exercisable in CI. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-12 22:31:08 -04:00
parent 98274c3301
commit e3f731b3b2
9 changed files with 1615 additions and 30 deletions
@@ -0,0 +1,289 @@
+//! Narration scripting for memory reels.
+//!
+//! One LLM call turns the planned segments (each carrying its date and, where
+//! available, its cached insight) into a short first-person narration line per
+//! photo plus a title for the reel. We reuse the cached insight summary as the
+//! richest per-photo signal rather than re-running vision at reel time — that
+//! keeps reel generation off the GPU's vision slot entirely.
+//!
+//! The prompt builder and response parser are pure so the contract is
+//! unit-testable; `generate_script` wires them to the LLM client.
+
+use anyhow::{Context, Result};
+use std::sync::Arc;
+
+use super::{PlannedSegment, ReelMeta};
+use crate::ai::llamacpp::LlamaCppClient;
+use crate::ai::llm_client::LlmClient;
+
+/// The narration for a whole reel: a title and one line per segment, in order.
+///
+/// Produced by `parse_script_response`, which always sizes `lines` to the
+/// segment count it was asked for.
+#[derive(Debug, Clone, PartialEq)]
+pub struct ReelScript {
+    /// Short title for the reel. The parser substitutes "Memories" when the
+    /// model response carries no usable title.
+    pub title: String,
+    /// One narration line per segment, in segment order.
+    pub lines: Vec<String>,
+}
+
+/// System prompt for the scripter. Sets the narration voice (warm, specific,
+/// first-person), forbids inventing unsupported names/places/events, and caps
+/// each line at one or two short sentences.
+const SYSTEM_PROMPT: &str = "You are narrating a personal memory reel — a short \
+slideshow of someone's own photos set to a spoken voiceover. Write warm, \
+specific, first-person narration as if the person is gently looking back on \
+their own memories. Be concrete and grounded in the details given; never \
+invent names, places, or events that aren't supported. Keep each line to one \
+or two short sentences that can be read aloud in a few seconds. Avoid generic \
+filler like \"what a wonderful day\" — if you have little to go on, simply \
+describe the moment plainly.";
+
+/// Build the (system, user) prompt pair for the scripter.
+///
+/// The user message states the photo count and span, lists the years covered
+/// (when `meta.years` is non-empty), then describes each segment in display
+/// order — its 1-based index, its date label when available, and any insight
+/// context — and finally asks for strict JSON whose `segments` array has
+/// exactly `planned.len()` items.
+///
+/// Insight title and summary are trimmed, and a blank value is treated the
+/// same as a missing one, so a whitespace-only title can never produce a
+/// dangling "Known context:  — …" line in the prompt.
+///
+/// Returns `(system_prompt, user_prompt)`.
+pub fn build_script_messages(meta: &ReelMeta, planned: &[PlannedSegment]) -> (String, String) {
+    // Blank or whitespace-only insight text carries no signal; treat it as absent.
+    fn non_blank(text: Option<&str>) -> Option<&str> {
+        text.map(str::trim).filter(|t| !t.is_empty())
+    }
+
+    let mut user = String::new();
+    user.push_str(&format!(
+        "These are {} photos surfaced as memories {}.\n\n",
+        planned.len(),
+        meta.span_phrase()
+    ));
+    if !meta.years.is_empty() {
+        let years: Vec<String> = meta.years.iter().map(|y| y.to_string()).collect();
+        user.push_str(&format!("They span the years: {}.\n\n", years.join(", ")));
+    }
+    user.push_str("Photos, in the order they will appear:\n");
+    for (i, seg) in planned.iter().enumerate() {
+        // 1-based index so the model's "photo N" lines up with the array order.
+        user.push_str(&format!("\n[{}]", i + 1));
+        if let Some(date) = seg.date_label() {
+            user.push_str(&format!(" {date}"));
+        }
+        user.push('\n');
+
+        let title = non_blank(seg.insight_title.as_deref());
+        let summary = non_blank(seg.insight_summary.as_deref());
+        match (title, summary) {
+            (Some(t), Some(s)) => {
+                user.push_str(&format!("  Known context: {t} — {s}\n"));
+            }
+            (Some(only), None) | (None, Some(only)) => {
+                user.push_str(&format!("  Known context: {only}\n"));
+            }
+            (None, None) => {
+                user.push_str("  (no extra context — narrate plainly from the date)\n");
+            }
+        }
+    }
+    user.push_str(&format!(
+        "\nReturn ONLY a JSON object, no prose or code fences, shaped exactly:\n\
+         {{\"title\": \"<short reel title>\", \"segments\": [\"<line for photo 1>\", \
+         \"<line for photo 2>\", ... ]}}\n\
+         The \"segments\" array MUST have exactly {} items, one per photo in order.",
+        planned.len()
+    ));
+    (SYSTEM_PROMPT.to_string(), user)
+}
+
+/// Turn the model's raw response into a script with exactly `n` lines.
+///
+/// Accepts responses wrapped in code fences or prose, and both the
+/// `segments: [".."]` and `segments: [{"narration": ".."}]` shapes. Surplus
+/// lines are dropped; missing or blank lines are replaced with a neutral
+/// fallback so every segment still gets spoken audio. A missing or blank title
+/// becomes "Memories".
+pub fn parse_script_response(raw: &str, n: usize) -> ReelScript {
+    const FALLBACK_TITLE: &str = "Memories";
+    const FALLBACK_LINE: &str = "A moment worth remembering.";
+
+    let parsed = extract_json_object(raw);
+    let doc = parsed.as_ref();
+
+    let title = doc
+        .and_then(|d| d.get("title")?.as_str())
+        .map(clean_text)
+        .filter(|t| !t.is_empty())
+        .unwrap_or_else(|| FALLBACK_TITLE.to_string());
+
+    let mut lines: Vec<String> = Vec::new();
+    if let Some(items) = doc.and_then(|d| d.get("segments")?.as_array()) {
+        // Anything past the first `n` entries would be discarded anyway.
+        for item in items.iter().take(n) {
+            // A segment is either a bare string or an object with "narration".
+            let spoken = match item.as_str() {
+                Some(text) => text,
+                None => item
+                    .get("narration")
+                    .and_then(|v| v.as_str())
+                    .unwrap_or(""),
+            };
+            let cleaned = clean_text(spoken);
+            lines.push(if cleaned.is_empty() {
+                FALLBACK_LINE.to_string()
+            } else {
+                cleaned
+            });
+        }
+    }
+    // Pad any shortfall so the result is aligned one-to-one with the segments.
+    lines.resize(n, FALLBACK_LINE.to_string());
+
+    ReelScript { title, lines }
+}
+
+/// Pull the first parseable JSON object out of a possibly-noisy model response
+/// (code fences, leading or trailing prose).
+///
+/// The whole response is tried first and accepted only if it is a JSON
+/// *object*; a bare array or string falls through to the scan so an object
+/// nested inside it can still be found. The scan then tries each `{` in turn:
+/// it locates the matching `}` (ignoring braces inside string literals) and
+/// returns the first span that parses. A stray brace in surrounding prose
+/// therefore no longer hides the real object that follows it.
+///
+/// Returns `None` if no candidate span parses. Worst case is quadratic in the
+/// response length, which is acceptable for short LLM replies.
+fn extract_json_object(raw: &str) -> Option<serde_json::Value> {
+    // Fast path: the whole thing is a valid JSON object.
+    if let Ok(value) = serde_json::from_str::<serde_json::Value>(raw.trim()) {
+        if value.is_object() {
+            return Some(value);
+        }
+    }
+    // Otherwise try every '{' as a candidate start, first match wins.
+    raw.match_indices('{').find_map(|(start, _)| {
+        let end = balanced_object_end(raw.as_bytes(), start)?;
+        // `start` and `end` both sit on ASCII braces, so the slice is always
+        // on UTF-8 character boundaries.
+        serde_json::from_str::<serde_json::Value>(&raw[start..=end]).ok()
+    })
+}
+
+/// Find the index of the `}` that closes the `{` at `bytes[start]`, skipping
+/// braces that appear inside double-quoted strings (with backslash escapes).
+/// Returns `None` when the braces never balance.
+fn balanced_object_end(bytes: &[u8], start: usize) -> Option<usize> {
+    let mut depth = 0usize;
+    let mut in_str = false;
+    let mut escaped = false;
+    for (offset, &byte) in bytes.get(start..)?.iter().enumerate() {
+        if in_str {
+            match byte {
+                // The byte after a backslash is consumed verbatim.
+                _ if escaped => escaped = false,
+                b'\\' => escaped = true,
+                b'"' => in_str = false,
+                _ => {}
+            }
+            continue;
+        }
+        match byte {
+            b'"' => in_str = true,
+            b'{' => depth += 1,
+            b'}' => {
+                // A closing brace with nothing open means `start` was not a '{'.
+                depth = depth.checked_sub(1)?;
+                if depth == 0 {
+                    return Some(start + offset);
+                }
+            }
+            _ => {}
+        }
+    }
+    None
+}
+
+/// Normalize a line of model output: trim it, strip any double quotes wrapped
+/// around it, and collapse every run of whitespace to a single space.
+fn clean_text(s: &str) -> String {
+    let core = s.trim().trim_matches('"').trim();
+    let mut out = String::with_capacity(core.len());
+    for word in core.split_whitespace() {
+        if !out.is_empty() {
+            out.push(' ');
+        }
+        out.push_str(word);
+    }
+    out
+}
+
+/// Ask the LLM for the reel script and parse its reply.
+///
+/// The request is text-only (no images): per-photo context comes from the
+/// cached insights carried on `planned`. The call takes the GPU read lease
+/// internally (see `LlamaCppClient::generate`).
+///
+/// # Errors
+/// Returns an error, with context "LLM script generation failed", when the
+/// LLM call itself fails. A malformed reply is not an error — the parser
+/// falls back to neutral lines.
+pub async fn generate_script(
+    client: &Arc<LlamaCppClient>,
+    meta: &ReelMeta,
+    planned: &[PlannedSegment],
+) -> Result<ReelScript> {
+    let expected_lines = planned.len();
+    let (system_prompt, user_prompt) = build_script_messages(meta, planned);
+
+    let response = client
+        .generate(&user_prompt, Some(&system_prompt), None)
+        .await;
+    let raw = response.context("LLM script generation failed")?;
+
+    Ok(parse_script_response(&raw, expected_lines))
+}
+
+// Unit tests for the pure parts of the scripter: prompt construction and
+// response parsing. Nothing here calls the LLM.
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::memories::MemoriesSpan;
+
+    // Fixture: a Day-span reel covering two years.
+    fn meta() -> ReelMeta {
+        ReelMeta {
+            span: MemoriesSpan::Day,
+            years: vec![2019, 2021],
+        }
+    }
+
+    // Fixture: `n` photo segments with dates one day (86_400 s) apart and no
+    // insight context.
+    fn planned(n: usize) -> Vec<PlannedSegment> {
+        (0..n)
+            .map(|i| PlannedSegment {
+                media: super::super::SegmentMedia::Photo {
+                    rel_path: format!("p{i}.jpg"),
+                    library_id: 1,
+                },
+                date: Some(1_560_000_000 + i as i64 * 86_400),
+                insight_title: None,
+                insight_summary: None,
+            })
+            .collect()
+    }
+
+    // The prompt must name the photo count, the span phrase, and the exact
+    // number of segments the model is required to return.
+    #[test]
+    fn prompt_states_exact_segment_count_and_span() {
+        let (sys, user) = build_script_messages(&meta(), &planned(3));
+        assert!(sys.contains("memory reel"));
+        assert!(user.contains("3 photos"));
+        assert!(user.contains("on this day"));
+        assert!(user.contains("exactly 3 items"));
+        // Each photo gets an indexed entry.
+        assert!(user.contains("[1]") && user.contains("[2]") && user.contains("[3]"));
+    }
+
+    // Title and summary, when both present, are joined with an em dash.
+    #[test]
+    fn prompt_includes_insight_context_when_present() {
+        let mut p = planned(1);
+        p[0].insight_title = Some("Lake house weekend".into());
+        p[0].insight_summary = Some("Swimming with the dogs.".into());
+        let (_sys, user) = build_script_messages(&meta(), &p);
+        assert!(user.contains("Lake house weekend — Swimming with the dogs."));
+    }
+
+    // Happy path: the response is exactly the requested JSON object.
+    #[test]
+    fn parse_plain_json_object() {
+        let raw = r#"{"title":"Summer Days","segments":["First line.","Second line."]}"#;
+        let script = parse_script_response(raw, 2);
+        assert_eq!(script.title, "Summer Days");
+        assert_eq!(script.lines, vec!["First line.", "Second line."]);
+    }
+
+    // The object is still found when wrapped in a code fence and chatty prose.
+    #[test]
+    fn parse_tolerates_code_fences_and_prose() {
+        let raw = "Sure! Here's your reel:\n```json\n{\"title\": \"Trip\", \"segments\": [\"A.\", \"B.\"]}\n```\nEnjoy!";
+        let script = parse_script_response(raw, 2);
+        assert_eq!(script.title, "Trip");
+        assert_eq!(script.lines, vec!["A.", "B."]);
+    }
+
+    // Segments given as `{"narration": ".."}` objects are accepted too.
+    #[test]
+    fn parse_accepts_object_segment_shape() {
+        let raw = r#"{"title":"T","segments":[{"narration":"One."},{"narration":"Two."}]}"#;
+        let script = parse_script_response(raw, 2);
+        assert_eq!(script.lines, vec!["One.", "Two."]);
+    }
+
+    // The result always has exactly `n` lines regardless of what the model sent.
+    #[test]
+    fn parse_pads_short_and_truncates_long_to_n() {
+        // Model returned 1 line but we have 3 segments → pad with neutral lines.
+        let short = parse_script_response(r#"{"title":"T","segments":["Only one."]}"#, 3);
+        assert_eq!(short.lines.len(), 3);
+        assert_eq!(short.lines[0], "Only one.");
+        assert!(!short.lines[1].is_empty());
+
+        // Model returned 3 but we have 2 → truncate.
+        let long = parse_script_response(r#"{"title":"T","segments":["a","b","c"]}"#, 2);
+        assert_eq!(long.lines, vec!["a", "b"]);
+    }
+
+    // With no JSON at all, the default title and non-empty fallback lines are used.
+    #[test]
+    fn parse_falls_back_on_garbage() {
+        let script = parse_script_response("the model said no", 2);
+        assert_eq!(script.title, "Memories");
+        assert_eq!(script.lines.len(), 2);
+        assert!(script.lines.iter().all(|l| !l.is_empty()));
+    }
+
+    // A whitespace-only segment is replaced rather than left silent.
+    #[test]
+    fn parse_blank_line_replaced_with_fallback() {
+        let script = parse_script_response(r#"{"title":"T","segments":["  ","Real."]}"#, 2);
+        assert!(!script.lines[0].is_empty());
+        assert_eq!(script.lines[1], "Real.");
+    }
+}