Add memory-reel backend: on-demand narrated photo slideshow
New POST /reels + GET /reels/{id} (+ /video) build an MP4 slideshow of a
memory span (day/week/month), narrated by the LLM in a cloned voice.
Pipeline (src/reels/): a selector resolves the photos and reel metadata,
the scripter writes one narration line per photo via a single LLM call
(reusing each photo's cached insight as context — no fresh vision calls,
so reel generation stays off the GPU's vision slot), each line is
synthesized to speech, and the renderer assembles stills + narration via
ffmpeg. Jobs run in the background (mirroring the TTS speech-job
registry) since a reel takes minutes; the finished MP4 is cached on disk
keyed by the selection so a repeat request is instant.
The segment model is media-typed (Photo today) so a video-clip segment
(phase 2) and a nightly pre-render (phase 3) slot in without reworking
the pipeline. Ken Burns motion is implemented but defaulted off pending a
visual check on the GPU box.
Supporting changes:
- memories: extract gather_memory_items() so the reel selector reuses the
exact window/exclusion/tz/sort logic behind /memories.
- ai::tts: add synthesize_serialized() so reel narration honors the same
single-GPU permit + write lease as user TTS requests.
- video::ffmpeg: make get_duration_seconds() pub for narration timing.
- AppState: reels_path (REELS_DIRECTORY, defaults beside preview clips).
Pure logic (cache key, script parsing, ffmpeg arg/filter construction,
even sampling, segment timing) is unit-tested (26 tests). The runtime
path (ffmpeg render, TTS, LLM) needs a real run on the GPU host to verify
end-to-end — not exercisable in CI.
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,289 @@
|
||||
//! Narration scripting for memory reels.
|
||||
//!
|
||||
//! One LLM call turns the planned segments (each carrying its date and, where
|
||||
//! available, its cached insight) into a short first-person narration line per
|
||||
//! photo plus a title for the reel. We reuse the cached insight summary as the
|
||||
//! richest per-photo signal rather than re-running vision at reel time — that
|
||||
//! keeps reel generation off the GPU's vision slot entirely.
|
||||
//!
|
||||
//! The prompt builder and response parser are pure so the contract is
|
||||
//! unit-testable; `generate_script` wires them to the LLM client.
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use std::sync::Arc;
|
||||
|
||||
use super::{PlannedSegment, ReelMeta};
|
||||
use crate::ai::llamacpp::LlamaCppClient;
|
||||
use crate::ai::llm_client::LlmClient;
|
||||
|
||||
/// Narration script for one reel.
///
/// Holds the reel's title together with the spoken lines, one per segment,
/// kept in the same order as the segments they narrate.
#[derive(Clone, Debug, PartialEq)]
pub struct ReelScript {
    /// Title for the reel as a whole.
    pub title: String,
    /// Narration lines, one per segment, in segment order.
    pub lines: Vec<String>,
}
|
||||
|
||||
/// System prompt for the scripter: asks for warm, specific, first-person
/// narration, forbids invented details, and limits each line to one or two
/// short sentences.
///
/// The text is a single line; every trailing `\` below swallows the newline
/// and the following line's leading whitespace.
const SYSTEM_PROMPT: &str = "You are narrating a personal memory reel — a short slideshow of \
    someone's own photos set to a spoken voiceover. \
    Write warm, specific, first-person narration as if the person is gently looking back on \
    their own memories. \
    Be concrete and grounded in the details given; never invent names, places, or events that \
    aren't supported. \
    Keep each line to one or two short sentences that can be read aloud in a few seconds. \
    Avoid generic filler like \"what a wonderful day\" — if you have little to go on, simply \
    describe the moment plainly.";
|
||||
|
||||
/// Build the (system, user) prompt pair for the scripter. The user message
|
||||
/// describes each segment in order and asks for strict JSON back.
|
||||
pub fn build_script_messages(meta: &ReelMeta, planned: &[PlannedSegment]) -> (String, String) {
|
||||
let mut user = String::new();
|
||||
user.push_str(&format!(
|
||||
"These are {} photos surfaced as memories {}.\n\n",
|
||||
planned.len(),
|
||||
meta.span_phrase()
|
||||
));
|
||||
if !meta.years.is_empty() {
|
||||
let years: Vec<String> = meta.years.iter().map(|y| y.to_string()).collect();
|
||||
user.push_str(&format!("They span the years: {}.\n\n", years.join(", ")));
|
||||
}
|
||||
user.push_str("Photos, in the order they will appear:\n");
|
||||
for (i, seg) in planned.iter().enumerate() {
|
||||
user.push_str(&format!("\n[{}]", i + 1));
|
||||
if let Some(date) = seg.date_label() {
|
||||
user.push_str(&format!(" {date}"));
|
||||
}
|
||||
user.push('\n');
|
||||
match (&seg.insight_title, &seg.insight_summary) {
|
||||
(Some(t), Some(s)) if !s.trim().is_empty() => {
|
||||
user.push_str(&format!(" Known context: {t} — {s}\n"));
|
||||
}
|
||||
(Some(t), _) => user.push_str(&format!(" Known context: {t}\n")),
|
||||
(_, Some(s)) if !s.trim().is_empty() => {
|
||||
user.push_str(&format!(" Known context: {s}\n"));
|
||||
}
|
||||
_ => user.push_str(" (no extra context — narrate plainly from the date)\n"),
|
||||
}
|
||||
}
|
||||
user.push_str(&format!(
|
||||
"\nReturn ONLY a JSON object, no prose or code fences, shaped exactly:\n\
|
||||
{{\"title\": \"<short reel title>\", \"segments\": [\"<line for photo 1>\", \
|
||||
\"<line for photo 2>\", ... ]}}\n\
|
||||
The \"segments\" array MUST have exactly {} items, one per photo in order.",
|
||||
planned.len()
|
||||
));
|
||||
(SYSTEM_PROMPT.to_string(), user)
|
||||
}
|
||||
|
||||
/// Parse the model's response into a script with exactly `n` lines. Tolerant of
|
||||
/// code fences and surrounding prose, and of both `segments: [".."]` and
|
||||
/// `segments: [{"narration": ".."}]` shapes. Missing/extra lines are padded or
|
||||
/// truncated so the caller always gets `n` aligned to the segments.
|
||||
pub fn parse_script_response(raw: &str, n: usize) -> ReelScript {
|
||||
let fallback_line = "A moment worth remembering.";
|
||||
let value = extract_json_object(raw);
|
||||
|
||||
let title = value
|
||||
.as_ref()
|
||||
.and_then(|v| v.get("title"))
|
||||
.and_then(|t| t.as_str())
|
||||
.map(clean_text)
|
||||
.filter(|s| !s.is_empty())
|
||||
.unwrap_or_else(|| "Memories".to_string());
|
||||
|
||||
let mut lines: Vec<String> = value
|
||||
.as_ref()
|
||||
.and_then(|v| v.get("segments"))
|
||||
.and_then(|s| s.as_array())
|
||||
.map(|arr| {
|
||||
arr.iter()
|
||||
.map(|item| {
|
||||
let text = item
|
||||
.as_str()
|
||||
.map(|s| s.to_string())
|
||||
.or_else(|| {
|
||||
item.get("narration")
|
||||
.and_then(|n| n.as_str())
|
||||
.map(|s| s.to_string())
|
||||
})
|
||||
.unwrap_or_default();
|
||||
clean_text(&text)
|
||||
})
|
||||
.collect()
|
||||
})
|
||||
.unwrap_or_default();
|
||||
|
||||
// Align to exactly n: drop extras, pad shortfalls with a neutral line so
|
||||
// every photo still gets spoken audio.
|
||||
lines.truncate(n);
|
||||
while lines.len() < n {
|
||||
lines.push(fallback_line.to_string());
|
||||
}
|
||||
for line in lines.iter_mut() {
|
||||
if line.is_empty() {
|
||||
*line = fallback_line.to_string();
|
||||
}
|
||||
}
|
||||
|
||||
ReelScript { title, lines }
|
||||
}
|
||||
|
||||
/// Pull the first balanced top-level JSON object out of a possibly-noisy model
|
||||
/// response (code fences, leading prose). Returns None if nothing parses.
|
||||
fn extract_json_object(raw: &str) -> Option<serde_json::Value> {
|
||||
// Fast path: the whole thing is valid JSON.
|
||||
if let Ok(v) = serde_json::from_str::<serde_json::Value>(raw.trim()) {
|
||||
return Some(v);
|
||||
}
|
||||
// Otherwise scan for the first '{' ... matching '}' span, ignoring braces
|
||||
// inside strings.
|
||||
let bytes = raw.as_bytes();
|
||||
let start = raw.find('{')?;
|
||||
let mut depth = 0i32;
|
||||
let mut in_str = false;
|
||||
let mut escaped = false;
|
||||
for i in start..bytes.len() {
|
||||
let c = bytes[i] as char;
|
||||
if in_str {
|
||||
if escaped {
|
||||
escaped = false;
|
||||
} else if c == '\\' {
|
||||
escaped = true;
|
||||
} else if c == '"' {
|
||||
in_str = false;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
match c {
|
||||
'"' => in_str = true,
|
||||
'{' => depth += 1,
|
||||
'}' => {
|
||||
depth -= 1;
|
||||
if depth == 0 {
|
||||
return serde_json::from_str(&raw[start..=i]).ok();
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
None
|
||||
}
|
||||
|
||||
/// Tidy one line of model output for narration.
///
/// Trims surrounding whitespace, strips any double quotes wrapped around the
/// text, trims again, then collapses every internal run of whitespace
/// (including newlines and tabs) to a single space. Quotes in the middle of
/// the text are left alone.
fn clean_text(s: &str) -> String {
    let core = s.trim().trim_matches('"').trim();

    let mut out = String::new();
    for word in core.split_whitespace() {
        // Words are never empty, so `out` is empty only before the first one.
        if !out.is_empty() {
            out.push(' ');
        }
        out.push_str(word);
    }
    out
}
|
||||
|
||||
/// Generate the reel script via the LLM. Text-only (no images) — the per-photo
|
||||
/// context comes from cached insights. The call takes the GPU read lease
|
||||
/// internally (see `LlamaCppClient::generate`).
|
||||
pub async fn generate_script(
|
||||
client: &Arc<LlamaCppClient>,
|
||||
meta: &ReelMeta,
|
||||
planned: &[PlannedSegment],
|
||||
) -> Result<ReelScript> {
|
||||
let (system, user) = build_script_messages(meta, planned);
|
||||
let raw = client
|
||||
.generate(&user, Some(&system), None)
|
||||
.await
|
||||
.context("LLM script generation failed")?;
|
||||
Ok(parse_script_response(&raw, planned.len()))
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;
    use crate::memories::MemoriesSpan;

    // Fixture dates: presumably Unix seconds, spaced one step (86 400) apart.
    const BASE_DATE: i64 = 1_560_000_000;
    const DATE_STEP: i64 = 86_400;

    /// Reel metadata for a single-day span covering two years.
    fn day_meta() -> ReelMeta {
        ReelMeta {
            span: MemoriesSpan::Day,
            years: vec![2019, 2021],
        }
    }

    /// `count` photo segments with dates but no cached insight.
    fn bare_segments(count: usize) -> Vec<PlannedSegment> {
        let mut segments = Vec::with_capacity(count);
        for idx in 0..count {
            segments.push(PlannedSegment {
                media: super::super::SegmentMedia::Photo {
                    rel_path: format!("p{idx}.jpg"),
                    library_id: 1,
                },
                date: Some(BASE_DATE + idx as i64 * DATE_STEP),
                insight_title: None,
                insight_summary: None,
            });
        }
        segments
    }

    #[test]
    fn prompt_states_exact_segment_count_and_span() {
        let (sys, user) = build_script_messages(&day_meta(), &bare_segments(3));
        assert!(sys.contains("memory reel"));
        // Count, span phrase, strict-count instruction, and one indexed entry
        // per photo must all be present.
        for needle in [
            "3 photos",
            "on this day",
            "exactly 3 items",
            "[1]",
            "[2]",
            "[3]",
        ] {
            assert!(user.contains(needle), "user prompt is missing {needle:?}");
        }
    }

    #[test]
    fn prompt_includes_insight_context_when_present() {
        let mut segments = bare_segments(1);
        segments[0].insight_title = Some("Lake house weekend".into());
        segments[0].insight_summary = Some("Swimming with the dogs.".into());

        let (_sys, user) = build_script_messages(&day_meta(), &segments);

        assert!(user.contains("Lake house weekend — Swimming with the dogs."));
    }

    #[test]
    fn parse_plain_json_object() {
        let reply = r#"{"title":"Summer Days","segments":["First line.","Second line."]}"#;

        let ReelScript { title, lines } = parse_script_response(reply, 2);

        assert_eq!(title, "Summer Days");
        assert_eq!(lines, vec!["First line.", "Second line."]);
    }

    #[test]
    fn parse_tolerates_code_fences_and_prose() {
        let reply = "Sure! Here's your reel:\n```json\n{\"title\": \"Trip\", \"segments\": [\"A.\", \"B.\"]}\n```\nEnjoy!";

        let ReelScript { title, lines } = parse_script_response(reply, 2);

        assert_eq!(title, "Trip");
        assert_eq!(lines, vec!["A.", "B."]);
    }

    #[test]
    fn parse_accepts_object_segment_shape() {
        let reply = r#"{"title":"T","segments":[{"narration":"One."},{"narration":"Two."}]}"#;

        let script = parse_script_response(reply, 2);

        assert_eq!(script.lines, vec!["One.", "Two."]);
    }

    #[test]
    fn parse_pads_short_and_truncates_long_to_n() {
        // One line for three segments: the gap is padded with neutral lines.
        let padded = parse_script_response(r#"{"title":"T","segments":["Only one."]}"#, 3);
        assert_eq!(padded.lines.len(), 3);
        assert_eq!(padded.lines[0], "Only one.");
        assert!(!padded.lines[1].is_empty());

        // Three lines for two segments: the extra one is dropped.
        let clipped = parse_script_response(r#"{"title":"T","segments":["a","b","c"]}"#, 2);
        assert_eq!(clipped.lines, vec!["a", "b"]);
    }

    #[test]
    fn parse_falls_back_on_garbage() {
        let script = parse_script_response("the model said no", 2);

        assert_eq!(script.title, "Memories");
        assert_eq!(script.lines.len(), 2);
        assert!(script.lines.iter().all(|line| !line.is_empty()));
    }

    #[test]
    fn parse_blank_line_replaced_with_fallback() {
        let script = parse_script_response(r#"{"title":"T","segments":[" ","Real."]}"#, 2);

        assert!(!script.lines[0].is_empty());
        assert_eq!(script.lines[1], "Real.");
    }
}
|
||||
Reference in New Issue
Block a user