Add memory-reel backend: on-demand narrated photo slideshow

New POST /reels + GET /reels/{id} (+ /video) build an MP4 slideshow of a memory span (day/week/month), narrated by the LLM in a cloned voice. Pipeline (src/reels/): a selector resolves which photos + reel metadata, the scripter writes one narration line per photo via a single LLM call (reusing each photo's cached insight as context — no fresh vision calls, so reel generation stays off the GPU's vision slot), each line is synthesized to speech, and the renderer assembles stills + narration via ffmpeg. Jobs run in the background (mirroring the TTS speech-job registry) since a reel takes minutes; the finished MP4 is cached on disk keyed by the selection so a repeat request is instant. The segment model is media-typed (Photo today) so a video-clip segment (phase 2) and a nightly pre-render (phase 3) slot in without reworking the pipeline. Ken Burns motion is implemented but defaulted off pending a visual check on the GPU box. Supporting changes: - memories: extract gather_memory_items() so the reel selector reuses the exact window/exclusion/tz/sort logic behind /memories. - ai::tts: add synthesize_serialized() so reel narration honors the same single-GPU permit + write lease as user TTS requests. - video::ffmpeg: make get_duration_seconds() pub for narration timing. - AppState: reels_path (REELS_DIRECTORY, defaults beside preview clips). Pure logic (cache key, script parsing, ffmpeg arg/filter construction, even sampling, segment timing) is unit-tested (26 tests). The runtime path (ffmpeg render, TTS, LLM) needs a real run on the GPU host to verify end-to-end — not exercisable in CI. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-12 22:31:08 -04:00
parent 98274c3301
commit e3f731b3b2
9 changed files with 1615 additions and 30 deletions
@@ -0,0 +1,338 @@
+//! ffmpeg assembly for memory reels.
+//!
+//! Two-stage, per-segment design: each segment is rendered to its own
+//! normalized MP4 (identical codec/resolution/fps/timebase), then the segments
+//! are joined with the concat demuxer (stream copy, no re-encode). Rendering
+//! per segment — rather than one monster filtergraph — keeps each ffmpeg
+//! invocation simple to reason about, parallelizes naturally, and means a
+//! video-clip segment type (phase 2) slots in as just a different per-segment
+//! builder without touching the concat stage.
+//!
+//! The arg builders are pure (`Vec<String>` out) so the exact ffmpeg command
+//! is unit-testable; the runners spawn ffmpeg and surface stderr on failure.
+
+use anyhow::{Context, Result, bail};
+use std::path::Path;
+use tokio::process::Command;
+
+/// Re-exported so the reel pipeline reaches NVENC detection through this module
+/// rather than depending on `video::ffmpeg` directly.
+pub use crate::video::ffmpeg::is_nvenc_available;
+
+/// Reel canvas width in pixels. Landscape matches the majority of camera
+/// photos; with the static (non-Ken-Burns) filter, portrait shots are
+/// letterboxed by the `pad` in [`segment_filter`] rather than cropped, so
+/// faces never get cut off. The Ken Burns branch of [`segment_filter`] scales
+/// to fill and crops instead.
+pub const REEL_WIDTH: u32 = 1920;
+/// Reel canvas height in pixels.
+pub const REEL_HEIGHT: u32 = 1080;
+/// Output frame rate that every segment is normalized to.
+pub const REEL_FPS: u32 = 30;
+
+/// A still's screen time is its narration length plus a short breath, with a
+/// floor so a terse line still lingers. No ceiling: the segment always covers
+/// the full narration so speech is never truncated — the scripter is asked to
+/// keep lines short instead.
+pub const MIN_SEGMENT_SECONDS: f64 = 2.5;
+/// The "short breath": seconds added after the narration by
+/// [`segment_duration`] before the floor is applied.
+const NARRATION_TAIL_SECONDS: f64 = 0.6;
+
+/// Screen time, in seconds, for a photo segment whose narration audio lasts
+/// `narration_secs`.
+///
+/// The narration length is extended by `NARRATION_TAIL_SECONDS`. That padded
+/// value is returned only when it is finite and strictly greater than
+/// [`MIN_SEGMENT_SECONDS`]; every other input (short, zero, negative, NaN or
+/// infinite) yields the floor.
+pub fn segment_duration(narration_secs: f64) -> f64 {
+    let padded = narration_secs + NARRATION_TAIL_SECONDS;
+    match padded {
+        // Long enough on its own: cover the whole narration plus the tail.
+        secs if secs.is_finite() && secs > MIN_SEGMENT_SECONDS => secs,
+        // Too short, or not a usable number at all: fall back to the floor.
+        _ => MIN_SEGMENT_SECONDS,
+    }
+}
+
+/// Knobs for rendering a single segment.
+///
+/// `ken_burns` adds a slow zoom for motion. It stays off by default until the
+/// effect has been eyeballed on the GPU box: a wrong zoompan expression reads
+/// as jitter and can't be verified here.
+#[derive(Debug, Clone, Copy)]
+pub struct SegmentOpts {
+    /// Output canvas width, interpolated into the ffmpeg filter string.
+    pub width: u32,
+    /// Output canvas height, interpolated into the ffmpeg filter string.
+    pub height: u32,
+    /// Output frame rate; also scales the Ken Burns frame count.
+    pub fps: u32,
+    /// Encode with `h264_nvenc` (and request `-hwaccel cuda`) instead of
+    /// `libx264`.
+    pub nvenc: bool,
+    /// Use the zoompan-based filter instead of the static fit-and-pad one.
+    pub ken_burns: bool,
+}
+
+impl Default for SegmentOpts {
+    /// Reel-canvas dimensions and frame rate, software encoding, no motion.
+    fn default() -> Self {
+        let (width, height, fps) = (REEL_WIDTH, REEL_HEIGHT, REEL_FPS);
+        Self {
+            width,
+            height,
+            fps,
+            nvenc: false,
+            ken_burns: false,
+        }
+    }
+}
+
+/// Build the ffmpeg video filter chain for a photo segment.
+///
+/// * Static (default): scale the image to fit inside the canvas while
+///   preserving aspect ratio, pad the remainder, then normalize SAR, frame
+///   rate and pixel format. Nothing is cropped.
+/// * Ken Burns (`opts.ken_burns`): scale to *fill* a canvas twice the output
+///   size, crop to it, then zoompan back down to the output size. This branch
+///   does crop the source image.
+///
+/// `duration` (seconds) is only consulted by the Ken Burns branch, where it
+/// sets the zoompan frame count.
+pub fn segment_filter(opts: &SegmentOpts, duration: f64) -> String {
+    let SegmentOpts {
+        width,
+        height,
+        fps,
+        ken_burns,
+        ..
+    } = *opts;
+
+    if !ken_burns {
+        return [
+            format!("scale={width}:{height}:force_original_aspect_ratio=decrease"),
+            format!("pad={width}:{height}:(ow-iw)/2:(oh-ih)/2"),
+            "setsar=1".to_string(),
+            format!("fps={fps}"),
+            "format=yuv420p".to_string(),
+        ]
+        .join(",");
+    }
+
+    // Whole frames covering the segment, never fewer than one.
+    let frame_count = (duration * f64::from(fps)).round().max(1.0) as u64;
+    [
+        // Upscale first so zoompan samples from a larger frame (avoids
+        // shimmer).
+        format!("scale={width}*2:{height}*2:force_original_aspect_ratio=increase"),
+        format!("crop={width}*2:{height}*2"),
+        // Zoom creeps up by 0.0009 per frame and is capped at 1.12; the crop
+        // window is held centered while the output settles to the canvas.
+        format!(
+            "zoompan=z='min(zoom+0.0009,1.12)':d={frame_count}:x='iw/2-(iw/zoom/2)':y='ih/2-(ih/zoom/2)':s={width}x{height}:fps={fps}"
+        ),
+        "setsar=1".to_string(),
+        "format=yuv420p".to_string(),
+    ]
+    .join(",")
+}
+
+/// Video encoder flags for a segment: `h264_nvenc` when `nvenc` is set,
+/// `libx264` otherwise. Both variants force `yuv420p` output.
+fn video_encoder_args(nvenc: bool) -> Vec<String> {
+    let flags: &[&str] = if nvenc {
+        // p4 ≈ balanced; cq 23 ≈ libx264 crf 21. Matches the HLS transcode path.
+        &[
+            "-c:v",
+            "h264_nvenc",
+            "-preset",
+            "p4",
+            "-cq",
+            "23",
+            "-pix_fmt",
+            "yuv420p",
+        ]
+    } else {
+        &[
+            "-c:v", "libx264", "-crf", "21", "-preset", "veryfast", "-pix_fmt", "yuv420p",
+        ]
+    };
+    flags.iter().copied().map(String::from).collect()
+}
+
+/// Assemble the ffmpeg command line that renders one photo segment.
+///
+/// The still at `image_path` is looped as the video input and the narration
+/// at `audio_path` is muxed in. The narration is padded with trailing silence
+/// (`apad`) so short lines don't end the segment early, and `-t` bounds both
+/// streams to `duration` seconds (formatted to millisecond precision).
+///
+/// Argument order: `-y`, optional `-hwaccel cuda` (when `opts.nvenc`), the
+/// two inputs, the filter graph and stream maps, `-t`, the video encoder
+/// flags, the AAC audio flags, and finally `out_path`.
+pub fn build_segment_args(
+    image_path: &str,
+    audio_path: &str,
+    out_path: &str,
+    duration: f64,
+    opts: &SegmentOpts,
+) -> Vec<String> {
+    let filter_graph = format!("[0:v]{}[v];[1:a]apad[a]", segment_filter(opts, duration));
+    let length = format!("{duration:.3}");
+
+    // -hwaccel is an input option, so it has to precede the first `-i`.
+    let hwaccel: &[&str] = if opts.nvenc {
+        &["-hwaccel", "cuda"]
+    } else {
+        &[]
+    };
+    let inputs_and_maps = [
+        "-loop",
+        "1",
+        "-i",
+        image_path,
+        "-i",
+        audio_path,
+        "-filter_complex",
+        filter_graph.as_str(),
+        "-map",
+        "[v]",
+        "-map",
+        "[a]",
+        "-t",
+        length.as_str(),
+    ];
+    let audio_encoder = ["-c:a", "aac", "-b:a", "160k", "-ar", "48000", "-shortest"];
+
+    let mut cmd: Vec<String> = vec!["-y".to_string()];
+    cmd.extend(hwaccel.iter().map(|tok| tok.to_string()));
+    cmd.extend(inputs_and_maps.iter().map(|tok| tok.to_string()));
+    cmd.extend(video_encoder_args(opts.nvenc));
+    cmd.extend(audio_encoder.iter().map(|tok| tok.to_string()));
+    cmd.push(out_path.to_string());
+    cmd
+}
+
+/// Assemble the ffmpeg command line that joins rendered segments via the
+/// concat demuxer with stream copy (no re-encode).
+///
+/// `list_path` is the concat list file and `out_path` the final reel.
+/// `+faststart` moves the moov atom up front so the reel streams immediately
+/// on the mobile client.
+pub fn build_concat_args(list_path: &str, out_path: &str) -> Vec<String> {
+    // Options up to and including the input, then the output-side options.
+    let input_side = ["-y", "-f", "concat", "-safe", "0", "-i", list_path];
+    let output_side = ["-c", "copy", "-movflags", "+faststart", out_path];
+    input_side
+        .iter()
+        .chain(output_side.iter())
+        .map(|tok| (*tok).to_owned())
+        .collect()
+}
+
+/// Produce the body of the concat list file: one `file '<path>'` line per
+/// segment, in the order given, each terminated by a newline.
+///
+/// A single quote inside a path is written as `'\''` (close the quote, emit
+/// an escaped quote, reopen), per ffmpeg's concat syntax. An empty slice
+/// yields an empty string.
+pub fn build_concat_list(segment_paths: &[String]) -> String {
+    segment_paths
+        .iter()
+        .map(|path| format!("file '{}'\n", path.replace('\'', r"'\''")))
+        .collect()
+}
+
+/// Run `ffmpeg` with `args` and wait for it to exit.
+///
+/// `what` is a short label for the step, used only in error messages.
+///
+/// # Errors
+/// Fails if the process cannot be spawned, or if it exits unsuccessfully — in
+/// which case the error carries ffmpeg's captured stderr (lossily decoded as
+/// UTF-8).
+async fn run_ffmpeg(args: &[String], what: &str) -> Result<()> {
+    let finished = Command::new("ffmpeg")
+        .args(args)
+        .output()
+        .await
+        .with_context(|| format!("spawning ffmpeg for {what}"))?;
+
+    if finished.status.success() {
+        Ok(())
+    } else {
+        let stderr_text = String::from_utf8_lossy(&finished.stderr);
+        bail!("ffmpeg {what} failed: {}", stderr_text)
+    }
+}
+
+/// Render one photo segment to `out_path` by running ffmpeg with the args
+/// from [`build_segment_args`].
+///
+/// Paths are converted with `to_string_lossy`, so any non-UTF-8 bytes in a
+/// path are replaced before being handed to ffmpeg.
+///
+/// # Errors
+/// Propagates any spawn or exit failure from the ffmpeg run.
+pub async fn render_segment(
+    image_path: &Path,
+    audio_path: &Path,
+    out_path: &Path,
+    duration: f64,
+    opts: &SegmentOpts,
+) -> Result<()> {
+    let ffmpeg_args = {
+        let image = image_path.to_string_lossy();
+        let audio = audio_path.to_string_lossy();
+        let output = out_path.to_string_lossy();
+        build_segment_args(&image, &audio, &output, duration, opts)
+    };
+    run_ffmpeg(&ffmpeg_args, "segment render").await
+}
+
+/// Join rendered segments into the final reel at `out_path`.
+///
+/// The concat list is written next to the output (the output path with its
+/// extension replaced by `concat.txt`) so relative paths and cleanup stay
+/// local. The list file is removed after ffmpeg finishes whether or not the
+/// join succeeded; a failure to remove it is ignored because it is only a
+/// best-effort cleanup.
+///
+/// # Errors
+/// Fails if `segment_paths` is empty, if the list file cannot be written, or
+/// if the ffmpeg concat run fails.
+pub async fn concat_segments(segment_paths: &[String], out_path: &Path) -> Result<()> {
+    // An empty list makes the concat demuxer fail with an opaque error; reject
+    // it up front with a clear message, before touching disk or spawning.
+    if segment_paths.is_empty() {
+        bail!(
+            "no segments to concatenate into {}",
+            out_path.to_string_lossy()
+        );
+    }
+
+    let list_path = out_path.with_extension("concat.txt");
+    let body = build_concat_list(segment_paths);
+    tokio::fs::write(&list_path, body)
+        .await
+        .context("writing concat list")?;
+    let args = build_concat_args(&list_path.to_string_lossy(), &out_path.to_string_lossy());
+    let result = run_ffmpeg(&args, "concat").await;
+    // Best-effort cleanup: the ffmpeg result is what the caller cares about.
+    let _ = tokio::fs::remove_file(&list_path).await;
+    result
+}
+
+// Unit tests for the pure parts of this module: segment timing, filter-string
+// construction, and ffmpeg argument/concat-list building. Nothing here spawns
+// ffmpeg.
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn segment_duration_floors_short_lines() {
+        // A one-word narration still lingers at the floor.
+        assert_eq!(segment_duration(0.5), MIN_SEGMENT_SECONDS);
+        assert_eq!(segment_duration(0.0), MIN_SEGMENT_SECONDS);
+    }
+
+    #[test]
+    fn segment_duration_covers_full_narration_plus_tail() {
+        // No ceiling: a long line gets its full length so speech isn't cut.
+        assert!((segment_duration(5.0) - 5.6).abs() < 1e-9);
+        assert!((segment_duration(20.0) - 20.6).abs() < 1e-9);
+    }
+
+    #[test]
+    fn segment_duration_rejects_nonfinite() {
+        // NaN and infinity must not leak into an ffmpeg `-t` value.
+        assert_eq!(segment_duration(f64::NAN), MIN_SEGMENT_SECONDS);
+        assert_eq!(segment_duration(f64::INFINITY), MIN_SEGMENT_SECONDS);
+    }
+
+    #[test]
+    fn static_filter_fits_and_pads_without_cropping() {
+        let f = segment_filter(&SegmentOpts::default(), 4.0);
+        assert!(f.contains("force_original_aspect_ratio=decrease"));
+        assert!(f.contains("pad=1920:1080"));
+        assert!(f.contains("format=yuv420p"));
+        // No zoompan when ken_burns is off.
+        assert!(!f.contains("zoompan"));
+    }
+
+    #[test]
+    fn ken_burns_filter_uses_duration_scaled_frame_count() {
+        let opts = SegmentOpts {
+            ken_burns: true,
+            ..SegmentOpts::default()
+        };
+        // 4s * 30fps = 120 frames in the zoompan d= term.
+        let f = segment_filter(&opts, 4.0);
+        assert!(f.contains("zoompan"));
+        assert!(f.contains("d=120:"));
+        assert!(f.contains("s=1920x1080"));
+    }
+
+    #[test]
+    fn segment_args_loop_still_and_bound_with_t() {
+        let args = build_segment_args(
+            "/img.jpg",
+            "/a.wav",
+            "/out.mp4",
+            4.0,
+            &SegmentOpts::default(),
+        );
+        // Joining with spaces lets adjacent flag/value pairs be checked in order.
+        let joined = args.join(" ");
+        assert!(joined.contains("-loop 1 -i /img.jpg"));
+        assert!(joined.contains("-i /a.wav"));
+        assert!(joined.contains("apad"));
+        assert!(joined.contains("-t 4.000"));
+        assert!(joined.contains("libx264"));
+        assert!(joined.ends_with("/out.mp4"));
+    }
+
+    #[test]
+    fn segment_args_use_nvenc_and_cuda_when_enabled() {
+        let opts = SegmentOpts {
+            nvenc: true,
+            ..SegmentOpts::default()
+        };
+        let args = build_segment_args("/img.jpg", "/a.wav", "/out.mp4", 3.0, &opts);
+        let joined = args.join(" ");
+        assert!(joined.contains("-hwaccel cuda"));
+        assert!(joined.contains("h264_nvenc"));
+        assert!(!joined.contains("libx264"));
+    }
+
+    #[test]
+    fn concat_args_stream_copy_with_faststart() {
+        let args = build_concat_args("/tmp/list.txt", "/out.mp4");
+        let joined = args.join(" ");
+        assert!(joined.contains("-f concat -safe 0 -i /tmp/list.txt"));
+        assert!(joined.contains("-c copy"));
+        assert!(joined.contains("+faststart"));
+    }
+
+    #[test]
+    fn concat_list_escapes_single_quotes() {
+        let body = build_concat_list(&[
+            "/tmp/seg_000.mp4".into(),
+            "/tmp/own's dir/seg_001.mp4".into(),
+        ]);
+        assert!(body.contains("file '/tmp/seg_000.mp4'\n"));
+        // The apostrophe is closed-escaped-reopened per ffmpeg concat syntax.
+        assert!(body.contains(r"own'\''s"));
+    }
+}