Add memory-reel backend: on-demand narrated photo slideshow
New POST /reels + GET /reels/{id} (+ /video) build an MP4 slideshow of a
memory span (day/week/month), narrated by the LLM in a cloned voice.
Pipeline (src/reels/): a selector resolves which photos + reel metadata,
the scripter writes one narration line per photo via a single LLM call
(reusing each photo's cached insight as context — no fresh vision calls,
so reel generation stays off the GPU's vision slot), each line is
synthesized to speech, and the renderer assembles stills + narration via
ffmpeg. Jobs run in the background (mirroring the TTS speech-job
registry) since a reel takes minutes; the finished MP4 is cached on disk
keyed by the selection so a repeat request is instant.
The segment model is media-typed (Photo today) so a video-clip segment
(phase 2) and a nightly pre-render (phase 3) slot in without reworking
the pipeline. Ken Burns motion is implemented but defaulted off pending a
visual check on the GPU box.
Supporting changes:
- memories: extract gather_memory_items() so the reel selector reuses the
exact window/exclusion/tz/sort logic behind /memories.
- ai::tts: add synthesize_serialized() so reel narration honors the same
single-GPU permit + write lease as user TTS requests.
- video::ffmpeg: make get_duration_seconds() pub for narration timing.
- AppState: reels_path (REELS_DIRECTORY, defaults beside preview clips).
Pure logic (cache key, script parsing, ffmpeg arg/filter construction,
even sampling, segment timing) is unit-tested (26 tests). The runtime
path (ffmpeg render, TTS, LLM) needs a real run on the GPU host to verify
end-to-end — not exercisable in CI.
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,338 @@
|
||||
//! ffmpeg assembly for memory reels.
|
||||
//!
|
||||
//! Two-stage, per-segment design: each segment is rendered to its own
|
||||
//! normalized MP4 (identical codec/resolution/fps/timebase), then the segments
|
||||
//! are joined with the concat demuxer (stream copy, no re-encode). Rendering
|
||||
//! per segment — rather than one monster filtergraph — keeps each ffmpeg
|
||||
//! invocation simple to reason about, parallelizes naturally, and means a
|
||||
//! video-clip segment type (phase 2) slots in as just a different per-segment
|
||||
//! builder without touching the concat stage.
|
||||
//!
|
||||
//! The arg builders are pure (`Vec<String>` out) so the exact ffmpeg command
|
||||
//! is unit-testable; the runners spawn ffmpeg and surface stderr on failure.
|
||||
|
||||
use anyhow::{Context, Result, bail};
|
||||
use std::path::Path;
|
||||
use tokio::process::Command;
|
||||
|
||||
/// Re-exported so the reel pipeline reaches NVENC detection through this module
|
||||
/// rather than depending on `video::ffmpeg` directly.
|
||||
pub use crate::video::ffmpeg::is_nvenc_available;
|
||||
|
||||
/// Reel canvas width in pixels.
///
/// The canvas is landscape because that matches the majority of camera
/// photos. Portrait shots are fitted inside it and letterboxed by the `pad`
/// step of the static filter in [`segment_filter`] rather than cropped, so
/// faces never get cut off.
pub const REEL_WIDTH: u32 = 1920;
/// Reel canvas height in pixels; paired with [`REEL_WIDTH`].
pub const REEL_HEIGHT: u32 = 1080;
/// Output frame rate of every rendered segment, in frames per second.
pub const REEL_FPS: u32 = 30;
|
||||
|
||||
/// Minimum time, in seconds, that a still stays on screen.
///
/// Screen time is the narration length plus a short trailing pause; this floor
/// keeps a terse line from flashing past. There is deliberately no ceiling: a
/// segment always covers its whole narration so speech is never truncated —
/// the scripter is asked to keep lines short instead.
pub const MIN_SEGMENT_SECONDS: f64 = 2.5;
/// Pause, in seconds, held on the still after its narration has finished.
const NARRATION_TAIL_SECONDS: f64 = 0.6;

/// Compute the screen time of a photo segment from its narration length.
///
/// Returns `narration_secs` plus [`NARRATION_TAIL_SECONDS`], but never less
/// than [`MIN_SEGMENT_SECONDS`]. A non-finite total (NaN or either infinity)
/// also falls back to the minimum.
pub fn segment_duration(narration_secs: f64) -> f64 {
    let padded = narration_secs + NARRATION_TAIL_SECONDS;
    // Non-finite values are rejected first, so the ordinary comparison below
    // never has to reason about NaN.
    if !padded.is_finite() || padded <= MIN_SEGMENT_SECONDS {
        return MIN_SEGMENT_SECONDS;
    }
    padded
}
|
||||
|
||||
/// Knobs for rendering a single reel segment.
///
/// The type is small and `Copy`, so it can be passed around by value or by
/// reference without ceremony.
#[derive(Clone, Copy, Debug)]
pub struct SegmentOpts {
    /// Output canvas width in pixels.
    pub width: u32,
    /// Output canvas height in pixels.
    pub height: u32,
    /// Output frame rate in frames per second.
    pub fps: u32,
    /// Encode with `h264_nvenc` (and request CUDA hwaccel) instead of
    /// software `libx264`.
    pub nvenc: bool,
    /// Add a slow zoom for motion. Defaulted off until the effect has been
    /// eyeballed on the GPU box: a wrong zoompan expression reads as jitter
    /// and cannot be verified from unit tests.
    pub ken_burns: bool,
}
|
||||
|
||||
impl Default for SegmentOpts {
|
||||
fn default() -> Self {
|
||||
Self {
|
||||
width: REEL_WIDTH,
|
||||
height: REEL_HEIGHT,
|
||||
fps: REEL_FPS,
|
||||
nvenc: false,
|
||||
ken_burns: false,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/// Video filter for a photo segment: fit the image inside the canvas
|
||||
/// (preserving aspect, padding the rest), normalize SAR/fps/pixel format, and
|
||||
/// optionally apply a gentle Ken Burns zoom.
|
||||
pub fn segment_filter(opts: &SegmentOpts, duration: f64) -> String {
|
||||
let (w, h, fps) = (opts.width, opts.height, opts.fps);
|
||||
if opts.ken_burns {
|
||||
// Upscale first so zoompan samples from a larger frame (avoids
|
||||
// shimmer), drift the zoom from 1.0→~1.12 across the segment, hold the
|
||||
// crop centered, then settle to the canvas.
|
||||
let frames = (duration * fps as f64).round().max(1.0) as u64;
|
||||
format!(
|
||||
"scale={w}*2:{h}*2:force_original_aspect_ratio=increase,\
|
||||
crop={w}*2:{h}*2,\
|
||||
zoompan=z='min(zoom+0.0009,1.12)':d={frames}:\
|
||||
x='iw/2-(iw/zoom/2)':y='ih/2-(ih/zoom/2)':s={w}x{h}:fps={fps},\
|
||||
setsar=1,format=yuv420p"
|
||||
)
|
||||
} else {
|
||||
format!(
|
||||
"scale={w}:{h}:force_original_aspect_ratio=decrease,\
|
||||
pad={w}:{h}:(ow-iw)/2:(oh-ih)/2,\
|
||||
setsar=1,fps={fps},format=yuv420p"
|
||||
)
|
||||
}
|
||||
}
|
||||
|
||||
/// Encoder flags for a segment's video stream.
///
/// With `nvenc` the hardware `h264_nvenc` encoder is selected (preset `p4`,
/// `-cq 23`); otherwise software `libx264` (`-crf 21`, preset `veryfast`).
/// Both variants force `yuv420p` output.
fn video_encoder_args(nvenc: bool) -> Vec<String> {
    // p4 is the balanced NVENC preset; cq 23 is treated as roughly equivalent
    // to libx264 crf 21.
    const HARDWARE: [&str; 8] = [
        "-c:v",
        "h264_nvenc",
        "-preset",
        "p4",
        "-cq",
        "23",
        "-pix_fmt",
        "yuv420p",
    ];
    const SOFTWARE: [&str; 8] = [
        "-c:v", "libx264", "-crf", "21", "-preset", "veryfast", "-pix_fmt", "yuv420p",
    ];

    let flags = if nvenc { &HARDWARE } else { &SOFTWARE };
    flags.iter().map(|flag| String::from(*flag)).collect()
}
|
||||
|
||||
/// Build the ffmpeg args that render one photo segment: a still looped for
|
||||
/// `duration` seconds with its narration muxed in. The narration is padded
|
||||
/// with trailing silence (`apad`) so short lines don't end the segment early;
|
||||
/// `-t` bounds both streams to the segment length.
|
||||
pub fn build_segment_args(
|
||||
image_path: &str,
|
||||
audio_path: &str,
|
||||
out_path: &str,
|
||||
duration: f64,
|
||||
opts: &SegmentOpts,
|
||||
) -> Vec<String> {
|
||||
let mut args: Vec<String> = vec!["-y".into()];
|
||||
if opts.nvenc {
|
||||
args.extend(["-hwaccel".into(), "cuda".into()]);
|
||||
}
|
||||
args.extend([
|
||||
"-loop".into(),
|
||||
"1".into(),
|
||||
"-i".into(),
|
||||
image_path.into(),
|
||||
"-i".into(),
|
||||
audio_path.into(),
|
||||
"-filter_complex".into(),
|
||||
format!("[0:v]{}[v];[1:a]apad[a]", segment_filter(opts, duration)),
|
||||
"-map".into(),
|
||||
"[v]".into(),
|
||||
"-map".into(),
|
||||
"[a]".into(),
|
||||
"-t".into(),
|
||||
format!("{duration:.3}"),
|
||||
]);
|
||||
args.extend(video_encoder_args(opts.nvenc));
|
||||
args.extend(
|
||||
["-c:a", "aac", "-b:a", "160k", "-ar", "48000", "-shortest"]
|
||||
.iter()
|
||||
.map(|s| s.to_string()),
|
||||
);
|
||||
args.push(out_path.into());
|
||||
args
|
||||
}
|
||||
|
||||
/// Assemble the ffmpeg argument list that joins rendered segments.
///
/// The concat demuxer reads the segment list at `list_path` and every stream
/// is copied as-is (`-c copy`), so nothing is re-encoded. `+faststart` moves
/// the moov atom to the front of the file so the reel can start streaming
/// immediately on the mobile client. The result is written to `out_path`,
/// overwriting any existing file (`-y`).
pub fn build_concat_args(list_path: &str, out_path: &str) -> Vec<String> {
    let input = ["-y", "-f", "concat", "-safe", "0", "-i", list_path];
    let output = ["-c", "copy", "-movflags", "+faststart", out_path];

    let mut argv = Vec::with_capacity(input.len() + output.len());
    argv.extend(input.iter().map(|arg| arg.to_string()));
    argv.extend(output.iter().map(|arg| arg.to_string()));
    argv
}
|
||||
|
||||
/// Produce the body of the concat list file.
///
/// Every segment path becomes one `file '<path>'` line terminated by a
/// newline. A single quote inside a path is written as `'\''` (close the
/// quote, escaped quote, reopen), which is how ffmpeg's concat syntax expects
/// it. An empty slice yields an empty string.
pub fn build_concat_list(segment_paths: &[String]) -> String {
    segment_paths
        .iter()
        .map(|path| format!("file '{}'\n", path.replace('\'', r"'\''")))
        .collect()
}
|
||||
|
||||
async fn run_ffmpeg(args: &[String], what: &str) -> Result<()> {
|
||||
let output = Command::new("ffmpeg")
|
||||
.args(args)
|
||||
.output()
|
||||
.await
|
||||
.with_context(|| format!("spawning ffmpeg for {what}"))?;
|
||||
if !output.status.success() {
|
||||
bail!(
|
||||
"ffmpeg {what} failed: {}",
|
||||
String::from_utf8_lossy(&output.stderr)
|
||||
);
|
||||
}
|
||||
Ok(())
|
||||
}
|
||||
|
||||
/// Render one photo segment to `out_path`.
|
||||
pub async fn render_segment(
|
||||
image_path: &Path,
|
||||
audio_path: &Path,
|
||||
out_path: &Path,
|
||||
duration: f64,
|
||||
opts: &SegmentOpts,
|
||||
) -> Result<()> {
|
||||
let args = build_segment_args(
|
||||
&image_path.to_string_lossy(),
|
||||
&audio_path.to_string_lossy(),
|
||||
&out_path.to_string_lossy(),
|
||||
duration,
|
||||
opts,
|
||||
);
|
||||
run_ffmpeg(&args, "segment render").await
|
||||
}
|
||||
|
||||
/// Join rendered segments into the final reel. Writes the concat list into the
|
||||
/// same directory as the output so relative paths and cleanup stay local.
|
||||
pub async fn concat_segments(segment_paths: &[String], out_path: &Path) -> Result<()> {
|
||||
let list_path = out_path.with_extension("concat.txt");
|
||||
let body = build_concat_list(segment_paths);
|
||||
tokio::fs::write(&list_path, body)
|
||||
.await
|
||||
.context("writing concat list")?;
|
||||
let args = build_concat_args(&list_path.to_string_lossy(), &out_path.to_string_lossy());
|
||||
let result = run_ffmpeg(&args, "concat").await;
|
||||
let _ = tokio::fs::remove_file(&list_path).await;
|
||||
result
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Flatten an argv into one string so adjacent flags can be matched.
    fn flat(args: &[String]) -> String {
        args.join(" ")
    }

    /// Segment args for a fixed image/audio/output triple.
    fn sample_segment_args(duration: f64, opts: &SegmentOpts) -> String {
        flat(&build_segment_args(
            "/img.jpg", "/a.wav", "/out.mp4", duration, opts,
        ))
    }

    #[test]
    fn segment_duration_floors_short_lines() {
        // Even a one-word (or empty) narration lingers at the floor.
        for narration in [0.5, 0.0] {
            assert_eq!(segment_duration(narration), MIN_SEGMENT_SECONDS);
        }
    }

    #[test]
    fn segment_duration_covers_full_narration_plus_tail() {
        // No ceiling: long lines keep their full length so speech isn't cut.
        for (narration, expected) in [(5.0, 5.6), (20.0, 20.6)] {
            assert!((segment_duration(narration) - expected).abs() < 1e-9);
        }
    }

    #[test]
    fn segment_duration_rejects_nonfinite() {
        for narration in [f64::NAN, f64::INFINITY] {
            assert_eq!(segment_duration(narration), MIN_SEGMENT_SECONDS);
        }
    }

    #[test]
    fn static_filter_fits_and_pads_without_cropping() {
        let filter = segment_filter(&SegmentOpts::default(), 4.0);
        for needle in [
            "force_original_aspect_ratio=decrease",
            "pad=1920:1080",
            "format=yuv420p",
        ] {
            assert!(filter.contains(needle));
        }
        // With ken_burns off there must be no zoompan stage.
        assert!(!filter.contains("zoompan"));
    }

    #[test]
    fn ken_burns_filter_uses_duration_scaled_frame_count() {
        let opts = SegmentOpts {
            ken_burns: true,
            ..SegmentOpts::default()
        };
        // 4 s at 30 fps is 120 frames in the zoompan `d=` term.
        let filter = segment_filter(&opts, 4.0);
        for needle in ["zoompan", "d=120:", "s=1920x1080"] {
            assert!(filter.contains(needle));
        }
    }

    #[test]
    fn segment_args_loop_still_and_bound_with_t() {
        let joined = sample_segment_args(4.0, &SegmentOpts::default());
        for needle in [
            "-loop 1 -i /img.jpg",
            "-i /a.wav",
            "apad",
            "-t 4.000",
            "libx264",
        ] {
            assert!(joined.contains(needle));
        }
        assert!(joined.ends_with("/out.mp4"));
    }

    #[test]
    fn segment_args_use_nvenc_and_cuda_when_enabled() {
        let opts = SegmentOpts {
            nvenc: true,
            ..SegmentOpts::default()
        };
        let joined = sample_segment_args(3.0, &opts);
        assert!(joined.contains("-hwaccel cuda"));
        assert!(joined.contains("h264_nvenc"));
        assert!(!joined.contains("libx264"));
    }

    #[test]
    fn concat_args_stream_copy_with_faststart() {
        let joined = flat(&build_concat_args("/tmp/list.txt", "/out.mp4"));
        for needle in ["-f concat -safe 0 -i /tmp/list.txt", "-c copy", "+faststart"] {
            assert!(joined.contains(needle));
        }
    }

    #[test]
    fn concat_list_escapes_single_quotes() {
        let segments = [
            String::from("/tmp/seg_000.mp4"),
            String::from("/tmp/own's dir/seg_001.mp4"),
        ];
        let body = build_concat_list(&segments);
        assert!(body.contains("file '/tmp/seg_000.mp4'\n"));
        // The apostrophe is closed, escaped and reopened per ffmpeg concat syntax.
        assert!(body.contains(r"own'\''s"));
    }
}
|
||||
Reference in New Issue
Block a user