//! ffmpeg assembly for memory reels. //! //! Two-stage, per-segment design: each segment is rendered to its own //! normalized MP4 (identical codec/resolution/fps/timebase), then the segments //! are joined with the concat demuxer (stream copy, no re-encode). Rendering //! per segment — rather than one monster filtergraph — keeps each ffmpeg //! invocation simple to reason about, parallelizes naturally, and means a //! video-clip segment type (phase 2) slots in as just a different per-segment //! builder without touching the concat stage. //! //! The arg builders are pure (`Vec` out) so the exact ffmpeg command //! is unit-testable; the runners spawn ffmpeg and surface stderr on failure. use anyhow::{Context, Result, bail}; use std::path::Path; use tokio::process::Command; /// Re-exported so the reel pipeline reaches NVENC detection through this module /// rather than depending on `video::ffmpeg` directly. pub use crate::video::ffmpeg::is_nvenc_available; /// Reel canvas. Portrait, because reels are watched on a phone held upright — /// a landscape canvas letterboxes to a thin ~25%-height band there. Each photo /// is fitted sharp and centered over a blurred, zoomed copy of itself (see /// [`photo_filter_chain`]) so the frame is always filled regardless of the /// photo's orientation, without cropping the subject. pub const REEL_WIDTH: u32 = 1080; pub const REEL_HEIGHT: u32 = 1920; pub const REEL_FPS: u32 = 30; /// A beat's screen time is its narration length plus a short breath, with a /// floor so a terse line still lingers. No ceiling: the beat always covers the /// full narration so speech is never truncated — the scripter is asked to keep /// lines short instead. pub const MIN_SEGMENT_SECONDS: f64 = 2.5; const NARRATION_TAIL_SECONDS: f64 = 0.6; /// Fade durations baked into each photo. A held (single-photo) beat gets a /// gentle dip; burst photos get a much snappier fade so the difference between /// a held shot and a quick burst is obvious. 
const SINGLE_FADE_SECONDS: f64 = 0.35;
const BURST_FADE_SECONDS: f64 = 0.08;

/// Video-clip framing. A clip plays at most this long, with its live audio
/// ducked to `CLIP_DUCK_VOLUME` under the narration.
pub const CLIP_SECONDS: f64 = 5.0;
const CLIP_DUCK_VOLUME: f64 = 0.35;

/// Floor on how long each burst photo stays up, so a long line over many photos
/// doesn't flash them subliminally. If the narration is too short to give every
/// photo this much, the beat is stretched to fit.
const MIN_BURST_PHOTO_SECONDS: f64 = 0.6;

/// Base screen time for a beat given its narration length: narration + breath,
/// floored. Used as the lower bound on a beat's total duration.
///
/// A non-finite input (NaN or ±infinity) also yields the floor, so a bad
/// duration measurement can never leak into an ffmpeg `-t` argument.
pub fn segment_duration(narration_secs: f64) -> f64 {
    let d = narration_secs + NARRATION_TAIL_SECONDS;
    if d.is_finite() && d > MIN_SEGMENT_SECONDS {
        d
    } else {
        MIN_SEGMENT_SECONDS
    }
}

/// Split a beat into per-photo durations. The beat lasts at least its narration
/// (so speech isn't cut) and at least `n × MIN_BURST_PHOTO_SECONDS` (so a fast
/// burst stays legible); the photos share that total evenly. Returns
/// `(total_seconds, per_photo_seconds)`.
///
/// `n_photos == 0` is treated as one photo, so the returned vector is never
/// empty and the division below is always well defined.
pub fn beat_durations(narration_secs: f64, n_photos: usize) -> (f64, Vec<f64>) {
    let n = n_photos.max(1);
    let base = segment_duration(narration_secs);
    let min_total = n as f64 * MIN_BURST_PHOTO_SECONDS;
    // Both operands are finite (`segment_duration` guarantees it), so a plain
    // comparison is enough to pick the larger one.
    let total = if base > min_total { base } else { min_total };
    let each = total / n as f64;
    (total, vec![each; n])
}

/// Fade length to use for a beat of `n_photos` (gentle when held, snappy in a
/// burst).
fn fade_for(n_photos: usize) -> f64 {
    if n_photos > 1 {
        BURST_FADE_SECONDS
    } else {
        SINGLE_FADE_SECONDS
    }
}

/// Options controlling per-segment rendering.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct SegmentOpts {
    /// Output canvas width in pixels.
    pub width: u32,
    /// Output canvas height in pixels.
    pub height: u32,
    /// Constant output frame rate.
    pub fps: u32,
    /// Encode with `h264_nvenc` (and request CUDA hwaccel) instead of libx264.
    pub nvenc: bool,
}

impl Default for SegmentOpts {
    /// The standard reel canvas (`REEL_WIDTH` × `REEL_HEIGHT` at `REEL_FPS`)
    /// with software encoding.
    fn default() -> Self {
        Self {
            width: REEL_WIDTH,
            height: REEL_HEIGHT,
            fps: REEL_FPS,
            nvenc: false,
        }
    }
}

/// Filter chain for one photo (input `idx`) producing the labelled output
/// `[v{idx}]`. Splits the still into a background and foreground: the background
/// is scaled to *cover* the canvas and heavily blurred; the foreground is
/// scaled to *fit* and overlaid centered. This fills the portrait frame for any
/// photo orientation — no black bars, no cropping of the subject — then a fade
/// in/out softens the cut. Intermediate labels are suffixed with `idx` so
/// several chains coexist in one `filter_complex`.
///
/// `fps` is normalized BEFORE the fades so the brightness ramp is computed on a
/// true {fps}-frame timeline; otherwise the fade is sampled at the looped
/// still's coarse cadence and duplicated up, which reads as a steppy dip.
///
/// `duration` is this photo's slice of the beat and `fade` the length of each
/// fade; the fade-out start is clamped at 0 so a slice shorter than the fade
/// never produces a negative start time.
fn photo_filter_chain(idx: usize, opts: &SegmentOpts, duration: f64, fade: f64) -> String {
    let (w, h, fps) = (opts.width, opts.height, opts.fps);
    let fade_out_start = (duration - fade).max(0.0);
    // Each `\` at end of line continues the literal and skips the next line's
    // leading whitespace, so the graph is emitted with no embedded spaces.
    format!(
        "[{idx}:v]split=2[bg{idx}][fg{idx}];\
         [bg{idx}]scale={w}:{h}:force_original_aspect_ratio=increase,\
         crop={w}:{h},boxblur=20:2[bgb{idx}];\
         [fg{idx}]scale={w}:{h}:force_original_aspect_ratio=decrease[fgs{idx}];\
         [bgb{idx}][fgs{idx}]overlay=(W-w)/2:(H-h)/2,\
         fps={fps},\
         fade=t=in:st=0:d={fade},\
         fade=t=out:st={fade_out_start:.3}:d={fade},\
         setsar=1,format=yuv420p[v{idx}]"
    )
}

/// Full `filter_complex` for a beat of `per_photo` durations: one chain per
/// photo, concatenated into `[v]`, with the narration (the last input, index
/// `per_photo.len()`) padded with trailing silence into `[a]`. A single-photo
/// beat degenerates to one chain + `concat=n=1` (a passthrough).
pub fn beat_filtergraph(opts: &SegmentOpts, per_photo: &[f64]) -> String { let n = per_photo.len().max(1); let fade = fade_for(n); let chains: Vec = per_photo .iter() .enumerate() .map(|(i, &d)| photo_filter_chain(i, opts, d, fade)) .collect(); let concat_inputs: String = (0..n).map(|i| format!("[v{i}]")).collect(); format!( "{chains};{concat_inputs}concat=n={n}:v=1:a=0[v];[{n}:a]apad[a]", chains = chains.join(";") ) } fn video_encoder_args(nvenc: bool) -> Vec { if nvenc { // p4 ≈ balanced; cq 23 ≈ libx264 crf 21. Matches the HLS transcode path. [ "-c:v", "h264_nvenc", "-preset", "p4", "-cq", "23", "-pix_fmt", "yuv420p", ] } else { [ "-c:v", "libx264", "-crf", "21", "-preset", "veryfast", "-pix_fmt", "yuv420p", ] } .iter() .map(|s| s.to_string()) .collect() } /// Build the ffmpeg args that render one beat: each photo looped for its slice /// of the beat (filled to the portrait canvas with a blurred backdrop), the /// slices concatenated, and the single narration muxed over the whole thing. /// `total` bounds the output (and the apad'd audio) to the beat length. pub fn build_beat_args( image_paths: &[String], audio_path: &str, out_path: &str, per_photo: &[f64], total: f64, opts: &SegmentOpts, ) -> Vec { let fps = opts.fps.to_string(); let mut args: Vec = vec!["-y".into()]; if opts.nvenc { args.extend(["-hwaccel".into(), "cuda".into()]); } // One looped-still input per photo, each bounded to its slice by an input // `-t`; reading at the target `-framerate` gives the fades real frames to // ramp across. 
for (path, &dur) in image_paths.iter().zip(per_photo.iter()) { args.extend([ "-framerate".into(), fps.clone(), "-loop".into(), "1".into(), "-t".into(), format!("{dur:.3}"), "-i".into(), path.clone(), ]); } args.extend([ "-i".into(), audio_path.into(), "-filter_complex".into(), beat_filtergraph(opts, per_photo), "-map".into(), "[v]".into(), "-map".into(), "[a]".into(), "-t".into(), format!("{total:.3}"), // Force constant frame rate so the beat (and the concatenated reel) // plays at a steady {fps} rather than a variable cadence. "-r".into(), fps, ]); args.extend(video_encoder_args(opts.nvenc)); args.extend( ["-c:a", "aac", "-b:a", "160k", "-ar", "48000", "-shortest"] .iter() .map(|s| s.to_string()), ); args.push(out_path.into()); args } /// Build the concat-demuxer args that join rendered segments losslessly. /// `+faststart` moves the moov atom up front so the reel streams immediately /// on the mobile client. The output muxer is forced with `-f mp4` because we /// write to a `.tmp` path (atomic publish) whose extension ffmpeg can't map to /// a format on its own. pub fn build_concat_args(list_path: &str, out_path: &str) -> Vec { [ "-y", "-f", "concat", "-safe", "0", "-i", list_path, "-c", "copy", "-movflags", "+faststart", "-f", "mp4", out_path, ] .iter() .map(|s| s.to_string()) .collect() } /// Render the concat list file body. Each line points the demuxer at one /// segment; single quotes in paths are escaped per ffmpeg's concat syntax. 
pub fn build_concat_list(segment_paths: &[String]) -> String { let mut out = String::new(); for p in segment_paths { let escaped = p.replace('\'', r"'\''"); out.push_str(&format!("file '{escaped}'\n")); } out } async fn run_ffmpeg(args: &[String], what: &str) -> Result<()> { let output = Command::new("ffmpeg") .args(args) .output() .await .with_context(|| format!("spawning ffmpeg for {what}"))?; if !output.status.success() { bail!( "ffmpeg {what} failed: {}", String::from_utf8_lossy(&output.stderr) ); } Ok(()) } /// Render one beat to `out_path`: its photos shown in sequence (a held shot for /// one photo, a quick burst for several) under the single narration in /// `audio_path`, whose measured length sets the beat's pacing. pub async fn render_beat( image_paths: &[std::path::PathBuf], audio_path: &Path, out_path: &Path, narration_secs: f64, opts: &SegmentOpts, ) -> Result<()> { if image_paths.is_empty() { bail!("render_beat called with no images"); } let (total, per_photo) = beat_durations(narration_secs, image_paths.len()); let paths: Vec = image_paths .iter() .map(|p| p.to_string_lossy().to_string()) .collect(); let args = build_beat_args( &paths, &audio_path.to_string_lossy(), &out_path.to_string_lossy(), &per_photo, total, opts, ); run_ffmpeg(&args, "beat render").await } // --- Video-clip beats -------------------------------------------------------- /// Video chain for a clip beat: fill the clip to the portrait canvas (blurred /// backdrop, same look as photos), normalize fps, hold the last frame if the /// narration outlasts the clip (`tpad`), then fade. Produces `[v]`. fn clip_video_filter(opts: &SegmentOpts, clip_dur: f64, beat_total: f64) -> String { let (w, h, fps) = (opts.width, opts.height, opts.fps); let fade = SINGLE_FADE_SECONDS; let hold = (beat_total - clip_dur).max(0.0); let fade_out_start = (beat_total - fade).max(0.0); // Freeze the final frame to cover narration that runs past the clip. 
let tpad = if hold > 0.05 { format!(",tpad=stop_mode=clone:stop_duration={hold:.3}") } else { String::new() }; format!( "[0:v]split=2[bg][fg];\ [bg]scale={w}:{h}:force_original_aspect_ratio=increase,\ crop={w}:{h},boxblur=20:2[bgb];\ [fg]scale={w}:{h}:force_original_aspect_ratio=decrease[fgs];\ [bgb][fgs]overlay=(W-w)/2:(H-h)/2,fps={fps}{tpad},\ fade=t=in:st=0:d={fade},fade=t=out:st={fade_out_start:.3}:d={fade},\ setsar=1,format=yuv420p[v]" ) } /// Audio chain for a clip beat. With a clip audio track, duck it under the /// narration and mix; without one, just the narration. Produces `[a]`. fn clip_audio_filter(has_audio: bool) -> String { if has_audio { format!( "[0:a]volume={CLIP_DUCK_VOLUME}[duck];[1:a]apad[narr];\ [duck][narr]amix=inputs=2:duration=longest:normalize=0[a]" ) } else { "[1:a]apad[a]".to_string() } } /// Full `filter_complex` for a clip beat (input 0 = clip, input 1 = narration). pub fn clip_beat_filtergraph( opts: &SegmentOpts, clip_dur: f64, beat_total: f64, has_audio: bool, ) -> String { format!( "{};{}", clip_video_filter(opts, clip_dur, beat_total), clip_audio_filter(has_audio) ) } /// Build the ffmpeg args for a clip beat: the first `clip_dur` seconds of the /// source video, filled to the portrait canvas with its live audio ducked under /// the narration, bounded to `beat_total`. pub fn build_clip_beat_args( clip_path: &str, audio_path: &str, out_path: &str, clip_dur: f64, beat_total: f64, has_audio: bool, opts: &SegmentOpts, ) -> Vec { let fps = opts.fps.to_string(); let mut args: Vec = vec!["-y".into()]; if opts.nvenc { args.extend(["-hwaccel".into(), "cuda".into()]); } args.extend([ // Input `-t` limits the clip to its window; audio has none (apad fills). 
"-t".into(), format!("{clip_dur:.3}"), "-i".into(), clip_path.into(), "-i".into(), audio_path.into(), "-filter_complex".into(), clip_beat_filtergraph(opts, clip_dur, beat_total, has_audio), "-map".into(), "[v]".into(), "-map".into(), "[a]".into(), "-t".into(), format!("{beat_total:.3}"), "-r".into(), fps, ]); args.extend(video_encoder_args(opts.nvenc)); args.extend( ["-c:a", "aac", "-b:a", "160k", "-ar", "48000"] .iter() .map(|s| s.to_string()), ); args.push(out_path.into()); args } /// Whether a media file has at least one audio stream (so a clip beat knows /// whether to mix in live audio). Defaults to `false` on any probe failure. pub async fn has_audio_stream(path: &str) -> bool { Command::new("ffprobe") .args([ "-v", "error", "-select_streams", "a", "-show_entries", "stream=index", "-of", "csv=p=0", path, ]) .output() .await .map(|out| !out.stdout.is_empty()) .unwrap_or(false) } /// Render one clip beat: a section of `clip_path` (capped at [`CLIP_SECONDS`], /// and to the source length) under the narration in `audio_path`. The beat /// lasts at least the narration, freezing the clip's last frame if needed. pub async fn render_clip_beat( clip_path: &Path, audio_path: &Path, out_path: &Path, narration_secs: f64, opts: &SegmentOpts, ) -> Result<()> { let clip_str = clip_path.to_string_lossy().to_string(); // Clamp the clip to its own length so a short video isn't padded to the cap. let source_dur = crate::video::ffmpeg::get_duration_seconds(&clip_str) .await .ok() .flatten(); let clip_dur = match source_dur { Some(d) if d > 0.0 && d < CLIP_SECONDS => d, _ => CLIP_SECONDS, }; let beat_total = clip_dur.max(segment_duration(narration_secs)); let has_audio = has_audio_stream(&clip_str).await; let args = build_clip_beat_args( &clip_str, &audio_path.to_string_lossy(), &out_path.to_string_lossy(), clip_dur, beat_total, has_audio, opts, ); run_ffmpeg(&args, "clip beat render").await } /// Join rendered segments into the final reel. 
Writes the concat list into the /// same directory as the output so relative paths and cleanup stay local. pub async fn concat_segments(segment_paths: &[String], out_path: &Path) -> Result<()> { let list_path = out_path.with_extension("concat.txt"); let body = build_concat_list(segment_paths); tokio::fs::write(&list_path, body) .await .context("writing concat list")?; let args = build_concat_args(&list_path.to_string_lossy(), &out_path.to_string_lossy()); let result = run_ffmpeg(&args, "concat").await; let _ = tokio::fs::remove_file(&list_path).await; result } #[cfg(test)] mod tests { use super::*; #[test] fn segment_duration_floors_short_lines() { // A one-word narration still lingers at the floor. assert_eq!(segment_duration(0.5), MIN_SEGMENT_SECONDS); assert_eq!(segment_duration(0.0), MIN_SEGMENT_SECONDS); } #[test] fn segment_duration_covers_full_narration_plus_tail() { // No ceiling: a long line gets its full length so speech isn't cut. assert!((segment_duration(5.0) - 5.6).abs() < 1e-9); assert!((segment_duration(20.0) - 20.6).abs() < 1e-9); } #[test] fn segment_duration_rejects_nonfinite() { assert_eq!(segment_duration(f64::NAN), MIN_SEGMENT_SECONDS); assert_eq!(segment_duration(f64::INFINITY), MIN_SEGMENT_SECONDS); } #[test] fn beat_durations_single_photo_matches_base() { let (total, per) = beat_durations(4.0, 1); assert!((total - 4.6).abs() < 1e-9); // narration + tail assert_eq!(per.len(), 1); assert!((per[0] - 4.6).abs() < 1e-9); } #[test] fn beat_durations_burst_splits_evenly() { // 5 photos, narration 4.6s base → ~0.92s each (above the 0.6 floor). 
let (total, per) = beat_durations(4.0, 5); assert!((total - 4.6).abs() < 1e-9); assert_eq!(per.len(), 5); assert!((per.iter().sum::() - total).abs() < 1e-9); assert!(per.iter().all(|&d| d >= MIN_BURST_PHOTO_SECONDS)); } #[test] fn beat_durations_stretches_when_narration_too_short_for_burst() { // Floor narration (2.5s) over 10 photos would be 0.25s each — below the // legibility floor, so the beat stretches to 10 × 0.6 = 6s. let (total, per) = beat_durations(0.0, 10); assert!((total - 6.0).abs() < 1e-9); assert!(per.iter().all(|&d| (d - 0.6).abs() < 1e-9)); } #[test] fn beat_filtergraph_single_photo_fills_portrait_and_holds() { let (_t, per) = beat_durations(4.0, 1); let g = beat_filtergraph(&SegmentOpts::default(), &per); assert!(g.contains("[0:v]split=2[bg0][fg0]")); assert!(g.contains("scale=1080:1920:force_original_aspect_ratio=increase")); assert!(g.contains("crop=1080:1920")); assert!(g.contains("scale=1080:1920:force_original_aspect_ratio=decrease")); assert!(g.contains("overlay=(W-w)/2:(H-h)/2")); // Single photo → concat of one, gentle fade, audio is input 1. assert!(g.contains("concat=n=1:v=1:a=0[v]")); assert!(g.contains("d=0.35")); // SINGLE_FADE assert!(g.contains("[1:a]apad[a]")); } #[test] fn beat_filtergraph_burst_chains_concats_and_snappy_fade() { let (_t, per) = beat_durations(4.0, 3); let g = beat_filtergraph(&SegmentOpts::default(), &per); // One chain per photo with index-suffixed labels. assert!(g.contains("[0:v]split") && g.contains("[1:v]split") && g.contains("[2:v]split")); // Concatenated in order, audio is the 4th input (index 3). assert!(g.contains("[v0][v1][v2]concat=n=3:v=1:a=0[v]")); assert!(g.contains("[3:a]apad[a]")); // Burst uses the much snappier fade (vs 0.35 for a held shot). assert!(g.contains("d=0.08")); assert!(!g.contains("d=0.35")); } #[test] fn beat_filtergraph_normalizes_fps_before_fading() { // fps must precede the fades on every chain (else the dip looks steppy). 
let (_t, per) = beat_durations(4.0, 1); let g = beat_filtergraph(&SegmentOpts::default(), &per); let fps_at = g.find("fps=30").expect("fps in graph"); let fade_at = g.find("fade=t=in").expect("fade in graph"); assert!(fps_at < fade_at); } #[test] fn beat_args_one_input_per_photo_plus_audio_bound_by_total() { let (total, per) = beat_durations(4.0, 2); let args = build_beat_args( &["/a.jpg".into(), "/b.jpg".into()], "/n.wav", "/out.mp4", &per, total, &SegmentOpts::default(), ); let joined = args.join(" "); // A looped-still input per photo, each with its slice -t, then the audio. assert!(joined.contains("-framerate 30 -loop 1 -t 2.300 -i /a.jpg")); assert!(joined.contains("-framerate 30 -loop 1 -t 2.300 -i /b.jpg")); assert!(joined.contains("-i /n.wav")); // Output bounded to the beat total and forced CFR. assert!(joined.contains("-t 4.600")); assert!(joined.contains("-r 30")); assert!(joined.ends_with("/out.mp4")); } #[test] fn beat_args_use_nvenc_and_cuda_when_enabled() { let opts = SegmentOpts { nvenc: true, ..SegmentOpts::default() }; let (total, per) = beat_durations(3.0, 1); let args = build_beat_args( &["/img.jpg".into()], "/a.wav", "/out.mp4", &per, total, &opts, ); let joined = args.join(" "); assert!(joined.contains("-hwaccel cuda")); assert!(joined.contains("h264_nvenc")); assert!(!joined.contains("libx264")); } #[test] fn clip_filter_ducks_audio_and_holds_last_frame_when_narration_longer() { // 5s clip, 7s beat → 2s freeze of the last frame, ducked-audio mix. let g = clip_beat_filtergraph(&SegmentOpts::default(), 5.0, 7.0, true); assert!(g.contains("tpad=stop_mode=clone:stop_duration=2.000")); assert!(g.contains("volume=0.35")); assert!(g.contains("amix=inputs=2")); assert!(g.contains("[1:a]apad[narr]")); // Fill applied to the clip too. assert!(g.contains("boxblur")); assert!(g.contains("overlay=(W-w)/2:(H-h)/2")); } #[test] fn clip_filter_no_tpad_when_clip_covers_the_beat() { // Clip at least as long as the beat → no freeze. 
let g = clip_beat_filtergraph(&SegmentOpts::default(), 5.0, 5.0, true); assert!(!g.contains("tpad")); } #[test] fn clip_filter_narration_only_without_clip_audio() { let g = clip_beat_filtergraph(&SegmentOpts::default(), 5.0, 5.0, false); assert!(!g.contains("amix")); assert!(!g.contains("volume=")); assert!(g.contains("[1:a]apad[a]")); } #[test] fn clip_beat_args_bound_clip_and_output() { let args = build_clip_beat_args( "/v.mp4", "/n.wav", "/out.mp4", 5.0, 6.6, true, &SegmentOpts::default(), ); let joined = args.join(" "); // Input -t bounds the clip read; output -t bounds the beat. assert!(joined.contains("-t 5.000 -i /v.mp4")); assert!(joined.contains("-i /n.wav")); assert!(joined.contains("-t 6.600")); assert!(joined.contains("-r 30")); assert!(joined.ends_with("/out.mp4")); } #[test] fn concat_args_stream_copy_with_faststart_and_forced_muxer() { // Output goes to a .tmp path, so the muxer must be forced — ffmpeg // can't infer mp4 from the extension (the bug this guards against). let args = build_concat_args("/tmp/list.txt", "/out.mp4.tmp"); let joined = args.join(" "); assert!(joined.contains("-f concat -safe 0 -i /tmp/list.txt")); assert!(joined.contains("-c copy")); assert!(joined.contains("+faststart")); assert!(joined.contains("-f mp4")); // The forced muxer must come before the output path. let f_mp4 = args.windows(2).position(|w| w == ["-f", "mp4"]).unwrap(); let out = args.iter().position(|a| a == "/out.mp4.tmp").unwrap(); assert!(f_mp4 < out); } #[test] fn concat_list_escapes_single_quotes() { let body = build_concat_list(&[ "/tmp/seg_000.mp4".into(), "/tmp/own's dir/seg_001.mp4".into(), ]); assert!(body.contains("file '/tmp/seg_000.mp4'\n")); // The apostrophe is closed-escaped-reopened per ffmpeg concat syntax. assert!(body.contains(r"own'\''s")); } }