Files
ImageApi/src/video/ffmpeg.rs
T
Cameron Cordes e3f731b3b2 Add memory-reel backend: on-demand narrated photo slideshow
New POST /reels + GET /reels/{id} (+ /video) build an MP4 slideshow of a
memory span (day/week/month), narrated by the LLM in a cloned voice.

Pipeline (src/reels/): a selector resolves which photos + reel metadata,
the scripter writes one narration line per photo via a single LLM call
(reusing each photo's cached insight as context — no fresh vision calls,
so reel generation stays off the GPU's vision slot), each line is
synthesized to speech, and the renderer assembles stills + narration via
ffmpeg. Jobs run in the background (mirroring the TTS speech-job
registry) since a reel takes minutes; the finished MP4 is cached on disk
keyed by the selection so a repeat request is instant.

The segment model is media-typed (Photo today) so a video-clip segment
(phase 2) and a nightly pre-render (phase 3) slot in without reworking
the pipeline. Ken Burns motion is implemented but defaulted off pending a
visual check on the GPU box.

Supporting changes:
- memories: extract gather_memory_items() so the reel selector reuses the
  exact window/exclusion/tz/sort logic behind /memories.
- ai::tts: add synthesize_serialized() so reel narration honors the same
  single-GPU permit + write lease as user TTS requests.
- video::ffmpeg: make get_duration_seconds() pub for narration timing.
- AppState: reels_path (REELS_DIRECTORY, defaults beside preview clips).

Pure logic (cache key, script parsing, ffmpeg arg/filter construction,
even sampling, segment timing) is unit-tested (26 tests). The runtime
path (ffmpeg render, TTS, LLM) needs a real run on the GPU host to verify
end-to-end — not exercisable in CI.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-12 22:31:08 -04:00

473 lines
17 KiB
Rust

use futures::TryFutureExt;
use log::{debug, error, info, warn};
use std::io::Result;
use std::process::{Output, Stdio};
use std::sync::OnceLock;
use std::time::Instant;
use tokio::process::Command;
/// Process-wide cache for the NVENC probe result. It starts empty and is
/// filled in by `is_nvenc_available` the first time that function runs.
static NVENC_AVAILABLE: OnceLock<bool> = OnceLock::new();
/// Probe ffmpeg for the NVIDIA NVENC H.264 encoder.
///
/// Runs `ffmpeg -hide_banner -encoders` and reports whether `h264_nvenc`
/// appears anywhere in its stdout. If ffmpeg cannot be run at all the
/// encoder is reported as unavailable.
async fn check_nvenc_available() -> bool {
    let probe = Command::new("ffmpeg")
        .args(["-hide_banner", "-encoders"])
        .output()
        .await;

    match probe {
        Ok(listing) => String::from_utf8_lossy(&listing.stdout).contains("h264_nvenc"),
        // ffmpeg missing or not runnable: treat as "no hardware encoder".
        Err(_) => false,
    }
}
/// Report whether NVENC can be used, probing ffmpeg only when the answer is
/// not cached yet.
///
/// The first call that finds the cache empty runs the probe, stores the
/// answer in `NVENC_AVAILABLE`, and logs which encoding path will be used.
pub async fn is_nvenc_available() -> bool {
    match NVENC_AVAILABLE.get() {
        Some(&cached) => cached,
        None => {
            let detected = check_nvenc_available().await;
            // `set` fails only if the cell was filled in the meantime; the
            // locally probed value is still what this call returns.
            let _ = NVENC_AVAILABLE.set(detected);

            let message = if detected {
                "CUDA NVENC hardware acceleration detected and enabled"
            } else {
                "NVENC not available, using CPU encoding"
            };
            info!("{}", message);

            detected
        }
    }
}
/// Stateless handle whose methods shell out to the `ffmpeg` / `ffprobe`
/// command-line tools (see the `impl Ffmpeg` block).
pub struct Ffmpeg;
/// Selects which kind of overview `Ffmpeg::generate_video_gif` produces.
pub enum GifType {
/// Animated GIF assembled from still frames sampled across the video.
Overview,
// NOTE(review): presumably allowed because nothing constructs this variant
// yet — confirm against callers before removing the attribute.
#[allow(dead_code)]
/// Video overview built from one-second slices of the input; `duration` is
/// the intended length of the result in seconds.
OverviewVideo {
duration: u32,
},
}
impl Ffmpeg {
    /// Transcode `input_file` to H.264 with HLS segmenting options
    /// (`-hls_time 3`, `-hls_list_size 100`, scaled to 1080 px wide) and
    /// write the result to `output_file`.
    ///
    /// Not called from the code visible in this file (hence the leading
    /// underscore).
    ///
    /// # Errors
    /// Returns an error only when the ffmpeg process could not be run. A
    /// non-zero ffmpeg exit status is not inspected here.
    async fn _generate_playlist(&self, input_file: &str, output_file: &str) -> Result<String> {
        let ffmpeg_result: Result<Output> = Command::new("ffmpeg")
            .arg("-i")
            .arg(input_file)
            .arg("-c:v")
            .arg("h264")
            .arg("-crf")
            .arg("21")
            .arg("-preset")
            .arg("veryfast")
            .arg("-hls_time")
            .arg("3")
            .arg("-hls_list_size")
            .arg("100")
            .arg("-vf")
            .arg("scale=1080:-2,setsar=1:1")
            .arg(output_file)
            .stdout(Stdio::null())
            .stderr(Stdio::piped())
            .output()
            .inspect_err(|e| error!("Failed to run ffmpeg on child process: {}", e))
            .map_err(|e| std::io::Error::other(e.to_string()))
            .await;

        if let Ok(ref res) = ffmpeg_result {
            debug!("ffmpeg output: {:?}", res);
        }

        ffmpeg_result.map(|_| output_file.to_string())
    }

    /// Ask ffprobe for the container-level duration of `input_file`, in
    /// whole seconds.
    ///
    /// The fractional part is truncated, so clips shorter than one second
    /// report `0`; callers must not divide by the result unguarded.
    ///
    /// # Errors
    /// Fails when ffprobe cannot be run or when its output is not a number
    /// (for example an empty string or `N/A`).
    async fn get_video_duration(&self, input_file: &str) -> Result<u32> {
        let probe = Command::new("ffprobe")
            .args(["-i", input_file])
            .args(["-show_entries", "format=duration"])
            .args(["-v", "quiet"])
            .args(["-of", "csv=p=0"])
            .output()
            .await?;

        let raw = String::from_utf8_lossy(&probe.stdout).trim().to_string();
        debug!("Found video duration: {:?}", raw);

        let seconds = raw
            .parse::<f32>()
            .map_err(|e| std::io::Error::other(e.to_string()))? as u32;
        debug!("Found video duration: {:?}", seconds);

        Ok(seconds)
    }

    /// Create an overview of `input_file` at `output_file`.
    ///
    /// * `GifType::Overview` samples about 20 still frames across the video,
    ///   builds a colour palette from them, and assembles an animated GIF.
    /// * `GifType::OverviewVideo { duration }` stitches one-second slices
    ///   of the input into a silent video roughly `duration` seconds long.
    ///
    /// Generation is best-effort: failures of the ffmpeg pipeline are logged
    /// and the function still returns `Ok(output_file)`.
    ///
    /// # Errors
    /// Returns an error only when the temporary working directory for the
    /// GIF path cannot be created or its path is not valid UTF-8.
    pub async fn generate_video_gif(
        &self,
        input_file: &str,
        output_file: &str,
        gif_type: GifType,
    ) -> Result<String> {
        info!("Creating gif for: '{}'", input_file);

        match gif_type {
            GifType::Overview => {
                // Frames and palette live in a scratch directory that is
                // removed when `temp_dir` is dropped at the end of this arm.
                let temp_dir = tempfile::tempdir()?;
                let temp_path = temp_dir.path().to_str().ok_or_else(|| {
                    std::io::Error::other("temporary directory path is not valid UTF-8")
                })?;

                match self
                    .render_overview_gif(input_file, output_file, temp_path)
                    .await
                {
                    Ok(exit_code) => {
                        if exit_code == 0 {
                            info!("Created gif for '{}' -> '{}'", input_file, output_file);
                        } else {
                            warn!(
                                "Failed to create gif for '{}' with exit code: {}",
                                input_file, exit_code
                            );
                        }
                    }
                    Err(e) => {
                        error!("Error creating gif for '{}': {:?}", input_file, e);
                    }
                }
            }
            GifType::OverviewVideo { duration } => {
                let start = Instant::now();
                match self
                    .render_overview_video(input_file, output_file, duration)
                    .await
                {
                    Ok(out) => info!(
                        "Finished clip '{}' with code {:?} in {:?}",
                        output_file,
                        out.code(),
                        start.elapsed()
                    ),
                    Err(e) => error!("Error creating video overview: {}", e),
                }
            }
        }

        Ok(output_file.to_string())
    }

    /// Run the three-step GIF pipeline (frames, palette, GIF) using
    /// `work_dir` as scratch space; returns the exit code of the final step.
    async fn render_overview_gif(
        &self,
        input_file: &str,
        output_file: &str,
        work_dir: &str,
    ) -> Result<i32> {
        let duration = self.get_video_duration(input_file).await?;
        // Sub-second clips report a duration of 0; `fps=20/0` is not a valid
        // frame rate, so clamp the divisor to at least one second.
        let fps_divisor = duration.max(1);
        let frame_pattern = format!("{}/frame_%03d.jpg", work_dir);

        debug!("Creating gif frames for '{}'", input_file);
        let frames_status = Command::new("ffmpeg")
            .args(["-i", input_file])
            .args(["-vf", format!("fps=20/{}", fps_divisor).as_str()])
            .args(["-q:v", "2"])
            .stderr(Stdio::null())
            .arg(&frame_pattern)
            .status()
            .await?;
        if !frames_status.success() {
            // Keep going: a partially decoded input may still have produced
            // usable frames, and the final step reports failure otherwise.
            warn!(
                "Frame extraction for '{}' finished with {}",
                input_file, frames_status
            );
        }

        debug!("Generating palette");
        let palette_status = Command::new("ffmpeg")
            .args(["-i", frame_pattern.as_str()])
            .args(["-vf", "palettegen"])
            .arg(format!("{}/palette.png", work_dir))
            .stderr(Stdio::null())
            .status()
            .await?;
        if !palette_status.success() {
            warn!(
                "Palette generation for '{}' finished with {}",
                input_file, palette_status
            );
        }

        debug!("Creating gif for: '{}'", input_file);
        self.create_gif_from_frames(work_dir, output_file).await
    }

    /// Encode the silent overview video; returns ffmpeg's exit status.
    async fn render_overview_video(
        &self,
        input_file: &str,
        output_file: &str,
        duration: u32,
    ) -> Result<std::process::ExitStatus> {
        let input_duration = self.get_video_duration(input_file).await?;
        // One second is kept out of every `interval` seconds. Guard both the
        // integer division (a requested duration of 0 would panic) and the
        // result (an input shorter than the requested duration would yield
        // `mod(t,0)`, which selects nothing).
        let interval = (input_duration / duration.max(1)).max(1);

        // Grab 1 second of frames equally spaced to create a 'duration'
        // second long video scaled to 720px on its longest side. `gte` makes
        // square inputs take the width branch; otherwise both dimensions
        // would be `-2` and no side would be pinned to 720.
        let filter = format!(
            "select='lt(mod(t,{}),1)',setpts=N/FRAME_RATE/TB,scale='if(gte(iw,ih),720,-2)':'if(gt(ih,iw),720,-2)'",
            interval
        );

        let status = Command::new("ffmpeg")
            .args(["-i", input_file])
            .args(["-vf", filter.as_str()])
            .arg("-an")
            .arg(output_file)
            .status()
            .await?;

        Ok(status)
    }

    /// Assemble `frame_%03d.jpg` frames and `palette.png` found in
    /// `frame_base_dir` into a looping GIF at `output_file`.
    ///
    /// Returns ffmpeg's exit code, or `-1` when the process ended without
    /// one. A non-zero code is logged together with ffmpeg's stderr but is
    /// not turned into an `Err`.
    ///
    /// # Errors
    /// Fails only when the ffmpeg process could not be run.
    pub async fn create_gif_from_frames(
        &self,
        frame_base_dir: &str,
        output_file: &str,
    ) -> Result<i32> {
        let output = Command::new("ffmpeg")
            .arg("-y")
            .args(["-framerate", "4"])
            .args(["-i", &format!("{}/frame_%03d.jpg", frame_base_dir)])
            .args(["-i", &format!("{}/palette.png", frame_base_dir)])
            .args([
                "-filter_complex",
                // Scale to 480 px wide, center-crop to a square, then map
                // the colours through the generated palette.
                "[0:v]scale=480:-1:flags=lanczos,crop='min(in_w,in_h)':'min(in_w,in_h)':(in_w-out_w)/2:(in_h-out_h)/2, paletteuse",
            ])
            .args(["-loop", "0"]) // loop forever
            .args(["-final_delay", "75"])
            .arg(output_file)
            // Capture both streams so a failure can be logged below.
            .stderr(Stdio::piped())
            .stdout(Stdio::piped())
            .output()
            .await?;

        if !output.status.success() {
            let stderr = String::from_utf8_lossy(&output.stderr);
            error!("FFmpeg error: {}", stderr);
            let stdout = String::from_utf8_lossy(&output.stdout);
            debug!("FFmpeg stdout: {}", stdout);
        } else {
            debug!("FFmpeg successful with exit code: {}", output.status);
        }

        Ok(output.status.code().unwrap_or(-1))
    }
}
/// Probe a video's duration in seconds, as `f64` so interval arithmetic
/// stays precise.
///
/// The container-level `format=duration` entry is tried first. When that
/// yields nothing, the per-stream `stream=duration` entry is tried, because
/// some MP4s carry a duration only at stream level.
///
/// `Ok(None)` means ffprobe ran but no usable duration was found (seen with
/// GoPro `LRV` low-res previews, some fragmented MP4s, and short Snapchat
/// clips with stripped headers). Callers are expected to fall back to a
/// duration-agnostic encode instead of failing: treating the empty output as
/// a parse error used to mark the preview-clip row as failed, and the
/// watcher then re-queued it on every full scan.
pub async fn get_duration_seconds(input_file: &str) -> Result<Option<f64>> {
    match probe_duration(input_file, "format=duration").await? {
        Some(seconds) => Ok(Some(seconds)),
        // No container-level value: ask for the per-stream durations.
        None => probe_duration(input_file, "stream=duration").await,
    }
}
/// Blocking counterpart of `get_duration_seconds`, meant for callers that
/// run on blocking thread pools (Rayon).
///
/// Uses the same two-step lookup: `format=duration` first, then
/// `stream=duration`. Every kind of failure (ffprobe missing, no duration
/// tag in the container, unparsable output) collapses to `None`, so callers
/// can pick a duration-agnostic default.
pub fn get_duration_seconds_blocking(input_file: &std::path::Path) -> Option<f64> {
    probe_duration_blocking(input_file, "format=duration")
        .or_else(|| probe_duration_blocking(input_file, "stream=duration"))
}
/// Run ffprobe synchronously for one `-show_entries` selector and parse the
/// duration it prints.
///
/// Returns `None` when ffprobe cannot be run or prints no usable duration.
fn probe_duration_blocking(input_file: &std::path::Path, show_entries: &str) -> Option<f64> {
    let mut ffprobe = std::process::Command::new("ffprobe");
    ffprobe
        .args([
            "-v",
            "quiet",
            "-show_entries",
            show_entries,
            "-of",
            "csv=p=0",
            "-i",
        ])
        .arg(input_file);

    let captured = ffprobe.output().ok()?;
    parse_ffprobe_duration(&String::from_utf8_lossy(&captured.stdout))
}
/// Run ffprobe asynchronously for one `-show_entries` selector and parse the
/// duration it prints.
///
/// `Ok(None)` means ffprobe ran but printed no usable duration; `Err` means
/// the process itself could not be run.
async fn probe_duration(input_file: &str, show_entries: &str) -> Result<Option<f64>> {
    let ffprobe_args = [
        "-v",
        "quiet",
        "-show_entries",
        show_entries,
        "-of",
        "csv=p=0",
        "-i",
        input_file,
    ];

    let captured = Command::new("ffprobe").args(ffprobe_args).output().await?;
    let text = String::from_utf8_lossy(&captured.stdout);
    Ok(parse_ffprobe_duration(&text))
}
/// Extract a duration from ffprobe's `csv=p=0` output.
///
/// Lines are examined in order and the first one holding a finite, strictly
/// positive number wins; `None` is returned when no line qualifies.
///
/// A format-level query prints a single line, while a stream-level query
/// (`-show_entries stream=duration`) prints one line per stream. Values that
/// must be skipped include `N/A` (stream without a known duration), an empty
/// string (container without the tag), and, rarely, `0` or `-1` from
/// fragmented MP4s. Mapping all of those to `None` lets the caller fall back
/// to a duration-agnostic encode.
fn parse_ffprobe_duration(stdout: &str) -> Option<f64> {
    stdout
        .lines()
        .map(str::trim)
        .filter(|entry| !entry.is_empty() && *entry != "N/A")
        .filter_map(|entry| entry.parse::<f64>().ok())
        .find(|seconds| seconds.is_finite() && *seconds > 0.0)
}
/// Build a short 480p preview (H.264 video, AAC audio) of a video file.
///
/// The preview is made of up to ten one-second slices spread evenly over the
/// source, giving roughly ten seconds of output. Sources shorter than ten
/// seconds use one slice per whole second, sources shorter than one second
/// are transcoded in full, and sources whose duration cannot be probed are
/// transcoded with a ten-second cap.
///
/// Returns `(duration_seconds, file_size_bytes)` for the written preview.
///
/// # Errors
/// Fails when probing or running ffmpeg fails, when the output directory
/// cannot be created, when ffmpeg exits unsuccessfully (its stderr is
/// included in the error), or when the output file cannot be inspected.
pub async fn generate_preview_clip(input_file: &str, output_file: &str) -> Result<(f64, u64)> {
    info!("Generating preview clip for: '{}'", input_file);
    let started = Instant::now();

    let probed = get_duration_seconds(input_file).await?;
    let nvenc = is_nvenc_available().await;

    // ffmpeg does not create missing directories for its output.
    if let Some(dir) = std::path::Path::new(output_file).parent() {
        std::fs::create_dir_all(dir)?;
    }

    // Pick the filter arguments together with the clip length reported back
    // to the caller, so nobody has to re-probe the finished file.
    let (filter_args, clip_duration): (Vec<String>, f64) = match probed {
        // ffprobe could not tell: behave like the sub-second case, but cap
        // the encode at 10s so a long video with stripped duration metadata
        // does not take forever to "preview".
        None => {
            warn!(
                "Unknown duration for '{}', transcoding whole file as preview",
                input_file
            );
            (
                vec![
                    String::from("-vf"),
                    String::from("scale=-2:480,format=yuv420p"),
                    String::from("-t"),
                    String::from("10"),
                ],
                10.0,
            )
        }
        // Too short to slice: transcode the whole thing.
        Some(d) if d < 1.0 => (
            vec![
                String::from("-vf"),
                String::from("scale=-2:480,format=yuv420p"),
            ],
            d,
        ),
        // Keep the first second out of every `interval` seconds.
        Some(d) => {
            let segments = if d < 10.0 { d.floor() } else { 10.0 };
            let interval = d / segments;
            (
                vec![
                    String::from("-vf"),
                    format!(
                        "select='lt(mod(t,{:.4}),1)',setpts=N/FRAME_RATE/TB,fps=30,scale=-2:480,format=yuv420p",
                        interval
                    ),
                    String::from("-af"),
                    format!("aselect='lt(mod(t,{:.4}),1)',asetpts=N/SR/TB", interval),
                ],
                segments,
            )
        }
    };

    let mut ffmpeg = Command::new("ffmpeg");
    ffmpeg.arg("-y");
    // CUDA-accelerated decoding goes hand in hand with the NVENC encoder.
    if nvenc {
        ffmpeg.args(["-hwaccel", "cuda"]);
    }
    ffmpeg.arg("-i").arg(input_file);
    ffmpeg.args(&filter_args);

    // Pin the output to 30fps so 60fps sources do not play back at double
    // speed because of select/setpts timestamp mismatches.
    ffmpeg.args(["-r", "30"]);

    // Hardware encoder when present, libx264 otherwise.
    let encoder_args: [&str; 6] = if nvenc {
        ["-c:v", "h264_nvenc", "-preset", "p4", "-cq:v", "28"]
    } else {
        ["-c:v", "libx264", "-crf", "28", "-preset", "veryfast"]
    };
    ffmpeg.args(encoder_args);
    ffmpeg.args(["-c:a", "aac"]);
    ffmpeg.arg(output_file);
    ffmpeg.stdout(Stdio::null());
    ffmpeg.stderr(Stdio::piped());

    let finished = ffmpeg.output().await?;
    if !finished.status.success() {
        let stderr = String::from_utf8_lossy(&finished.stderr);
        return Err(std::io::Error::other(format!(
            "ffmpeg preview generation failed: {}",
            stderr
        )));
    }

    let file_size = std::fs::metadata(output_file)?.len();
    info!(
        "Generated preview clip '{}' ({:.1}s, {} bytes) in {:?}",
        output_file,
        clip_duration,
        file_size,
        started.elapsed()
    );

    Ok((clip_duration, file_size))
}
// Unit tests for `parse_ffprobe_duration`, the only pure function in this
// file; everything else shells out to ffmpeg/ffprobe.
#[cfg(test)]
mod tests {
use super::parse_ffprobe_duration;
#[test]
fn empty_output_returns_none() {
// The original bug: ffprobe -show_entries format=duration returned
// "" for some GoPro LRV files, and `parse::<f64>` failed with
// "cannot parse float from empty string".
assert_eq!(parse_ffprobe_duration(""), None);
assert_eq!(parse_ffprobe_duration("\n"), None);
assert_eq!(parse_ffprobe_duration(" \n \n"), None);
}
#[test]
fn na_returns_none() {
// ffprobe emits "N/A" for streams without a known duration.
assert_eq!(parse_ffprobe_duration("N/A"), None);
assert_eq!(parse_ffprobe_duration("N/A\nN/A\n"), None);
}
#[test]
fn parses_simple_duration() {
// A single value, with or without a trailing newline, parses as-is.
assert_eq!(parse_ffprobe_duration("12.345"), Some(12.345));
assert_eq!(parse_ffprobe_duration("12.345\n"), Some(12.345));
assert_eq!(parse_ffprobe_duration("0.5"), Some(0.5));
}
#[test]
fn rejects_non_positive_durations() {
// Fragmented MP4s and broken containers occasionally report 0 or a
// negative duration. Treat as "unknown" so the caller falls back to
// whole-file transcoding rather than dividing by zero downstream.
assert_eq!(parse_ffprobe_duration("0"), None);
assert_eq!(parse_ffprobe_duration("0.0"), None);
assert_eq!(parse_ffprobe_duration("-1.5"), None);
}
#[test]
fn rejects_non_finite_durations() {
// `f64::from_str` accepts "inf" and "nan"; neither is a usable duration.
assert_eq!(parse_ffprobe_duration("inf"), None);
assert_eq!(parse_ffprobe_duration("nan"), None);
}
#[test]
fn first_valid_line_wins_for_stream_query() {
// `-show_entries stream=duration` emits one value per stream. For a
// video file the video stream is first; we accept it and ignore
// any audio-stream values that follow.
assert_eq!(parse_ffprobe_duration("12.5\n8.3\n"), Some(12.5));
}
#[test]
fn skips_leading_na_and_blank_lines() {
// Stream queries can put N/A first (e.g. data stream before the
// video stream); the parser should keep scanning.
assert_eq!(parse_ffprobe_duration("N/A\n\n7.25\n"), Some(7.25));
}
#[test]
fn rejects_garbage() {
// Anything that is not a complete number is ignored, including a
// number with trailing characters.
assert_eq!(parse_ffprobe_duration("not a number"), None);
assert_eq!(parse_ffprobe_duration("12.5abc"), None);
}
}