From 6e90f24307209561f626d30a39af2e09afcb97e0 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Fri, 12 Jun 2026 23:43:18 -0400 Subject: [PATCH] Reels: burst beats + duration budget for week/month, plus step logging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Restructures a reel around beats — one narration line over one or more photos — instead of one line per photo. A single-photo beat is a held shot; a multi-photo beat is a quick burst that flashes through several moments of an event while the line is read. So a week/month reel can show everything it spans without a narrated (and timed) segment per photo. Selection (selector.rs): - Duration budget: cap the number of narrated beats to ~REEL_TARGET_SECONDS (default 90, env-tunable) so week/month reels don't run minutes long. - Event clustering by time gap; when there are more events than the beat budget, adjacent events merge so the whole span stays covered. Each beat bursts up to MAX_BURST_PHOTOS (an even spread), so a 40-shot dinner contributes a handful of quick frames, not forty narrated seconds. Render (render.rs): a beat renders its photos as a concat of per-photo fills (blurred-bg portrait, fps-before-fade) under one muxed narration; burst photos get a snappier fade. beat_durations splits the narration across the photos, stretching only if a long burst would flash too fast. Adds high-level info logs across the steps (request → script → per-beat narrate/render → join → done with elapsed) for visibility. Bumps RENDER_VERSION to re-render cached reels. 
Co-Authored-By: Claude Fable 5 --- src/reels/mod.rs | 131 ++++++++++++++----- src/reels/render.rs | 296 ++++++++++++++++++++++++++++-------------- src/reels/script.rs | 95 +++++++++----- src/reels/selector.rs | 262 ++++++++++++++++++++++++++++++++----- 4 files changed, 580 insertions(+), 204 deletions(-) diff --git a/src/reels/mod.rs b/src/reels/mod.rs index 4cfe24b..be3f52d 100644 --- a/src/reels/mod.rs +++ b/src/reels/mod.rs @@ -36,24 +36,27 @@ use crate::otel::extract_context_from_request; use crate::state::AppState; use selector::ReelSelector; -/// The media behind one reel segment. Photos-only for now; a `Clip` variant -/// (a section of a source video) is the phase-2 extension point. +/// The media behind one shot. Photos-only for now; a `Clip` variant (a section +/// of a source video) is the phase-2 extension point. #[derive(Debug, Clone)] pub enum SegmentMedia { Photo { rel_path: String, library_id: i32 }, } -/// A segment before narration: which photo, when it was taken, and any cached -/// insight to feed the scripter. +/// A beat: one narration line over one or more photos. A single-photo beat is a +/// held shot; a multi-photo beat is a quick burst that flashes through several +/// moments of the same event while the line is read — so a week/month reel can +/// *show* everything it spans without a narration line (and the seconds that +/// come with it) per photo. #[derive(Debug, Clone)] -pub struct PlannedSegment { - pub media: SegmentMedia, +pub struct PlannedBeat { + pub photos: Vec, pub date: Option, pub insight_title: Option, pub insight_summary: Option, } -impl PlannedSegment { +impl PlannedBeat { /// Human date for the prompt, e.g. "June 12, 2019". `None` when undated. pub fn date_label(&self) -> Option { let ts = self.date?; @@ -180,7 +183,7 @@ fn finish_job( /// Render version: bump to invalidate every cached reel after a rendering / /// scripting change that should produce a fresh result. 
-const RENDER_VERSION: u32 = 3; +const RENDER_VERSION: u32 = 4; /// Narration expressiveness — Chatterbox's `exaggeration` knob. A slight bump /// over the ~0.5 default warms up otherwise-flat narration without over-acting; @@ -306,16 +309,25 @@ pub async fn create_reel_handler( })); } - let media: Vec = planned.iter().map(|p| p.media.clone()).collect(); + // Flatten every photo across beats (in order) into the cache key — the key + // tracks exactly which photos appear and in what sequence. + let media: Vec = planned.iter().flat_map(|b| b.photos.clone()).collect(); let voice = req.voice.clone().filter(|s| !s.is_empty()); let key = cache_key(&selector, &media, voice.as_deref()); let job_id = Uuid::new_v4(); + log::info!( + "reel {job_id}: request span={:?} → {} beats, {} photos", + span, + planned.len(), + media.len() + ); // Cache hit: register an already-Done job pointing at the existing MP4 so // the client's first poll returns the video URL immediately. let mp4 = reel_mp4_path(&app_state, &key); if mp4.exists() { + log::info!("reel {job_id}: cache hit, serving existing reel"); let title = std::fs::read(reel_sidecar_path(&app_state, &key)) .ok() .and_then(|b| serde_json::from_slice::(&b).ok()) @@ -358,6 +370,7 @@ pub async fn create_reel_handler( }, ); } + log::info!("reel {job_id}: queued for generation"); let state = app_state.clone(); let insight_dao = insight_dao.clone(); @@ -441,45 +454,73 @@ async fn run_reel_job( app_state: &AppState, insight_dao: &Mutex>, job_id: Uuid, - mut planned: Vec, + mut planned: Vec, meta: ReelMeta, voice: Option, key: &str, ) -> anyhow::Result<(String, PathBuf)> { use anyhow::{Context, anyhow}; + let started = Instant::now(); + let total_photos: usize = planned.iter().map(|b| b.photos.len()).sum(); + log::info!( + "reel {job_id}: starting — span {:?}, {} beats, {} photos, voice={}", + meta.span, + planned.len(), + total_photos, + voice.as_deref().unwrap_or("default") + ); + let client = app_state .llamacpp .as_ref() 
.ok_or_else(|| anyhow!("TTS/LLM backend not configured"))? .clone(); - // 1. Enrich with cached insights, then script (one LLM call). + // 1. Enrich each beat with its lead photo's cached insight, then script + // (one LLM call → one narration line per beat). set_stage(job_id, "scripting"); + log::info!("reel {job_id}: scripting narration via LLM…"); let span_context = opentelemetry::Context::new(); selector::enrich(insight_dao, &span_context, &mut planned); let script = script::generate_script(&client, &meta, &planned).await?; + log::info!( + "reel {job_id}: scripted \"{}\" ({} lines)", + script.title, + script.lines.len() + ); - // 2. Narrate each line to speech and 3. render each photo segment. A - // segment whose audio or render fails is skipped (logged) rather than - // sinking the whole reel — handles an odd HEIC/corrupt file gracefully. + // 2. Narrate each beat's line and 3. render the beat (its photos shown in + // sequence under that one narration). A beat whose audio or render fails + // is skipped (logged) rather than sinking the whole reel — handles an + // odd HEIC/corrupt file gracefully. 
set_stage(job_id, "narrating"); let work = tempfile::tempdir().context("creating reel work dir")?; let nvenc = render::is_nvenc_available().await; + log::info!( + "reel {job_id}: narrating + rendering {} beats (encoder: {})", + planned.len(), + if nvenc { "nvenc" } else { "cpu" } + ); let opts = render::SegmentOpts { nvenc, ..Default::default() }; - let mut segment_files: Vec = Vec::new(); - for (i, (seg, line)) in planned.iter().zip(script.lines.iter()).enumerate() { - let image_path = match resolve_image_path(app_state, &seg.media) { - Some(p) => p, - None => { - log::warn!("reel {job_id}: skipping segment {i}, image path unresolved"); - continue; - } - }; + let beat_total = planned.len(); + let mut beat_files: Vec = Vec::new(); + for (i, (beat, line)) in planned.iter().zip(script.lines.iter()).enumerate() { + // Resolve all of the beat's photos to absolute paths; drop any that + // don't resolve. An empty beat is skipped. + let image_paths: Vec = beat + .photos + .iter() + .filter_map(|m| resolve_image_path(app_state, m)) + .collect(); + if image_paths.is_empty() { + log::warn!("reel {job_id}: skipping beat {i}, no image paths resolved"); + continue; + } let audio_bytes = match crate::ai::tts::synthesize_serialized( &client, @@ -492,13 +533,13 @@ async fn run_reel_job( { Ok(b) => b, Err(e) => { - log::warn!("reel {job_id}: skipping segment {i}, TTS failed: {e}"); + log::warn!("reel {job_id}: skipping beat {i}, TTS failed: {e}"); continue; } }; let audio_path = work.path().join(format!("narration_{i:03}.wav")); if let Err(e) = tokio::fs::write(&audio_path, &audio_bytes).await { - log::warn!("reel {job_id}: skipping segment {i}, writing audio failed: {e}"); + log::warn!("reel {job_id}: skipping beat {i}, writing audio failed: {e}"); continue; } @@ -508,25 +549,37 @@ async fn run_reel_job( .ok() .flatten() .unwrap_or(render::MIN_SEGMENT_SECONDS); - let duration = render::segment_duration(narration_secs); set_stage(job_id, "rendering"); - let seg_out = 
work.path().join(format!("seg_{i:03}.mp4")); + log::info!( + "reel {job_id}: beat {}/{} — {} photo(s), narration {:.1}s", + i + 1, + beat_total, + image_paths.len(), + narration_secs + ); + let beat_out = work.path().join(format!("beat_{i:03}.mp4")); if let Err(e) = - render::render_segment(&image_path, &audio_path, &seg_out, duration, &opts).await + render::render_beat(&image_paths, &audio_path, &beat_out, narration_secs, &opts).await { - log::warn!("reel {job_id}: skipping segment {i}, render failed: {e}"); + log::warn!("reel {job_id}: skipping beat {i}, render failed: {e}"); continue; } - segment_files.push(seg_out.to_string_lossy().to_string()); + beat_files.push(beat_out.to_string_lossy().to_string()); } + let segment_files = beat_files; if segment_files.is_empty() { - return Err(anyhow!("no segments rendered successfully")); + return Err(anyhow!("no beats rendered successfully")); } // 4. Concat into the cache. Write to a temp name in the reels dir, then // rename atomically (same filesystem) so a reader never sees a partial. 
+ set_stage(job_id, "rendering"); + log::info!( + "reel {job_id}: joining {} rendered beats into the final reel", + segment_files.len() + ); std::fs::create_dir_all(&app_state.reels_path).context("creating reels dir")?; let final_path = reel_mp4_path(app_state, key); let tmp_path = final_path.with_extension("mp4.tmp"); @@ -541,6 +594,12 @@ async fn run_reel_job( .context("serializing reel sidecar")?; let _ = std::fs::write(reel_sidecar_path(app_state, key), sidecar); + log::info!( + "reel {job_id}: done in {:.1}s — {} beats → {}", + started.elapsed().as_secs_f64(), + segment_files.len(), + final_path.display() + ); Ok((script.title, final_path)) } @@ -622,16 +681,16 @@ mod tests { #[test] fn date_label_formats_or_none() { - let seg = PlannedSegment { - media: photo("a.jpg", 1), + let beat = PlannedBeat { + photos: vec![photo("a.jpg", 1)], date: Some(1_560_384_000), // 2019-06-13 UTC insight_title: None, insight_summary: None, }; - assert!(seg.date_label().unwrap().contains("2019")); + assert!(beat.date_label().unwrap().contains("2019")); - let undated = PlannedSegment { - media: photo("a.jpg", 1), + let undated = PlannedBeat { + photos: vec![photo("a.jpg", 1)], date: None, insight_title: None, insight_summary: None, diff --git a/src/reels/render.rs b/src/reels/render.rs index 3cca6ac..a36f6f1 100644 --- a/src/reels/render.rs +++ b/src/reels/render.rs @@ -22,25 +22,31 @@ pub use crate::video::ffmpeg::is_nvenc_available; /// Reel canvas. Portrait, because reels are watched on a phone held upright — /// a landscape canvas letterboxes to a thin ~25%-height band there. Each photo /// is fitted sharp and centered over a blurred, zoomed copy of itself (see -/// [`segment_filtergraph`]) so the frame is always filled regardless of the +/// [`photo_filter_chain`]) so the frame is always filled regardless of the /// photo's orientation, without cropping the subject. 
pub const REEL_WIDTH: u32 = 1080; pub const REEL_HEIGHT: u32 = 1920; pub const REEL_FPS: u32 = 30; -/// A still's screen time is its narration length plus a short breath, with a -/// floor so a terse line still lingers. No ceiling: the segment always covers -/// the full narration so speech is never truncated — the scripter is asked to -/// keep lines short instead. +/// A beat's screen time is its narration length plus a short breath, with a +/// floor so a terse line still lingers. No ceiling: the beat always covers the +/// full narration so speech is never truncated — the scripter is asked to keep +/// lines short instead. pub const MIN_SEGMENT_SECONDS: f64 = 2.5; const NARRATION_TAIL_SECONDS: f64 = 0.6; -/// Quick fade in/out baked into each segment so concatenated photos dip -/// smoothly instead of hard-cutting. The fade-out lands inside the narration's -/// silent tail, so speech is never clipped. -const FADE_SECONDS: f64 = 0.35; +/// Fade durations baked into each photo. A held (single-photo) beat gets a +/// gentle dip; burst photos get a snappier fade so the montage feels quick. +const SINGLE_FADE_SECONDS: f64 = 0.35; +const BURST_FADE_SECONDS: f64 = 0.15; -/// Screen time for a photo segment given its narration audio length. +/// Floor on how long each burst photo stays up, so a short line over many photos +/// doesn't flash them subliminally. If the narration is too short to give every +/// photo this much, the beat is stretched to fit. +const MIN_BURST_PHOTO_SECONDS: f64 = 0.6; + +/// Base screen time for a beat given its narration length: narration + breath, +/// floored. Used as the lower bound on a beat's total duration. pub fn segment_duration(narration_secs: f64) -> f64 { let d = narration_secs + NARRATION_TAIL_SECONDS; if d.is_finite() && d > MIN_SEGMENT_SECONDS { @@ -50,6 +56,29 @@ pub fn segment_duration(narration_secs: f64) -> f64 { } } +/// Split a beat into per-photo durations. 
The beat lasts at least its narration +/// (so speech isn't cut) and at least `n × MIN_BURST_PHOTO_SECONDS` (so a fast +/// burst stays legible); the photos share that total evenly. Returns +/// `(total_seconds, per_photo_seconds)`. +pub fn beat_durations(narration_secs: f64, n_photos: usize) -> (f64, Vec) { + let n = n_photos.max(1); + let base = segment_duration(narration_secs); + let min_total = n as f64 * MIN_BURST_PHOTO_SECONDS; + let total = if base > min_total { base } else { min_total }; + let each = total / n as f64; + (total, vec![each; n]) +} + +/// Fade length to use for a beat of `n_photos` (gentle when held, snappy in a +/// burst). +fn fade_for(n_photos: usize) -> f64 { + if n_photos > 1 { + BURST_FADE_SECONDS + } else { + SINGLE_FADE_SECONDS + } +} + /// Options controlling per-segment rendering. #[derive(Debug, Clone, Copy)] pub struct SegmentOpts { @@ -70,38 +99,49 @@ impl Default for SegmentOpts { } } -/// Full `filter_complex` for one photo segment, producing labelled `[v]` (video) -/// and `[a]` (audio) outputs. Input 0 is the looped still, input 1 the -/// narration. +/// Filter chain for one photo (input `idx`) producing the labelled output +/// `[v{idx}]`. Splits the still into a background and foreground: the background +/// is scaled to *cover* the canvas and heavily blurred; the foreground is +/// scaled to *fit* and overlaid centered. This fills the portrait frame for any +/// photo orientation — no black bars, no cropping of the subject — then a fade +/// in/out softens the cut. Intermediate labels are suffixed with `idx` so +/// several chains coexist in one `filter_complex`. /// -/// Video: split the still into a background and foreground. The background is -/// scaled to *cover* the canvas and heavily blurred; the foreground is scaled to -/// *fit* inside it and overlaid centered. 
This fills the portrait frame for any -/// photo orientation — no black bars, no cropping of the subject — then a quick -/// fade in/out softens the cut to the next segment. -/// -/// Audio: pad the narration with trailing silence so a short line doesn't end -/// the segment early; `-t` bounds it to the segment duration. -pub fn segment_filtergraph(opts: &SegmentOpts, duration: f64) -> String { +/// `fps` is normalized BEFORE the fades so the brightness ramp is computed on a +/// true {fps}-frame timeline; otherwise the fade is sampled at the looped +/// still's coarse cadence and duplicated up, which reads as a steppy dip. +fn photo_filter_chain(idx: usize, opts: &SegmentOpts, duration: f64, fade: f64) -> String { let (w, h, fps) = (opts.width, opts.height, opts.fps); - // Fade-out begins one fade-length before the end; clamp so a floor-length - // segment still gets a valid (non-negative) start time. - let fade_out_start = (duration - FADE_SECONDS).max(0.0); - // `fps` is normalized BEFORE the fades so the brightness ramp is computed - // on a true {fps}-frame timeline. If fps came after, the fade would be - // sampled at the looped still's coarse input cadence and then duplicated up - // to {fps}, which reads as a steppy / low-frame-rate dip. 
+ let fade_out_start = (duration - fade).max(0.0); format!( - "[0:v]split=2[bg][fg];\ - [bg]scale={w}:{h}:force_original_aspect_ratio=increase,\ - crop={w}:{h},boxblur=20:2[bgb];\ - [fg]scale={w}:{h}:force_original_aspect_ratio=decrease[fgs];\ - [bgb][fgs]overlay=(W-w)/2:(H-h)/2,\ + "[{idx}:v]split=2[bg{idx}][fg{idx}];\ + [bg{idx}]scale={w}:{h}:force_original_aspect_ratio=increase,\ + crop={w}:{h},boxblur=20:2[bgb{idx}];\ + [fg{idx}]scale={w}:{h}:force_original_aspect_ratio=decrease[fgs{idx}];\ + [bgb{idx}][fgs{idx}]overlay=(W-w)/2:(H-h)/2,\ fps={fps},\ - fade=t=in:st=0:d={FADE_SECONDS},\ - fade=t=out:st={fade_out_start:.3}:d={FADE_SECONDS},\ - setsar=1,format=yuv420p[v];\ - [1:a]apad[a]" + fade=t=in:st=0:d={fade},\ + fade=t=out:st={fade_out_start:.3}:d={fade},\ + setsar=1,format=yuv420p[v{idx}]" + ) +} + +/// Full `filter_complex` for a beat of `per_photo` durations: one chain per +/// photo, concatenated into `[v]`, with the narration (the last input, index +/// `per_photo.len()`) padded with trailing silence into `[a]`. A single-photo +/// beat degenerates to one chain + `concat=n=1` (a passthrough). +pub fn beat_filtergraph(opts: &SegmentOpts, per_photo: &[f64]) -> String { + let n = per_photo.len().max(1); + let fade = fade_for(n); + let chains: Vec = per_photo + .iter() + .enumerate() + .map(|(i, &d)| photo_filter_chain(i, opts, d, fade)) + .collect(); + let concat_inputs: String = (0..n).map(|i| format!("[v{i}]")).collect(); + format!( + "{chains};{concat_inputs}concat=n={n}:v=1:a=0[v];[{n}:a]apad[a]", + chains = chains.join(";") ) } @@ -128,15 +168,16 @@ fn video_encoder_args(nvenc: bool) -> Vec { .collect() } -/// Build the ffmpeg args that render one photo segment: a still looped for -/// `duration` seconds, filled to the portrait canvas with a blurred backdrop -/// (see [`segment_filtergraph`]) and the narration muxed in. `-t` bounds both -/// streams to the segment length. 
-pub fn build_segment_args( - image_path: &str, +/// Build the ffmpeg args that render one beat: each photo looped for its slice +/// of the beat (filled to the portrait canvas with a blurred backdrop), the +/// slices concatenated, and the single narration muxed over the whole thing. +/// `total` bounds the output (and the apad'd audio) to the beat length. +pub fn build_beat_args( + image_paths: &[String], audio_path: &str, out_path: &str, - duration: f64, + per_photo: &[f64], + total: f64, opts: &SegmentOpts, ) -> Vec { let fps = opts.fps.to_string(); @@ -144,26 +185,33 @@ pub fn build_segment_args( if opts.nvenc { args.extend(["-hwaccel".into(), "cuda".into()]); } + // One looped-still input per photo, each bounded to its slice by an input + // `-t`; reading at the target `-framerate` gives the fades real frames to + // ramp across. + for (path, &dur) in image_paths.iter().zip(per_photo.iter()) { + args.extend([ + "-framerate".into(), + fps.clone(), + "-loop".into(), + "1".into(), + "-t".into(), + format!("{dur:.3}"), + "-i".into(), + path.clone(), + ]); + } args.extend([ - // Read the looped still at the target rate so frames exist for the - // fade to ramp across (paired with the in-graph `fps` and CFR output). - "-framerate".into(), - fps.clone(), - "-loop".into(), - "1".into(), - "-i".into(), - image_path.into(), "-i".into(), audio_path.into(), "-filter_complex".into(), - segment_filtergraph(opts, duration), + beat_filtergraph(opts, per_photo), "-map".into(), "[v]".into(), "-map".into(), "[a]".into(), "-t".into(), - format!("{duration:.3}"), - // Force constant frame rate so the segment (and the concatenated reel) + format!("{total:.3}"), + // Force constant frame rate so the beat (and the concatenated reel) // plays at a steady {fps} rather than a variable cadence. "-r".into(), fps, @@ -231,22 +279,33 @@ async fn run_ffmpeg(args: &[String], what: &str) -> Result<()> { Ok(()) } -/// Render one photo segment to `out_path`. 
-pub async fn render_segment( - image_path: &Path, +/// Render one beat to `out_path`: its photos shown in sequence (a held shot for +/// one photo, a quick burst for several) under the single narration in +/// `audio_path`, whose measured length sets the beat's pacing. +pub async fn render_beat( + image_paths: &[std::path::PathBuf], audio_path: &Path, out_path: &Path, - duration: f64, + narration_secs: f64, opts: &SegmentOpts, ) -> Result<()> { - let args = build_segment_args( - &image_path.to_string_lossy(), + if image_paths.is_empty() { + bail!("render_beat called with no images"); + } + let (total, per_photo) = beat_durations(narration_secs, image_paths.len()); + let paths: Vec = image_paths + .iter() + .map(|p| p.to_string_lossy().to_string()) + .collect(); + let args = build_beat_args( + &paths, &audio_path.to_string_lossy(), &out_path.to_string_lossy(), - duration, + &per_photo, + total, opts, ); - run_ffmpeg(&args, "segment render").await + run_ffmpeg(&args, "beat render").await } /// Join rendered segments into the final reel. Writes the concat list into the @@ -288,73 +347,108 @@ mod tests { } #[test] - fn filtergraph_fills_portrait_with_blurred_bg_and_fitted_fg() { - let g = segment_filtergraph(&SegmentOpts::default(), 4.0); - // Background covers + blurs; foreground fits and is centered over it. - assert!(g.contains("split=2[bg][fg]")); + fn beat_durations_single_photo_matches_base() { + let (total, per) = beat_durations(4.0, 1); + assert!((total - 4.6).abs() < 1e-9); // narration + tail + assert_eq!(per.len(), 1); + assert!((per[0] - 4.6).abs() < 1e-9); + } + + #[test] + fn beat_durations_burst_splits_evenly() { + // 5 photos, narration 4.6s base → ~0.92s each (above the 0.6 floor). 
+ let (total, per) = beat_durations(4.0, 5); + assert!((total - 4.6).abs() < 1e-9); + assert_eq!(per.len(), 5); + assert!((per.iter().sum::() - total).abs() < 1e-9); + assert!(per.iter().all(|&d| d >= MIN_BURST_PHOTO_SECONDS)); + } + + #[test] + fn beat_durations_stretches_when_narration_too_short_for_burst() { + // Floor narration (2.5s) over 10 photos would be 0.25s each — below the + // legibility floor, so the beat stretches to 10 × 0.6 = 6s. + let (total, per) = beat_durations(0.0, 10); + assert!((total - 6.0).abs() < 1e-9); + assert!(per.iter().all(|&d| (d - 0.6).abs() < 1e-9)); + } + + #[test] + fn beat_filtergraph_single_photo_fills_portrait_and_holds() { + let (_t, per) = beat_durations(4.0, 1); + let g = beat_filtergraph(&SegmentOpts::default(), &per); + assert!(g.contains("[0:v]split=2[bg0][fg0]")); assert!(g.contains("scale=1080:1920:force_original_aspect_ratio=increase")); assert!(g.contains("crop=1080:1920")); - assert!(g.contains("boxblur")); assert!(g.contains("scale=1080:1920:force_original_aspect_ratio=decrease")); assert!(g.contains("overlay=(W-w)/2:(H-h)/2")); - // Produces the labelled outputs build_segment_args maps. - assert!(g.contains("[v]")); + // Single photo → concat of one, gentle fade, audio is input 1. + assert!(g.contains("concat=n=1:v=1:a=0[v]")); + assert!(g.contains("d=0.35")); // SINGLE_FADE assert!(g.contains("[1:a]apad[a]")); - assert!(g.contains("format=yuv420p")); } #[test] - fn filtergraph_fades_in_and_out_within_duration() { - // 4s segment, 0.35s fade → fade-out starts at 3.65s. - let g = segment_filtergraph(&SegmentOpts::default(), 4.0); - assert!(g.contains("fade=t=in:st=0:d=0.35")); - assert!(g.contains("fade=t=out:st=3.650:d=0.35")); + fn beat_filtergraph_burst_chains_concats_and_snappy_fade() { + let (_t, per) = beat_durations(4.0, 3); + let g = beat_filtergraph(&SegmentOpts::default(), &per); + // One chain per photo with index-suffixed labels. 
+ assert!(g.contains("[0:v]split") && g.contains("[1:v]split") && g.contains("[2:v]split")); + // Concatenated in order, audio is the 4th input (index 3). + assert!(g.contains("[v0][v1][v2]concat=n=3:v=1:a=0[v]")); + assert!(g.contains("[3:a]apad[a]")); + // Burst uses the snappier fade. + assert!(g.contains("d=0.15")); + assert!(!g.contains("d=0.35")); } #[test] - fn filtergraph_normalizes_fps_before_fading() { - // The fps filter must precede the fades, else the brightness ramp is - // sampled at the still's coarse cadence and looks steppy. - let g = segment_filtergraph(&SegmentOpts::default(), 4.0); + fn beat_filtergraph_normalizes_fps_before_fading() { + // fps must precede the fades on every chain (else the dip looks steppy). + let (_t, per) = beat_durations(4.0, 1); + let g = beat_filtergraph(&SegmentOpts::default(), &per); let fps_at = g.find("fps=30").expect("fps in graph"); let fade_at = g.find("fade=t=in").expect("fade in graph"); assert!(fps_at < fade_at); } #[test] - fn filtergraph_fade_out_start_never_negative_at_floor() { - // A floor-length segment shorter than a fade still yields st >= 0. - let g = segment_filtergraph(&SegmentOpts::default(), 0.2); - assert!(g.contains("fade=t=out:st=0.000:d=0.35")); - } - - #[test] - fn segment_args_loop_still_and_bound_with_t() { - let args = build_segment_args( - "/img.jpg", - "/a.wav", + fn beat_args_one_input_per_photo_plus_audio_bound_by_total() { + let (total, per) = beat_durations(4.0, 2); + let args = build_beat_args( + &["/a.jpg".into(), "/b.jpg".into()], + "/n.wav", "/out.mp4", - 4.0, + &per, + total, &SegmentOpts::default(), ); let joined = args.join(" "); - assert!(joined.contains("-framerate 30 -loop 1 -i /img.jpg")); - assert!(joined.contains("-i /a.wav")); - assert!(joined.contains("apad")); - assert!(joined.contains("-t 4.000")); - // Constant frame rate forced on the output. + // A looped-still input per photo, each with its slice -t, then the audio. 
+ assert!(joined.contains("-framerate 30 -loop 1 -t 2.300 -i /a.jpg")); + assert!(joined.contains("-framerate 30 -loop 1 -t 2.300 -i /b.jpg")); + assert!(joined.contains("-i /n.wav")); + // Output bounded to the beat total and forced CFR. + assert!(joined.contains("-t 4.600")); assert!(joined.contains("-r 30")); - assert!(joined.contains("libx264")); assert!(joined.ends_with("/out.mp4")); } #[test] - fn segment_args_use_nvenc_and_cuda_when_enabled() { + fn beat_args_use_nvenc_and_cuda_when_enabled() { let opts = SegmentOpts { nvenc: true, ..SegmentOpts::default() }; - let args = build_segment_args("/img.jpg", "/a.wav", "/out.mp4", 3.0, &opts); + let (total, per) = beat_durations(3.0, 1); + let args = build_beat_args( + &["/img.jpg".into()], + "/a.wav", + "/out.mp4", + &per, + total, + &opts, + ); let joined = args.join(" "); assert!(joined.contains("-hwaccel cuda")); assert!(joined.contains("h264_nvenc")); diff --git a/src/reels/script.rs b/src/reels/script.rs index 1cf3189..85fff7c 100644 --- a/src/reels/script.rs +++ b/src/reels/script.rs @@ -1,10 +1,11 @@ //! Narration scripting for memory reels. //! -//! One LLM call turns the planned segments (each carrying its date and, where +//! One LLM call turns the planned beats (each carrying its date and, where //! available, its cached insight) into a short first-person narration line per -//! photo plus a title for the reel. We reuse the cached insight summary as the -//! richest per-photo signal rather than re-running vision at reel time — that -//! keeps reel generation off the GPU's vision slot entirely. +//! beat plus a title for the reel. A beat may show several photos in a quick +//! burst, so a line narrates the *moment*, not a single frame. We reuse the +//! cached insight summary as the richest signal rather than re-running vision +//! at reel time — that keeps reel generation off the GPU's vision slot. //! //! The prompt builder and response parser are pure so the contract is //! 
unit-testable; `generate_script` wires them to the LLM client. @@ -12,11 +13,11 @@ use anyhow::{Context, Result}; use std::sync::Arc; -use super::{PlannedSegment, ReelMeta}; +use super::{PlannedBeat, ReelMeta}; use crate::ai::llamacpp::LlamaCppClient; use crate::ai::llm_client::LlmClient; -/// The narration for a whole reel: a title and one line per segment, in order. +/// The narration for a whole reel: a title and one line per beat, in order. #[derive(Debug, Clone, PartialEq)] pub struct ReelScript { pub title: String, @@ -26,33 +27,38 @@ pub struct ReelScript { const SYSTEM_PROMPT: &str = "You are narrating a personal memory reel — a short \ slideshow of someone's own photos set to a spoken voiceover. Write warm, \ specific, first-person narration as if the person is gently looking back on \ -their own memories. Be concrete and grounded in the details given; never \ -invent names, places, or events that aren't supported. Keep each line to one \ -or two short sentences that can be read aloud in a few seconds. Avoid generic \ -filler like \"what a wonderful day\" — if you have little to go on, simply \ -describe the moment plainly."; +their own memories. Each line plays over one moment, which may be a quick burst \ +of several photos, so narrate the moment as a whole rather than a single frame. \ +Be concrete and grounded in the details given; never invent names, places, or \ +events that aren't supported. Keep each line to one or two short sentences that \ +can be read aloud in a few seconds. Avoid generic filler like \"what a \ +wonderful day\" — if you have little to go on, simply describe the moment \ +plainly."; /// Build the (system, user) prompt pair for the scripter. The user message -/// describes each segment in order and asks for strict JSON back. -pub fn build_script_messages(meta: &ReelMeta, planned: &[PlannedSegment]) -> (String, String) { +/// describes each beat in order and asks for strict JSON back. 
+pub fn build_script_messages(meta: &ReelMeta, beats: &[PlannedBeat]) -> (String, String) { let mut user = String::new(); user.push_str(&format!( - "These are {} photos surfaced as memories {}.\n\n", - planned.len(), + "This reel has {} moments surfaced as memories {}.\n\n", + beats.len(), meta.span_phrase() )); if !meta.years.is_empty() { let years: Vec = meta.years.iter().map(|y| y.to_string()).collect(); user.push_str(&format!("They span the years: {}.\n\n", years.join(", "))); } - user.push_str("Photos, in the order they will appear:\n"); - for (i, seg) in planned.iter().enumerate() { + user.push_str("Moments, in the order they will appear:\n"); + for (i, beat) in beats.iter().enumerate() { user.push_str(&format!("\n[{}]", i + 1)); - if let Some(date) = seg.date_label() { + if let Some(date) = beat.date_label() { user.push_str(&format!(" {date}")); } + if beat.photos.len() > 1 { + user.push_str(&format!(" (a burst of {} photos)", beat.photos.len())); + } user.push('\n'); - match (&seg.insight_title, &seg.insight_summary) { + match (&beat.insight_title, &beat.insight_summary) { (Some(t), Some(s)) if !s.trim().is_empty() => { user.push_str(&format!(" Known context: {t} — {s}\n")); } @@ -65,10 +71,10 @@ pub fn build_script_messages(meta: &ReelMeta, planned: &[PlannedSegment]) -> (St } user.push_str(&format!( "\nReturn ONLY a JSON object, no prose or code fences, shaped exactly:\n\ - {{\"title\": \"\", \"segments\": [\"\", \ - \"\", ... ]}}\n\ - The \"segments\" array MUST have exactly {} items, one per photo in order.", - planned.len() + {{\"title\": \"\", \"segments\": [\"\", \ + \"\", ... ]}}\n\ + The \"segments\" array MUST have exactly {} items, one per moment in order.", + beats.len() )); (SYSTEM_PROMPT.to_string(), user) } @@ -174,20 +180,20 @@ fn clean_text(s: &str) -> String { trimmed.split_whitespace().collect::>().join(" ") } -/// Generate the reel script via the LLM. Text-only (no images) — the per-photo +/// Generate the reel script via the LLM. 
Text-only (no images) — the per-beat /// context comes from cached insights. The call takes the GPU read lease /// internally (see `LlamaCppClient::generate`). pub async fn generate_script( client: &Arc, meta: &ReelMeta, - planned: &[PlannedSegment], + beats: &[PlannedBeat], ) -> Result { - let (system, user) = build_script_messages(meta, planned); + let (system, user) = build_script_messages(meta, beats); let raw = client .generate(&user, Some(&system), None) .await .context("LLM script generation failed")?; - Ok(parse_script_response(&raw, planned.len())) + Ok(parse_script_response(&raw, beats.len())) } #[cfg(test)] @@ -202,13 +208,13 @@ mod tests { } } - fn planned(n: usize) -> Vec { + fn planned(n: usize) -> Vec { (0..n) - .map(|i| PlannedSegment { - media: super::super::SegmentMedia::Photo { + .map(|i| PlannedBeat { + photos: vec![super::super::SegmentMedia::Photo { rel_path: format!("p{i}.jpg"), library_id: 1, - }, + }], date: Some(1_560_000_000 + i as i64 * 86_400), insight_title: None, insight_summary: None, @@ -217,16 +223,37 @@ mod tests { } #[test] - fn prompt_states_exact_segment_count_and_span() { + fn prompt_states_exact_moment_count_and_span() { let (sys, user) = build_script_messages(&meta(), &planned(3)); assert!(sys.contains("memory reel")); - assert!(user.contains("3 photos")); + assert!(user.contains("3 moments")); assert!(user.contains("on this day")); assert!(user.contains("exactly 3 items")); - // Each photo gets an indexed entry. + // Each moment gets an indexed entry. 
assert!(user.contains("[1]") && user.contains("[2]") && user.contains("[3]")); } + #[test] + fn prompt_notes_burst_photo_count() { + let mut p = planned(1); + p[0].photos = vec![ + super::super::SegmentMedia::Photo { + rel_path: "a.jpg".into(), + library_id: 1, + }, + super::super::SegmentMedia::Photo { + rel_path: "b.jpg".into(), + library_id: 1, + }, + super::super::SegmentMedia::Photo { + rel_path: "c.jpg".into(), + library_id: 1, + }, + ]; + let (_sys, user) = build_script_messages(&meta(), &p); + assert!(user.contains("a burst of 3 photos")); + } + #[test] fn prompt_includes_insight_context_when_present() { let mut p = planned(1); diff --git a/src/reels/selector.rs b/src/reels/selector.rs index 0a53ee5..fb83e38 100644 --- a/src/reels/selector.rs +++ b/src/reels/selector.rs @@ -13,18 +13,51 @@ use std::sync::Mutex; use chrono::{DateTime, Datelike, FixedOffset}; -use super::{PlannedSegment, ReelMeta, SegmentMedia}; +use super::{PlannedBeat, ReelMeta, SegmentMedia}; use crate::database::{ExifDao, InsightDao}; use crate::file_types::is_image_file; use crate::memories::{self, MemoriesSpan}; use crate::state::AppState; -/// Default and hard caps on how many photos a reel covers. The cap bounds the -/// LLM/TTS/ffmpeg work per reel; when a span has more, [`sample_evenly`] keeps -/// a representative spread across the years rather than just the oldest. -pub const DEFAULT_MAX_SEGMENTS: usize = 24; +/// Default and hard caps on how many photos a reel covers. The default is an +/// upper bound on the request; the effective count is usually smaller, set by +/// the duration budget (see [`budget_segments`]). The hard cap bounds work per +/// reel regardless. +pub const DEFAULT_MAX_SEGMENTS: usize = 40; pub const HARD_MAX_SEGMENTS: usize = 40; +/// Target reel length. Week and especially month spans can surface hundreds of +/// photos; at a few seconds of narration each, a naive reel runs minutes. We +/// cap the segment count to keep the reel near this length. 
Tunable via +/// `REEL_TARGET_SECONDS`. +const DEFAULT_TARGET_REEL_SECONDS: f64 = 90.0; + +/// Rough average wall-time per photo segment (a short narration line + the +/// silent tail). Only used to turn the duration target into a segment count; +/// the real per-segment time is the measured narration length. +const EST_SECONDS_PER_SEGMENT: f64 = 5.0; + +/// Time gap that separates one "event/moment" from the next when clustering a +/// span's photos. Photos within a few hours are treated as the same occasion +/// (and across years/days the gaps are far larger, so each instance clusters +/// on its own). 4 hours splits e.g. a morning hike from an evening dinner. +const EVENT_GAP_SECONDS: i64 = 4 * 3600; + +fn target_reel_seconds() -> f64 { + std::env::var("REEL_TARGET_SECONDS") + .ok() + .and_then(|s| s.trim().parse::().ok()) + .filter(|x| x.is_finite() && *x > 0.0) + .unwrap_or(DEFAULT_TARGET_REEL_SECONDS) +} + +/// How many photo segments fit the duration budget, bounded by the request's +/// max and the hard cap. This is what keeps week/month reels from running long. +pub fn budget_segments(requested_max: usize) -> usize { + let by_budget = (target_reel_seconds() / EST_SECONDS_PER_SEGMENT).floor() as usize; + by_budget.min(requested_max).clamp(1, HARD_MAX_SEGMENTS) +} + /// What a reel is built from. v1 ships the memories (on this day/week/month) /// selector; tag and date-range variants slot in here later. #[derive(Debug, Clone)] @@ -81,6 +114,104 @@ pub fn sample_evenly(items: &[T], max: usize) -> Vec { .collect() } +/// Group time-sorted items into events by gap: a new event starts whenever the +/// jump from the previous photo exceeds `gap_seconds`. Preserves order; items +/// without a timestamp extend the current event. 
+fn cluster_by_gap( + items: &[memories::MemoryItem], + gap_seconds: i64, +) -> Vec> { + let mut clusters: Vec> = Vec::new(); + let mut prev_ts: Option = None; + for it in items { + let starts_new = match (prev_ts, it.created) { + (Some(p), Some(c)) => c - p > gap_seconds, + _ => false, + }; + if starts_new || clusters.is_empty() { + clusters.push(Vec::new()); + } + clusters.last_mut().unwrap().push(it.clone()); + if let Some(c) = it.created { + prev_ts = Some(c); + } + } + clusters +} + +/// Most photos a single beat will flash through. Bounds the burst so one huge +/// event doesn't dominate, and keeps each photo on screen long enough to +/// register at the per-beat narration length (see render's beat timing). +pub const MAX_BURST_PHOTOS: usize = 10; + +/// Merge a list of (time-ordered) event clusters into exactly `n` contiguous +/// groups, so a span with more events than the beat budget still covers the +/// whole timeline — adjacent events fold together into one beat rather than +/// getting dropped. `n` must be ≥ 1 and ≤ clusters.len(). +fn partition_into_groups( + clusters: Vec>, + n: usize, +) -> Vec> { + let c = clusters.len(); + let mut clusters = clusters.into_iter(); + (0..n) + .map(|j| { + // Even contiguous split of c clusters into n groups. + let start = j * c / n; + let end = (j + 1) * c / n; + let take = end.saturating_sub(start).max(1); + (0..take) + .flat_map(|_| clusters.next().into_iter().flatten()) + .collect() + }) + .collect() +} + +/// Turn a span's photos into `n_beats` beats. Clusters photos into events by +/// time gap; if there are more events than beats, adjacent events are merged so +/// the whole span is still covered. Each beat then flashes up to +/// `max_burst` photos (an even spread of its group) under one narration line — +/// so a week/month reel *shows* all its moments without a narrated (and timed) +/// segment per photo. 
+pub fn form_beats( + items: &[memories::MemoryItem], + n_beats: usize, + max_burst: usize, +) -> Vec { + if n_beats == 0 || items.is_empty() { + return Vec::new(); + } + let clusters = cluster_by_gap(items, EVENT_GAP_SECONDS); + // One beat per event when they fit; otherwise fold adjacent events together + // into exactly n_beats groups. + let groups = if clusters.len() <= n_beats { + clusters + } else { + partition_into_groups(clusters, n_beats) + }; + + groups + .into_iter() + .filter(|g| !g.is_empty()) + .map(|group| { + let shown = sample_evenly(&group, max_burst); + let date = shown.first().and_then(|it| it.created); + PlannedBeat { + photos: shown + .into_iter() + .map(|it| SegmentMedia::Photo { + rel_path: it.path, + library_id: it.library_id, + }) + .collect(), + date, + insight_title: None, + insight_summary: None, + } + }) + .collect() +} + /// Cheap pass: resolve the selector into an ordered list of media (no insight /// lookups yet) plus reel metadata. `Err` only on an invalid library param. pub fn resolve( @@ -88,7 +219,7 @@ pub fn resolve( exif_dao: &Mutex>, span_context: &opentelemetry::Context, selector: &ReelSelector, -) -> Result<(Vec, ReelMeta), String> { +) -> Result<(Vec, ReelMeta), String> { match selector { ReelSelector::Memories { span, @@ -108,32 +239,23 @@ pub fn resolve( )?; // Phase 1 is photos-only: drop videos (a clip segment type lands - // in phase 2). Filter before sampling so the spread is over the - // photos that will actually appear. + // in phase 2). let items: Vec = items .into_iter() .filter(|it| is_image_file(Path::new(&it.path))) .collect(); - let cap = (*max_segments).clamp(1, HARD_MAX_SEGMENTS); - let items = sample_evenly(&items, cap); - + // Years are derived from the whole span (what the reel represents), + // before the budget narrows it down to beats. 
let years = distinct_years(&items, client_tz); let meta = ReelMeta { span: *span, years }; - let planned = items - .into_iter() - .map(|it| PlannedSegment { - media: SegmentMedia::Photo { - rel_path: it.path, - library_id: it.library_id, - }, - date: it.created, - insight_title: None, - insight_summary: None, - }) - .collect(); - Ok((planned, meta)) + // The budget caps the number of narrated beats (≈ reel length); + // each beat then bursts through several photos, so the reel covers + // the span's moments without running minutes long. + let n_beats = budget_segments(*max_segments); + let beats = form_beats(&items, n_beats, MAX_BURST_PHOTOS); + Ok((beats, meta)) } } } @@ -155,24 +277,24 @@ fn distinct_years(items: &[memories::MemoryItem], tz: Option) -> Ve years } -/// Background pass: fill each segment's cached insight (title + summary) where -/// one exists. Best-effort — a missing or errored lookup leaves the fields -/// `None` and the scripter narrates from the date alone. +/// Background pass: fill each beat's cached insight (title + summary) from its +/// lead photo, where one exists. Best-effort — a missing or errored lookup +/// leaves the fields `None` and the scripter narrates from the date alone. pub fn enrich( insight_dao: &Mutex>, span_context: &opentelemetry::Context, - planned: &mut [PlannedSegment], + beats: &mut [PlannedBeat], ) { let Ok(mut dao) = insight_dao.lock() else { return; }; - for seg in planned.iter_mut() { - let rel_path = match &seg.media { - SegmentMedia::Photo { rel_path, .. } => rel_path, + for beat in beats.iter_mut() { + let Some(SegmentMedia::Photo { rel_path, .. 
}) = beat.photos.first() else { + continue; }; if let Ok(Some(insight)) = dao.get_insight(span_context, rel_path) { - seg.insight_title = Some(insight.title); - seg.insight_summary = Some(insight.summary); + beat.insight_title = Some(insight.title); + beat.insight_summary = Some(insight.summary); } } } @@ -249,4 +371,78 @@ mod tests { ]; assert_eq!(distinct_years(&items, None), vec![2019, 2021]); } + + // Build an item at a given unix timestamp (seconds). + fn item_at(ts: i64, name: &str) -> memories::MemoryItem { + memories::MemoryItem { + path: format!("{name}.jpg"), + created: Some(ts), + modified: None, + library_id: 1, + } + } + + #[test] + fn budget_segments_caps_to_duration_target() { + // 90s / 5s ≈ 18, bounded by the request max and hard cap. + assert_eq!(budget_segments(40), 18); + assert_eq!(budget_segments(5), 5); // request asked for fewer + assert_eq!(budget_segments(1000), 18); // hard cap / budget wins + } + + #[test] + fn cluster_by_gap_splits_on_large_jumps() { + // Two photos minutes apart, then one a day later → two events. + let items = vec![ + item_at(1_000_000, "a"), + item_at(1_000_300, "b"), // +5 min → same event + item_at(1_100_000, "c"), // +~27h → new event + ]; + let clusters = cluster_by_gap(&items, EVENT_GAP_SECONDS); + assert_eq!(clusters.len(), 2); + assert_eq!(clusters[0].len(), 2); + assert_eq!(clusters[1].len(), 1); + } + + #[test] + fn form_beats_one_beat_per_event_when_they_fit() { + // Three well-separated events, budget of 10 → three beats, each holding + // all of its (few) photos. 
+ let items = vec![ + item_at(0, "a"), + item_at(50, "b"), // same event as a + item_at(1_000_000, "c"), + item_at(2_000_000, "d"), + ]; + let beats = form_beats(&items, 10, MAX_BURST_PHOTOS); + assert_eq!(beats.len(), 3); + assert_eq!(beats[0].photos.len(), 2); // burst of the first event + assert_eq!(beats[1].photos.len(), 1); + assert_eq!(beats[2].photos.len(), 1); + } + + #[test] + fn form_beats_merges_events_when_over_budget() { + // Six distinct events but only two beats → adjacent events fold in, and + // every event's photos still appear (capped by the burst max). + let items: Vec = (0..6) + .map(|i| item_at(i as i64 * 1_000_000, &format!("e{i}"))) + .collect(); + let beats = form_beats(&items, 2, MAX_BURST_PHOTOS); + assert_eq!(beats.len(), 2); + let shown: usize = beats.iter().map(|b| b.photos.len()).sum(); + assert_eq!(shown, 6); // all six moments still shown across two beats + } + + #[test] + fn form_beats_caps_burst_to_max() { + // One dense event of 30 photos, generous budget → a single beat that + // bursts at most MAX_BURST_PHOTOS, not all 30. + let items: Vec = (0..30) + .map(|i| item_at(i as i64, &format!("p{i}"))) + .collect(); + let beats = form_beats(&items, 18, MAX_BURST_PHOTOS); + assert_eq!(beats.len(), 1); + assert_eq!(beats[0].photos.len(), MAX_BURST_PHOTOS); + } }