Reels: mixed-media (video clip beats) + faster burst fade

Videos in a span now appear as clip beats: the first few seconds of the video (capped at CLIP_SECONDS=5 and at the source's own length), filled to the portrait canvas like photos, with its live audio ducked under the narration (volume 0.35, then amix). If the narration outlasts the clip, the last frame is held (tpad); clips with no audio track simply play silently under the narration. Selection splits the beat budget between photo beats and clip beats — clips get up to half (≥1 when present), photos the rest — then merges both back into chronological order. SegmentMedia gains a Clip variant; beats carry `media` (photos or one clip), and the cache key tags each item P/C so the same path used as a still and as a clip yields different keys. Also drops the burst fade from 0.15s to 0.08s so a quick burst reads clearly differently from a held shot. Bumps RENDER_VERSION. The clip filtergraph (fill + duck-mix + last-frame hold) is unit-tested but, like the rest of the ffmpeg path, still needs a real render check on the GPU host.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-13 00:02:51 -04:00
parent 299e32b014
commit 65793a2dda
4 changed files with 479 additions and 91 deletions
@@ -36,21 +36,40 @@ use crate::otel::extract_context_from_request;
 use crate::state::AppState;
 use selector::ReelSelector;

-/// The media behind one shot. Photos-only for now; a `Clip` variant (a section
-/// of a source video) is the phase-2 extension point.
+/// The media behind one shot: a still photo, or a short section of a source
+/// video (played with its live audio ducked under the narration). Both carry
+/// just the library-relative path; the renderer applies fixed clip framing
+/// (start/length) from constants.
 #[derive(Debug, Clone)]
 pub enum SegmentMedia {
    Photo { rel_path: String, library_id: i32 },
+    Clip { rel_path: String, library_id: i32 },
 }

-/// A beat: one narration line over one or more photos. A single-photo beat is a
-/// held shot; a multi-photo beat is a quick burst that flashes through several
-/// moments of the same event while the line is read — so a week/month reel can
-/// *show* everything it spans without a narration line (and the seconds that
-/// come with it) per photo.
+impl SegmentMedia {
+    /// Library-relative path of the underlying file, whichever variant this is.
+    fn rel_path(&self) -> &str {
+        // Both variants carry the same fields, so a single irrefutable
+        // or-pattern covers them; a future variant forces a compile error here.
+        let (SegmentMedia::Photo { rel_path, .. } | SegmentMedia::Clip { rel_path, .. }) = self;
+        rel_path
+    }
+
+    /// Id of the library the file belongs to, whichever variant this is.
+    fn library_id(&self) -> i32 {
+        let (SegmentMedia::Photo { library_id, .. } | SegmentMedia::Clip { library_id, .. }) =
+            self;
+        *library_id
+    }
+}
+
+/// A beat: one narration line over its media. A photo beat holds one still (a
+/// held shot) or several (a quick burst that flashes through moments of an
+/// event while the line is read). A clip beat holds a single video clip. Either
+/// way one narration line covers the whole beat, so a week/month reel can
+/// *show* everything it spans without a narration line — and the seconds that
+/// come with it — per item.
 #[derive(Debug, Clone)]
 pub struct PlannedBeat {
-    pub photos: Vec<SegmentMedia>,
+    pub media: Vec<SegmentMedia>,
    pub date: Option<i64>,
    pub insight_title: Option<String>,
    pub insight_summary: Option<String>,
@@ -63,6 +82,11 @@ impl PlannedBeat {
        let dt = DateTime::from_timestamp(ts, 0)?;
        Some(dt.format("%B %-d, %Y").to_string())
    }
+
+    /// Whether the beat consists of exactly one item and that item is a video
+    /// clip. Photo beats (one or many stills), empty beats, and any beat with
+    /// more than one item all report `false`.
+    pub fn is_clip(&self) -> bool {
+        let [only] = self.media.as_slice() else {
+            return false;
+        };
+        matches!(only, SegmentMedia::Clip { .. })
+    }
 }

 /// Reel-wide metadata the scripter uses for framing.
@@ -183,7 +207,7 @@ fn finish_job(

 /// Render version: bump to invalidate every cached reel after a rendering /
 /// scripting change that should produce a fresh result.
-const RENDER_VERSION: u32 = 4;
+const RENDER_VERSION: u32 = 5;

 /// Narration expressiveness — Chatterbox's `exaggeration` knob. A slight bump
 /// over the ~0.5 default warms up otherwise-flat narration without over-acting;
@@ -207,12 +231,13 @@ fn cache_key(selector: &ReelSelector, media: &[SegmentMedia], voice: Option<&str
        voice.unwrap_or("default")
    );
    for m in media {
-        match m {
-            SegmentMedia::Photo {
-                rel_path,
-                library_id,
-            } => buf.push_str(&format!("{library_id}:{rel_path}|")),
-        }
+        // Tag photo vs clip so the same path used as a still and as a video
+        // clip produce different keys.
+        let tag = match m {
+            SegmentMedia::Photo { .. } => 'P',
+            SegmentMedia::Clip { .. } => 'C',
+        };
+        buf.push_str(&format!("{tag}{}:{}|", m.library_id(), m.rel_path()));
    }
    blake3::hash(buf.as_bytes()).to_hex().to_string()
 }
@@ -309,9 +334,9 @@ pub async fn create_reel_handler(
        }));
    }

-    // Flatten every photo across beats (in order) into the cache key — the key
-    // tracks exactly which photos appear and in what sequence.
-    let media: Vec<SegmentMedia> = planned.iter().flat_map(|b| b.photos.clone()).collect();
+    // Flatten every media item across beats (in order) into the cache key — the
+    // key tracks exactly which photos/clips appear and in what sequence.
+    let media: Vec<SegmentMedia> = planned.iter().flat_map(|b| b.media.clone()).collect();
    let voice = req.voice.clone().filter(|s| !s.is_empty());
    let key = cache_key(&selector, &media, voice.as_deref());

@@ -462,7 +487,7 @@ async fn run_reel_job(
    use anyhow::{Context, anyhow};

    let started = Instant::now();
-    let total_photos: usize = planned.iter().map(|b| b.photos.len()).sum();
+    let total_photos: usize = planned.iter().map(|b| b.media.len()).sum();
    log::info!(
        "reel {job_id}: starting — span {:?}, {} beats, {} photos, voice={}",
        meta.span,
@@ -510,15 +535,15 @@ async fn run_reel_job(
    let beat_total = planned.len();
    let mut beat_files: Vec<String> = Vec::new();
    for (i, (beat, line)) in planned.iter().zip(script.lines.iter()).enumerate() {
-        // Resolve all of the beat's photos to absolute paths; drop any that
-        // don't resolve. An empty beat is skipped.
-        let image_paths: Vec<PathBuf> = beat
-            .photos
+        // Resolve the beat's media to absolute paths; drop any that don't
+        // resolve. An empty beat is skipped.
+        let paths: Vec<PathBuf> = beat
+            .media
            .iter()
-            .filter_map(|m| resolve_image_path(app_state, m))
+            .filter_map(|m| resolve_media_path(app_state, m))
            .collect();
-        if image_paths.is_empty() {
-            log::warn!("reel {job_id}: skipping beat {i}, no image paths resolved");
+        if paths.is_empty() {
+            log::warn!("reel {job_id}: skipping beat {i}, no media paths resolved");
            continue;
        }

@@ -551,17 +576,26 @@ async fn run_reel_job(
                .unwrap_or(render::MIN_SEGMENT_SECONDS);

        set_stage(job_id, "rendering");
+        let beat_out = work.path().join(format!("beat_{i:03}.mp4"));
+        let render_result = if beat.is_clip() {
+            log::info!(
+                "reel {job_id}: beat {}/{} — video clip, narration {:.1}s",
+                i + 1,
+                beat_total,
+                narration_secs
+            );
+            render::render_clip_beat(&paths[0], &audio_path, &beat_out, narration_secs, &opts).await
+        } else {
            log::info!(
                "reel {job_id}: beat {}/{} — {} photo(s), narration {:.1}s",
                i + 1,
                beat_total,
-            image_paths.len(),
+                paths.len(),
                narration_secs
            );
-        let beat_out = work.path().join(format!("beat_{i:03}.mp4"));
-        if let Err(e) =
-            render::render_beat(&image_paths, &audio_path, &beat_out, narration_secs, &opts).await
-        {
+            render::render_beat(&paths, &audio_path, &beat_out, narration_secs, &opts).await
+        };
+        if let Err(e) = render_result {
            log::warn!("reel {job_id}: skipping beat {i}, render failed: {e}");
            continue;
        }
@@ -603,15 +637,12 @@ async fn run_reel_job(
    Ok((script.title, final_path))
 }

-/// Resolve a photo segment's library-relative path to a validated absolute
-/// path under its library root.
-fn resolve_image_path(app_state: &AppState, media: &SegmentMedia) -> Option<PathBuf> {
-    let SegmentMedia::Photo {
-        rel_path,
-        library_id,
-    } = media;
-    let lib = app_state.library_by_id(*library_id)?;
-    crate::files::is_valid_full_path(&lib.root_path, rel_path, false)
+/// Map a media item (photo or clip) to an absolute on-disk path: look up its
+/// library by id, then hand the library root and the item's library-relative
+/// path to `is_valid_full_path`. Yields `None` when the library lookup or that
+/// validation returns `None`.
+fn resolve_media_path(app_state: &AppState, media: &SegmentMedia) -> Option<PathBuf> {
+    app_state.library_by_id(media.library_id()).and_then(|lib| {
+        let rel = media.rel_path().to_string();
+        crate::files::is_valid_full_path(&lib.root_path, &rel, false)
+    })
+}

 #[cfg(test)]
@@ -625,6 +656,13 @@ mod tests {
        }
    }

+    /// Test helper: a `Clip` media item for path `p` in library `lib`.
+    fn clip(p: &str, lib: i32) -> SegmentMedia {
+        let rel_path = String::from(p);
+        SegmentMedia::Clip {
+            rel_path,
+            library_id: lib,
+        }
+    }
+
    fn day_selector() -> ReelSelector {
        ReelSelector::Memories {
            span: MemoriesSpan::Day,
@@ -668,6 +706,35 @@ mod tests {
        assert_ne!(base, cache_key(&week, &media, Some("grandma")));
    }

+    #[test]
+    fn cache_key_distinguishes_photo_from_clip() {
+        // Identical path and library, differing only in media kind: the keys
+        // must not collide.
+        let selector = day_selector();
+        let still_key = cache_key(&selector, &[photo("v.mp4", 1)], None);
+        let clip_key = cache_key(&selector, &[clip("v.mp4", 1)], None);
+        assert_ne!(still_key, clip_key);
+    }
+
+    #[test]
+    fn is_clip_only_for_single_clip_beat() {
+        // Build a beat around the given media; the other fields don't affect
+        // `is_clip`.
+        let beat = |media: Vec<SegmentMedia>| PlannedBeat {
+            media,
+            date: None,
+            insight_title: None,
+            insight_summary: None,
+        };
+
+        // The one positive case: exactly one item, and it is a clip.
+        assert!(beat(vec![clip("v.mp4", 1)]).is_clip());
+
+        // Everything else is not a clip beat: a burst, a single still, an
+        // empty beat, and more than one clip.
+        assert!(!beat(vec![photo("a.jpg", 1), photo("b.jpg", 1)]).is_clip());
+        assert!(!beat(vec![photo("a.jpg", 1)]).is_clip());
+        assert!(!beat(Vec::new()).is_clip());
+        assert!(!beat(vec![clip("v.mp4", 1), clip("w.mp4", 1)]).is_clip());
+    }
+
    #[test]
    fn span_phrase_maps_each_span() {
        let mk = |span| ReelMeta {
@@ -682,7 +749,7 @@ mod tests {
    #[test]
    fn date_label_formats_or_none() {
        let beat = PlannedBeat {
-            photos: vec![photo("a.jpg", 1)],
+            media: vec![photo("a.jpg", 1)],
            date: Some(1_560_384_000), // 2019-06-13 UTC
            insight_title: None,
            insight_summary: None,
@@ -690,7 +757,7 @@ mod tests {
        assert!(beat.date_label().unwrap().contains("2019"));

        let undated = PlannedBeat {
-            photos: vec![photo("a.jpg", 1)],
+            media: vec![photo("a.jpg", 1)],
            date: None,
            insight_title: None,
            insight_summary: None,
@@ -36,9 +36,15 @@ pub const MIN_SEGMENT_SECONDS: f64 = 2.5;
 const NARRATION_TAIL_SECONDS: f64 = 0.6;

 /// Fade durations baked into each photo. A held (single-photo) beat gets a
-/// gentle dip; burst photos get a snappier fade so the montage feels quick.
+/// gentle dip; burst photos get a much snappier fade so the difference between
+/// a held shot and a quick burst is obvious.
 const SINGLE_FADE_SECONDS: f64 = 0.35;
-const BURST_FADE_SECONDS: f64 = 0.15;
+const BURST_FADE_SECONDS: f64 = 0.08;
+
+/// Video-clip framing. A clip plays at most this long, with its live audio
+/// ducked to `CLIP_DUCK_VOLUME` under the narration.
+pub const CLIP_SECONDS: f64 = 5.0;
+const CLIP_DUCK_VOLUME: f64 = 0.35;

 /// Floor on how long each burst photo stays up, so a long line over many photos
 /// doesn't flash them subliminally. If the narration is too short to give every
@@ -308,6 +314,162 @@ pub async fn render_beat(
    run_ffmpeg(&args, "beat render").await
 }

+// --- Video-clip beats --------------------------------------------------------
+
+/// Video half of a clip beat's filtergraph, ending in the `[v]` pad. The clip
+/// is split in two: one copy is scaled up, cropped and blurred into a
+/// full-canvas backdrop, the other is scaled to fit and overlaid centred on
+/// it. The result is resampled to the target fps, optionally extended by
+/// cloning its last frame (`tpad`) when the beat runs longer than the clip,
+/// then faded in and out.
+fn clip_video_filter(opts: &SegmentOpts, clip_dur: f64, beat_total: f64) -> String {
+    let (w, h, fps) = (opts.width, opts.height, opts.fps);
+    let fade = SINGLE_FADE_SECONDS;
+    let fade_out_start = (beat_total - fade).max(0.0);
+
+    let split = "[0:v]split=2[bg][fg]".to_string();
+    let backdrop = format!(
+        "[bg]scale={w}:{h}:force_original_aspect_ratio=increase,crop={w}:{h},boxblur=20:2[bgb]"
+    );
+    let foreground = format!("[fg]scale={w}:{h}:force_original_aspect_ratio=decrease[fgs]");
+
+    let mut composite = format!("[bgb][fgs]overlay=(W-w)/2:(H-h)/2,fps={fps}");
+    // Hold the final frame only when the beat outlasts the clip by more than
+    // 0.05s; smaller gaps get no `tpad` stage at all.
+    let hold = (beat_total - clip_dur).max(0.0);
+    if hold > 0.05 {
+        composite.push_str(&format!(",tpad=stop_mode=clone:stop_duration={hold:.3}"));
+    }
+    composite.push_str(&format!(
+        ",fade=t=in:st=0:d={fade},fade=t=out:st={fade_out_start:.3}:d={fade},\
+         setsar=1,format=yuv420p[v]"
+    ));
+
+    [split, backdrop, foreground, composite].join(";")
+}
+
+/// Audio half of a clip beat's filtergraph, ending in the `[a]` pad. When the
+/// clip has its own audio, that track is attenuated to `CLIP_DUCK_VOLUME` and
+/// mixed with the padded narration; otherwise the padded narration is the
+/// whole soundtrack.
+fn clip_audio_filter(has_audio: bool) -> String {
+    if !has_audio {
+        return "[1:a]apad[a]".to_string();
+    }
+    let duck = format!("[0:a]volume={CLIP_DUCK_VOLUME}[duck]");
+    let narration = "[1:a]apad[narr]";
+    let mix = "[duck][narr]amix=inputs=2:duration=longest:normalize=0[a]";
+    format!("{duck};{narration};{mix}")
+}
+
+/// Complete `filter_complex` string for a clip beat: the video chain and the
+/// audio chain joined by `;`. Input 0 is the clip, input 1 the narration.
+pub fn clip_beat_filtergraph(
+    opts: &SegmentOpts,
+    clip_dur: f64,
+    beat_total: f64,
+    has_audio: bool,
+) -> String {
+    let video = clip_video_filter(opts, clip_dur, beat_total);
+    let audio = clip_audio_filter(has_audio);
+    [video, audio].join(";")
+}
+
+/// Assemble the ffmpeg command line for one clip beat. The clip (input 0) is
+/// read for at most `clip_dur` seconds, the narration (input 1) is read
+/// whole, both go through the clip filtergraph, and the output is cut at
+/// `beat_total` seconds, encoded with the shared video encoder settings plus
+/// AAC audio, and written to `out_path`.
+pub fn build_clip_beat_args(
+    clip_path: &str,
+    audio_path: &str,
+    out_path: &str,
+    clip_dur: f64,
+    beat_total: f64,
+    has_audio: bool,
+    opts: &SegmentOpts,
+) -> Vec<String> {
+    let mut args = vec!["-y".to_string()];
+    if opts.nvenc {
+        args.push("-hwaccel".to_string());
+        args.push("cuda".to_string());
+    }
+
+    // Input 0: the clip, bounded on the input side to its window.
+    args.extend([
+        "-t".to_string(),
+        format!("{clip_dur:.3}"),
+        "-i".to_string(),
+        clip_path.to_string(),
+    ]);
+    // Input 1: the narration, unbounded here (the graph pads it with `apad`).
+    args.extend(["-i".to_string(), audio_path.to_string()]);
+
+    args.extend([
+        "-filter_complex".to_string(),
+        clip_beat_filtergraph(opts, clip_dur, beat_total, has_audio),
+    ]);
+    for pad in ["[v]", "[a]"] {
+        args.extend(["-map".to_string(), pad.to_string()]);
+    }
+
+    // Output side: cut at the beat length and pin the frame rate.
+    args.extend([
+        "-t".to_string(),
+        format!("{beat_total:.3}"),
+        "-r".to_string(),
+        opts.fps.to_string(),
+    ]);
+    args.extend(video_encoder_args(opts.nvenc));
+    args.extend(["-c:a", "aac", "-b:a", "160k", "-ar", "48000"].map(String::from));
+    args.push(out_path.to_string());
+    args
+}
+
+/// Whether a media file has at least one audio stream (so a clip beat knows
+/// whether to mix in live audio).
+///
+/// Runs `ffprobe`, selecting only audio streams and printing one index per
+/// stream. Returns `true` only when ffprobe exits successfully *and* printed
+/// something other than whitespace; a spawn failure, a non-zero exit, or empty
+/// output all yield `false`. Erring towards `false` is the safe direction: the
+/// beat then renders with narration only instead of referencing a `[0:a]`
+/// stream that may not exist.
+pub async fn has_audio_stream(path: &str) -> bool {
+    let probe = Command::new("ffprobe")
+        .args([
+            "-v",
+            "error",
+            "-select_streams",
+            "a",
+            "-show_entries",
+            "stream=index",
+            "-of",
+            "csv=p=0",
+            path,
+        ])
+        .output()
+        .await;
+    match probe {
+        // Trust stdout only from a clean exit, and don't let a stray newline
+        // count as a stream.
+        Ok(out) => {
+            out.status.success() && out.stdout.iter().any(|byte| !byte.is_ascii_whitespace())
+        }
+        Err(_) => false,
+    }
+}
+
+/// Render a single clip beat to `out_path`: the opening section of
+/// `clip_path` (at most [`CLIP_SECONDS`], or the source's own length when that
+/// is shorter) under the narration in `audio_path`. The beat lasts for the
+/// longer of the clip window and the segment duration derived from
+/// `narration_secs`; the filtergraph freezes the clip's last frame to cover
+/// any difference.
+pub async fn render_clip_beat(
+    clip_path: &Path,
+    audio_path: &Path,
+    out_path: &Path,
+    narration_secs: f64,
+    opts: &SegmentOpts,
+) -> Result<()> {
+    let clip = clip_path.to_string_lossy().to_string();
+
+    // Use the source's own length when it is known, positive and under the
+    // cap; a failed/unknown probe or a longer video falls back to the cap.
+    let probed = crate::video::ffmpeg::get_duration_seconds(&clip)
+        .await
+        .ok()
+        .flatten();
+    let clip_dur = probed
+        .filter(|secs| *secs > 0.0 && *secs < CLIP_SECONDS)
+        .unwrap_or(CLIP_SECONDS);
+
+    let beat_total = clip_dur.max(segment_duration(narration_secs));
+    let has_audio = has_audio_stream(&clip).await;
+
+    let narration = audio_path.to_string_lossy();
+    let output = out_path.to_string_lossy();
+    let args = build_clip_beat_args(
+        &clip,
+        &narration,
+        &output,
+        clip_dur,
+        beat_total,
+        has_audio,
+        opts,
+    );
+    run_ffmpeg(&args, "clip beat render").await
+}
+
 /// Join rendered segments into the final reel. Writes the concat list into the
 /// same directory as the output so relative paths and cleanup stay local.
 pub async fn concat_segments(segment_paths: &[String], out_path: &Path) -> Result<()> {
@@ -397,8 +559,8 @@ mod tests {
        // Concatenated in order, audio is the 4th input (index 3).
        assert!(g.contains("[v0][v1][v2]concat=n=3:v=1:a=0[v]"));
        assert!(g.contains("[3:a]apad[a]"));
-        // Burst uses the snappier fade.
-        assert!(g.contains("d=0.15"));
+        // Burst uses the much snappier fade (vs 0.35 for a held shot).
+        assert!(g.contains("d=0.08"));
        assert!(!g.contains("d=0.35"));
    }

@@ -455,6 +617,54 @@ mod tests {
        assert!(!joined.contains("libx264"));
    }

+    #[test]
+    fn clip_filter_ducks_audio_and_holds_last_frame_when_narration_longer() {
+        // Beat (7s) outlasts the clip (5s): expect a 2s last-frame hold and
+        // the ducked live-audio mix, on top of the same fill used for photos.
+        let graph = clip_beat_filtergraph(&SegmentOpts::default(), 5.0, 7.0, true);
+        let expected = [
+            "tpad=stop_mode=clone:stop_duration=2.000",
+            "volume=0.35",
+            "amix=inputs=2",
+            "[1:a]apad[narr]",
+            "boxblur",
+            "overlay=(W-w)/2:(H-h)/2",
+        ];
+        for fragment in expected {
+            assert!(graph.contains(fragment), "missing `{fragment}` in {graph}");
+        }
+    }
+
+    #[test]
+    fn clip_filter_no_tpad_when_clip_covers_the_beat() {
+        // Clip and beat are the same length, so there is nothing to freeze.
+        let opts = SegmentOpts::default();
+        let graph = clip_beat_filtergraph(&opts, 5.0, 5.0, true);
+        assert!(!graph.contains("tpad"));
+    }
+
+    #[test]
+    fn clip_filter_narration_only_without_clip_audio() {
+        // No clip audio track: no duck/mix stages, just the padded narration.
+        let graph = clip_beat_filtergraph(&SegmentOpts::default(), 5.0, 5.0, false);
+        for absent in ["amix", "volume="] {
+            assert!(!graph.contains(absent));
+        }
+        assert!(graph.contains("[1:a]apad[a]"));
+    }
+
+    #[test]
+    fn clip_beat_args_bound_clip_and_output() {
+        let opts = SegmentOpts::default();
+        let cmdline =
+            build_clip_beat_args("/v.mp4", "/n.wav", "/out.mp4", 5.0, 6.6, true, &opts).join(" ");
+        // The input-side -t caps the clip read, the output-side -t caps the
+        // beat, and the narration is a second input.
+        let expected = ["-t 5.000 -i /v.mp4", "-i /n.wav", "-t 6.600", "-r 30"];
+        for fragment in expected {
+            assert!(cmdline.contains(fragment));
+        }
+        assert!(cmdline.ends_with("/out.mp4"));
+    }
+
    #[test]
    fn concat_args_stream_copy_with_faststart_and_forced_muxer() {
        // Output goes to a .tmp path, so the muxer must be forced — ffmpeg
@@ -54,8 +54,10 @@ pub fn build_script_messages(meta: &ReelMeta, beats: &[PlannedBeat]) -> (String,
        if let Some(date) = beat.date_label() {
            user.push_str(&format!(" {date}"));
        }
-        if beat.photos.len() > 1 {
-            user.push_str(&format!(" (a burst of {} photos)", beat.photos.len()));
+        if beat.is_clip() {
+            user.push_str(" (a video clip)");
+        } else if beat.media.len() > 1 {
+            user.push_str(&format!(" (a burst of {} photos)", beat.media.len()));
        }
        user.push('\n');
        match (&beat.insight_title, &beat.insight_summary) {
@@ -211,7 +213,7 @@ mod tests {
    fn planned(n: usize) -> Vec<PlannedBeat> {
        (0..n)
            .map(|i| PlannedBeat {
-                photos: vec![super::super::SegmentMedia::Photo {
+                media: vec![super::super::SegmentMedia::Photo {
                    rel_path: format!("p{i}.jpg"),
                    library_id: 1,
                }],
@@ -236,7 +238,7 @@ mod tests {
    #[test]
    fn prompt_notes_burst_photo_count() {
        let mut p = planned(1);
-        p[0].photos = vec![
+        p[0].media = vec![
            super::super::SegmentMedia::Photo {
                rel_path: "a.jpg".into(),
                library_id: 1,
@@ -254,6 +256,17 @@ mod tests {
        assert!(user.contains("a burst of 3 photos"));
    }

+    #[test]
+    fn prompt_marks_clip_beats() {
+        // Swap the single planned beat's media for one video clip and check
+        // that the user prompt labels it as such.
+        let video = super::super::SegmentMedia::Clip {
+            rel_path: "v.mp4".into(),
+            library_id: 1,
+        };
+        let mut beats = planned(1);
+        beats[0].media = vec![video];
+        let (_sys, user) = build_script_messages(&meta(), &beats);
+        assert!(user.contains("a video clip"));
+    }
+
    #[test]
    fn prompt_includes_insight_context_when_present() {
        let mut p = planned(1);
@@ -15,7 +15,7 @@ use chrono::{DateTime, Datelike, FixedOffset};

 use super::{PlannedBeat, ReelMeta, SegmentMedia};
 use crate::database::{ExifDao, InsightDao};
-use crate::file_types::is_image_file;
+use crate::file_types::{is_image_file, is_video_file};
 use crate::memories::{self, MemoriesSpan};
 use crate::state::AppState;

@@ -167,13 +167,13 @@ fn partition_into_groups(
        .collect()
 }

-/// Turn a span's photos into `n_beats` beats. Clusters photos into events by
+/// Turn photo items into `n_beats` photo beats. Clusters photos into events by
 /// time gap; if there are more events than beats, adjacent events are merged so
-/// the whole span is still covered. Each beat then flashes up to
-/// `max_burst` photos (an even spread of its group) under one narration line —
-/// so a week/month reel *shows* all its moments without a narrated (and timed)
+/// the whole span is still covered. Each beat then flashes up to `max_burst`
+/// photos (an even spread of its group) under one narration line — so a
+/// week/month reel *shows* all its moments without a narrated (and timed)
 /// segment per photo.
-pub fn form_beats(
+fn form_photo_beats(
    items: &[memories::MemoryItem],
    n_beats: usize,
    max_burst: usize,
@@ -197,7 +197,7 @@ pub fn form_beats(
            let shown = sample_evenly(&group, max_burst);
            let date = shown.first().and_then(|it| it.created);
            PlannedBeat {
-                photos: shown
+                media: shown
                    .into_iter()
                    .map(|it| SegmentMedia::Photo {
                        rel_path: it.path,
@@ -212,6 +212,62 @@ pub fn form_beats(
        .collect()
 }

+/// Split the beat budget between photo beats and video-clip beats, returning
+/// `(photo_beats, clip_beats)`.
+///
+/// * A zero budget yields `(0, 0)` regardless of what media exists.
+/// * With no videos, photos get the whole budget.
+/// * With no photos, clips get the budget, capped at the number of videos.
+/// * With both, clips get up to half the budget (at least one, never more
+///   than there are videos) and photos take whatever is left — which is zero
+///   when the budget is a single beat.
+fn split_beat_budget(n_photos: usize, n_videos: usize, n_beats: usize) -> (usize, usize) {
+    // Without this guard the "at least one clip" rule below would hand out a
+    // clip beat from an empty budget.
+    if n_beats == 0 {
+        return (0, 0);
+    }
+    match (n_photos, n_videos) {
+        (_, 0) => (n_beats, 0),
+        (0, _) => (0, n_beats.min(n_videos)),
+        _ => {
+            let clip_beats = n_videos.min((n_beats / 2).max(1));
+            (n_beats.saturating_sub(clip_beats), clip_beats)
+        }
+    }
+}
+
+/// Plan a reel's beats from a span's photos and videos within `n_beats`.
+/// The budget is divided by `split_beat_budget`; photos are turned into beats
+/// by `form_photo_beats`, and each video picked by `sample_evenly` becomes its
+/// own single-clip beat (dated by the video, no insight attached yet). The
+/// combined list is then ordered by date, with undated beats at the end.
+pub fn form_beats(
+    photos: &[memories::MemoryItem],
+    videos: &[memories::MemoryItem],
+    n_beats: usize,
+    max_burst: usize,
+) -> Vec<PlannedBeat> {
+    if n_beats == 0 {
+        return Vec::new();
+    }
+    let (photo_budget, clip_budget) = split_beat_budget(photos.len(), videos.len(), n_beats);
+
+    // NOTE(review): `photo_budget` is 0 here when `n_beats == 1` and both
+    // kinds exist — confirm `form_photo_beats` tolerates a zero budget.
+    let mut beats = form_photo_beats(photos, photo_budget, max_burst);
+
+    let clip_beats = sample_evenly(videos, clip_budget)
+        .into_iter()
+        .map(|video| PlannedBeat {
+            media: vec![SegmentMedia::Clip {
+                rel_path: video.path,
+                library_id: video.library_id,
+            }],
+            date: video.created,
+            insight_title: None,
+            insight_summary: None,
+        });
+    beats.extend(clip_beats);
+
+    // Stable sort on (is-undated, date): dated beats first in ascending date
+    // order, undated beats afterwards in their existing relative order.
+    beats.sort_by_key(|beat| (beat.date.is_none(), beat.date));
+    beats
+}
+
 /// Cheap pass: resolve the selector into an ordered list of media (no insight
 /// lookups yet) plus reel metadata. `Err` only on an invalid library param.
 pub fn resolve(
@@ -238,23 +294,24 @@ pub fn resolve(
                library.as_deref(),
            )?;

-            // Phase 1 is photos-only: drop videos (a clip segment type lands
-            // in phase 2).
-            let items: Vec<memories::MemoryItem> = items
-                .into_iter()
-                .filter(|it| is_image_file(Path::new(&it.path)))
-                .collect();
-
-            // Years are derived from the whole span (what the reel represents),
-            // before the budget narrows it down to beats.
+            // Split into photos and video clips; anything that's neither is
+            // dropped. Years span both, computed before the budget narrows it.
            let years = distinct_years(&items, client_tz);
            let meta = ReelMeta { span: *span, years };

+            let (photos, videos): (Vec<_>, Vec<_>) = items
+                .into_iter()
+                .filter(|it| {
+                    is_image_file(Path::new(&it.path)) || is_video_file(Path::new(&it.path))
+                })
+                .partition(|it| is_image_file(Path::new(&it.path)));
+
            // The budget caps the number of narrated beats (≈ reel length);
-            // each beat then bursts through several photos, so the reel covers
-            // the span's moments without running minutes long.
+            // photo beats then burst through several photos and video beats
+            // play a short clip, so the reel covers the span without running
+            // minutes long.
            let n_beats = budget_segments(*max_segments);
-            let beats = form_beats(&items, n_beats, MAX_BURST_PHOTOS);
+            let beats = form_beats(&photos, &videos, n_beats, MAX_BURST_PHOTOS);
            Ok((beats, meta))
        }
    }
@@ -289,10 +346,13 @@ pub fn enrich(
        return;
    };
    for beat in beats.iter_mut() {
-        let Some(SegmentMedia::Photo { rel_path, .. }) = beat.photos.first() else {
-            continue;
+        let rel_path = match beat.media.first() {
+            Some(SegmentMedia::Photo { rel_path, .. } | SegmentMedia::Clip { rel_path, .. }) => {
+                rel_path.clone()
+            }
+            None => continue,
        };
-        if let Ok(Some(insight)) = dao.get_insight(span_context, rel_path) {
+        if let Ok(Some(insight)) = dao.get_insight(span_context, &rel_path) {
            beat.insight_title = Some(insight.title);
            beat.insight_summary = Some(insight.summary);
        }
@@ -372,15 +432,18 @@ mod tests {
        assert_eq!(distinct_years(&items, None), vec![2019, 2021]);
    }

-    // Build an item at a given unix timestamp (seconds).
-    fn item_at(ts: i64, name: &str) -> memories::MemoryItem {
+    // Build an item at a given unix timestamp (seconds) with a chosen extension.
+    fn item_ext(ts: i64, name: &str, ext: &str) -> memories::MemoryItem {
        memories::MemoryItem {
-            path: format!("{name}.jpg"),
+            path: format!("{name}.{ext}"),
            created: Some(ts),
            modified: None,
            library_id: 1,
        }
    }
+    fn item_at(ts: i64, name: &str) -> memories::MemoryItem {
+        item_ext(ts, name, "jpg")
+    }

    #[test]
    fn budget_segments_caps_to_duration_target() {
@@ -405,7 +468,7 @@ mod tests {
    }

    #[test]
-    fn form_beats_one_beat_per_event_when_they_fit() {
+    fn photo_beats_one_per_event_when_they_fit() {
        // Three well-separated events, budget of 10 → three beats, each holding
        // all of its (few) photos.
        let items = vec![
@@ -414,35 +477,70 @@ mod tests {
            item_at(1_000_000, "c"),
            item_at(2_000_000, "d"),
        ];
-        let beats = form_beats(&items, 10, MAX_BURST_PHOTOS);
+        let beats = form_photo_beats(&items, 10, MAX_BURST_PHOTOS);
        assert_eq!(beats.len(), 3);
-        assert_eq!(beats[0].photos.len(), 2); // burst of the first event
-        assert_eq!(beats[1].photos.len(), 1);
-        assert_eq!(beats[2].photos.len(), 1);
+        assert_eq!(beats[0].media.len(), 2); // burst of the first event
+        assert_eq!(beats[1].media.len(), 1);
+        assert_eq!(beats[2].media.len(), 1);
    }

    #[test]
-    fn form_beats_merges_events_when_over_budget() {
+    fn photo_beats_merge_events_when_over_budget() {
        // Six distinct events but only two beats → adjacent events fold in, and
        // every event's photos still appear (capped by the burst max).
        let items: Vec<memories::MemoryItem> = (0..6)
            .map(|i| item_at(i as i64 * 1_000_000, &format!("e{i}")))
            .collect();
-        let beats = form_beats(&items, 2, MAX_BURST_PHOTOS);
+        let beats = form_photo_beats(&items, 2, MAX_BURST_PHOTOS);
        assert_eq!(beats.len(), 2);
-        let shown: usize = beats.iter().map(|b| b.photos.len()).sum();
+        let shown: usize = beats.iter().map(|b| b.media.len()).sum();
        assert_eq!(shown, 6); // all six moments still shown across two beats
    }

    #[test]
-    fn form_beats_caps_burst_to_max() {
+    fn photo_beats_cap_burst_to_max() {
        // One dense event of 30 photos, generous budget → a single beat that
        // bursts at most MAX_BURST_PHOTOS, not all 30.
        let items: Vec<memories::MemoryItem> = (0..30)
            .map(|i| item_at(i as i64, &format!("p{i}")))
            .collect();
-        let beats = form_beats(&items, 18, MAX_BURST_PHOTOS);
+        let beats = form_photo_beats(&items, 18, MAX_BURST_PHOTOS);
        assert_eq!(beats.len(), 1);
-        assert_eq!(beats[0].photos.len(), MAX_BURST_PHOTOS);
+        assert_eq!(beats[0].media.len(), MAX_BURST_PHOTOS);
+    }
+
+    #[test]
+    fn split_beat_budget_handles_each_mix() {
+        // (n_photos, n_videos, n_beats) -> (photo_beats, clip_beats)
+        let cases = [
+            // One kind only: it takes the whole budget (clips capped at the
+            // number of videos, or at the budget).
+            ((10, 0, 18), (18, 0)),
+            ((0, 10, 18), (0, 10)),
+            ((0, 30, 18), (0, 18)),
+            // Both kinds: clips up to half (at least one), photos the rest.
+            ((100, 100, 18), (9, 9)),
+            ((100, 1, 18), (17, 1)),
+        ];
+        for ((n_photos, n_videos, n_beats), want) in cases {
+            assert_eq!(split_beat_budget(n_photos, n_videos, n_beats), want);
+        }
+    }
+
+    #[test]
+    fn form_beats_mixes_clip_and_photo_beats_in_time_order() {
+        // Two photo events at t=0 and t=2e6, with one video between them at
+        // t=1e6.
+        let photos = vec![item_at(0, "p0"), item_at(2_000_000, "p1")];
+        let videos = vec![item_ext(1_000_000, "v0", "mp4")];
+        let beats = form_beats(&photos, &videos, 10, MAX_BURST_PHOTOS);
+
+        // Three beats in chronological order: photo, clip, photo.
+        assert_eq!(beats.len(), 3);
+        let clip_flags: Vec<bool> = beats.iter().map(|beat| beat.is_clip()).collect();
+        assert_eq!(clip_flags, [false, true, false]);
+        assert!(matches!(beats[1].media[0], SegmentMedia::Clip { .. }));
+    }
+
+    #[test]
+    fn form_beats_videos_only_become_clip_beats() {
+        let videos: Vec<memories::MemoryItem> = (0..3)
+            .map(|i| item_ext(i as i64 * 1_000_000, &format!("v{i}"), "mov"))
+            .collect();
+        let beats = form_beats(&[], &videos, 10, MAX_BURST_PHOTOS);
+        assert_eq!(beats.len(), 3);
+        assert!(beats.iter().all(|b| b.is_clip()));
    }
 }