4 changed files with 479 additions and 91 deletions
@@ -36,21 +36,40 @@ use crate::otel::extract_context_from_request;
 use crate::state::AppState;
 use selector::ReelSelector;

-/// The media behind one shot. Photos-only for now; a `Clip` variant (a section
-/// of a source video) is the phase-2 extension point.
+/// The media behind one shot: a still photo, or a short section of a source
+/// video (played with its live audio ducked under the narration). Both carry
+/// only a library id and a library-relative path; the renderer chooses the
+/// clip window itself (the start of the source, capped by a length constant).
 #[derive(Debug, Clone)]
 pub enum SegmentMedia {
    Photo { rel_path: String, library_id: i32 },
+    Clip { rel_path: String, library_id: i32 },
 }

-/// A beat: one narration line over one or more photos. A single-photo beat is a
-/// held shot; a multi-photo beat is a quick burst that flashes through several
-/// moments of the same event while the line is read — so a week/month reel can
-/// *show* everything it spans without a narration line (and the seconds that
-/// come with it) per photo.
+impl SegmentMedia {
+    /// Library-relative path of the underlying file, for either variant.
+    fn rel_path(&self) -> &str {
+        let (Self::Photo { rel_path, .. } | Self::Clip { rel_path, .. }) = self;
+        rel_path
+    }
+
+    /// Id of the library that holds the file, for either variant.
+    fn library_id(&self) -> i32 {
+        let (Self::Photo { library_id, .. } | Self::Clip { library_id, .. }) = self;
+        *library_id
+    }
+    // The or-patterns above are irrefutable only while every variant has both fields.
+}
+
+/// A beat: one narration line over its media. A photo beat holds one still (a
+/// held shot) or several (a quick burst that flashes through moments of an
+/// event while the line is read). A clip beat holds a single video clip. Either
+/// way one narration line covers the whole beat, so a week/month reel can
+/// *show* everything it spans without a narration line — and the seconds that
+/// come with it — per item.
 #[derive(Debug, Clone)]
 pub struct PlannedBeat {
-    pub photos: Vec<SegmentMedia>,
+    pub media: Vec<SegmentMedia>,
    pub date: Option<i64>,
    pub insight_title: Option<String>,
    pub insight_summary: Option<String>,
@@ -63,6 +82,11 @@ impl PlannedBeat {
        let dt = DateTime::from_timestamp(ts, 0)?;
        Some(dt.format("%B %-d, %Y").to_string())
    }
+
+    /// Whether the beat holds exactly one media item and that item is a clip.
+    pub fn is_clip(&self) -> bool {
+        self.media.len() == 1 && matches!(self.media.first(), Some(SegmentMedia::Clip { .. }))
+    }
 }

 /// Reel-wide metadata the scripter uses for framing.
@@ -183,7 +207,7 @@ fn finish_job(

 /// Render version: bump to invalidate every cached reel after a rendering /
 /// scripting change that should produce a fresh result.
-const RENDER_VERSION: u32 = 4;
+const RENDER_VERSION: u32 = 5;

 /// Narration expressiveness — Chatterbox's `exaggeration` knob. A slight bump
 /// over the ~0.5 default warms up otherwise-flat narration without over-acting;
@@ -207,12 +231,13 @@ fn cache_key(selector: &ReelSelector, media: &[SegmentMedia], voice: Option<&str
        voice.unwrap_or("default")
    );
    for m in media {
-        match m {
-            SegmentMedia::Photo {
-                rel_path,
-                library_id,
-            } => buf.push_str(&format!("{library_id}:{rel_path}|")),
-        }
+        // Tag photo vs clip so the same path used as a still and as a video
+        // clip produce different keys.
+        let tag = match m {
+            SegmentMedia::Photo { .. } => 'P',
+            SegmentMedia::Clip { .. } => 'C',
+        };
+        buf.push_str(&format!("{tag}{}:{}|", m.library_id(), m.rel_path()));
    }
    blake3::hash(buf.as_bytes()).to_hex().to_string()
 }
@@ -309,9 +334,9 @@ pub async fn create_reel_handler(
        }));
    }

-    // Flatten every photo across beats (in order) into the cache key — the key
-    // tracks exactly which photos appear and in what sequence.
-    let media: Vec<SegmentMedia> = planned.iter().flat_map(|b| b.photos.clone()).collect();
+    // Flatten every media item across beats (in order) into the cache key — the
+    // key tracks exactly which photos/clips appear and in what sequence.
+    let media: Vec<SegmentMedia> = planned.iter().flat_map(|b| b.media.clone()).collect();
    let voice = req.voice.clone().filter(|s| !s.is_empty());
    let key = cache_key(&selector, &media, voice.as_deref());

@@ -462,7 +487,7 @@ async fn run_reel_job(
    use anyhow::{Context, anyhow};

    let started = Instant::now();
-    let total_photos: usize = planned.iter().map(|b| b.photos.len()).sum();
+    let total_photos: usize = planned.iter().map(|b| b.media.len()).sum();
    log::info!(
        "reel {job_id}: starting — span {:?}, {} beats, {} photos, voice={}",
        meta.span,
@@ -510,15 +535,15 @@ async fn run_reel_job(
    let beat_total = planned.len();
    let mut beat_files: Vec<String> = Vec::new();
    for (i, (beat, line)) in planned.iter().zip(script.lines.iter()).enumerate() {
-        // Resolve all of the beat's photos to absolute paths; drop any that
-        // don't resolve. An empty beat is skipped.
-        let image_paths: Vec<PathBuf> = beat
-            .photos
+        // Resolve the beat's media to absolute paths; drop any that don't
+        // resolve. An empty beat is skipped.
+        let paths: Vec<PathBuf> = beat
+            .media
            .iter()
-            .filter_map(|m| resolve_image_path(app_state, m))
+            .filter_map(|m| resolve_media_path(app_state, m))
            .collect();
-        if image_paths.is_empty() {
-            log::warn!("reel {job_id}: skipping beat {i}, no image paths resolved");
+        if paths.is_empty() {
+            log::warn!("reel {job_id}: skipping beat {i}, no media paths resolved");
            continue;
        }

@@ -551,17 +576,26 @@ async fn run_reel_job(
                .unwrap_or(render::MIN_SEGMENT_SECONDS);

        set_stage(job_id, "rendering");
-        log::info!(
-            "reel {job_id}: beat {}/{} — {} photo(s), narration {:.1}s",
-            i + 1,
-            beat_total,
-            image_paths.len(),
-            narration_secs
-        );
        let beat_out = work.path().join(format!("beat_{i:03}.mp4"));
-        if let Err(e) =
-            render::render_beat(&image_paths, &audio_path, &beat_out, narration_secs, &opts).await
-        {
+        let render_result = if beat.is_clip() {
+            log::info!(
+                "reel {job_id}: beat {}/{} — video clip, narration {:.1}s",
+                i + 1,
+                beat_total,
+                narration_secs
+            );
+            render::render_clip_beat(&paths[0], &audio_path, &beat_out, narration_secs, &opts).await
+        } else {
+            log::info!(
+                "reel {job_id}: beat {}/{} — {} photo(s), narration {:.1}s",
+                i + 1,
+                beat_total,
+                paths.len(),
+                narration_secs
+            );
+            render::render_beat(&paths, &audio_path, &beat_out, narration_secs, &opts).await
+        };
+        if let Err(e) = render_result {
            log::warn!("reel {job_id}: skipping beat {i}, render failed: {e}");
            continue;
        }
@@ -603,15 +637,12 @@ async fn run_reel_job(
    Ok((script.title, final_path))
 }

-/// Resolve a photo segment's library-relative path to a validated absolute
-/// path under its library root.
-fn resolve_image_path(app_state: &AppState, media: &SegmentMedia) -> Option<PathBuf> {
-    let SegmentMedia::Photo {
-        rel_path,
-        library_id,
-    } = media;
-    let lib = app_state.library_by_id(*library_id)?;
-    crate::files::is_valid_full_path(&lib.root_path, rel_path, false)
+/// Map a media item (photo or clip) from its library-relative path to an
+/// absolute path validated against its library root; `None` if either fails.
+fn resolve_media_path(app_state: &AppState, media: &SegmentMedia) -> Option<PathBuf> {
+    let rel_path = media.rel_path().to_string();
+    let library = app_state.library_by_id(media.library_id())?;
+    crate::files::is_valid_full_path(&library.root_path, &rel_path, false)
+}

 #[cfg(test)]
@@ -625,6 +656,13 @@ mod tests {
        }
    }

+    // Test shorthand: a `Clip` item at path `p` in library `lib`.
+    fn clip(p: &str, lib: i32) -> SegmentMedia {
+        let rel_path = String::from(p);
+        let library_id = lib;
+        SegmentMedia::Clip { rel_path, library_id }
+    }
+
    fn day_selector() -> ReelSelector {
        ReelSelector::Memories {
            span: MemoriesSpan::Day,
@@ -668,6 +706,35 @@ mod tests {
        assert_ne!(base, cache_key(&week, &media, Some("grandma")));
    }

+    #[test]
+    fn cache_key_distinguishes_photo_from_clip() {
+        // One path in one library, first as a still and then as a clip. The
+        // key tags each item with its kind, so the two reels must not share
+        // a cache entry.
+        let still_key = cache_key(&day_selector(), &[photo("v.mp4", 1)], None);
+        let clip_key = cache_key(&day_selector(), &[clip("v.mp4", 1)], None);
+
+        assert_ne!(still_key, clip_key);
+    }
+
+    #[test]
+    fn is_clip_only_for_single_clip_beat() {
+        let beat_with = |media: Vec<SegmentMedia>| PlannedBeat {
+            media,
+            date: None,
+            insight_title: None,
+            insight_summary: None,
+        };
+
+        // Exactly one clip: a clip beat.
+        let clip_beat = beat_with(vec![clip("v.mp4", 1)]);
+        assert!(clip_beat.is_clip());
+
+        // A burst of stills: not a clip beat.
+        let photo_beat = beat_with(vec![photo("a.jpg", 1), photo("b.jpg", 1)]);
+        assert!(!photo_beat.is_clip());
+    }
+
    #[test]
    fn span_phrase_maps_each_span() {
        let mk = |span| ReelMeta {
@@ -682,7 +749,7 @@ mod tests {
    #[test]
    fn date_label_formats_or_none() {
        let beat = PlannedBeat {
-            photos: vec![photo("a.jpg", 1)],
+            media: vec![photo("a.jpg", 1)],
            date: Some(1_560_384_000), // 2019-06-13 UTC
            insight_title: None,
            insight_summary: None,
@@ -690,7 +757,7 @@ mod tests {
        assert!(beat.date_label().unwrap().contains("2019"));

        let undated = PlannedBeat {
-            photos: vec![photo("a.jpg", 1)],
+            media: vec![photo("a.jpg", 1)],
            date: None,
            insight_title: None,
            insight_summary: None,
@@ -36,9 +36,15 @@ pub const MIN_SEGMENT_SECONDS: f64 = 2.5;
 const NARRATION_TAIL_SECONDS: f64 = 0.6;

 /// Fade durations baked into each photo. A held (single-photo) beat gets a
-/// gentle dip; burst photos get a snappier fade so the montage feels quick.
+/// gentle dip; burst photos get a much snappier fade so the difference between
+/// a held shot and a quick burst is obvious.
 const SINGLE_FADE_SECONDS: f64 = 0.35;
-const BURST_FADE_SECONDS: f64 = 0.15;
+const BURST_FADE_SECONDS: f64 = 0.08;
+
+/// Video-clip framing. A clip beat reads at most this many seconds from the
+/// start of its source; live audio is scaled by `CLIP_DUCK_VOLUME` in the mix.
+pub const CLIP_SECONDS: f64 = 5.0;
+const CLIP_DUCK_VOLUME: f64 = 0.35;

 /// Floor on how long each burst photo stays up, so a long line over many photos
 /// doesn't flash them subliminally. If the narration is too short to give every
@@ -308,6 +314,162 @@ pub async fn render_beat(
    run_ffmpeg(&args, "beat render").await
 }

+// --- Video-clip beats --------------------------------------------------------
+
+/// Video half of a clip beat's filtergraph, ending in the `[v]` pad: the clip
+/// is fitted over a blurred, cropped copy of itself on the output canvas, set
+/// to the target fps, extended by cloning its last frame (`tpad`) when the
+/// beat outlasts it by more than 0.05s, then faded in and out.
+fn clip_video_filter(opts: &SegmentOpts, clip_dur: f64, beat_total: f64) -> String {
+    let (width, height) = (opts.width, opts.height);
+    let fade_len = SINGLE_FADE_SECONDS;
+    let fade_out_at = (beat_total - fade_len).max(0.0);
+    let freeze = (beat_total - clip_dur).max(0.0);
+
+    let mut graph = String::from("[0:v]split=2[bg][fg];");
+    graph += &format!("[bg]scale={width}:{height}:force_original_aspect_ratio=increase,");
+    graph += &format!("crop={width}:{height},boxblur=20:2[bgb];");
+    graph += &format!("[fg]scale={width}:{height}:force_original_aspect_ratio=decrease[fgs];");
+    graph += &format!("[bgb][fgs]overlay=(W-w)/2:(H-h)/2,fps={}", opts.fps);
+    if freeze > 0.05 {
+        // Narration runs past the clip: hold its final frame for the rest.
+        graph += &format!(",tpad=stop_mode=clone:stop_duration={freeze:.3}");
+    }
+    graph += &format!(",fade=t=in:st=0:d={fade_len},");
+    graph += &format!("fade=t=out:st={fade_out_at:.3}:d={fade_len},");
+    graph += "setsar=1,format=yuv420p[v]";
+    graph
+}
+
+/// Audio half of a clip beat's filtergraph, ending in the `[a]` pad. Without
+/// clip audio the padded narration passes straight through; with it, the clip
+/// track is scaled by `CLIP_DUCK_VOLUME` and mixed with the padded narration.
+fn clip_audio_filter(has_audio: bool) -> String {
+    if !has_audio {
+        return String::from("[1:a]apad[a]");
+    }
+    let duck = format!("[0:a]volume={CLIP_DUCK_VOLUME}[duck]");
+    let narration = "[1:a]apad[narr]";
+    let mix = "[duck][narr]amix=inputs=2:duration=longest:normalize=0[a]";
+    [duck.as_str(), narration, mix].join(";")
+}
+
+/// Complete `filter_complex` graph for a clip beat: the video chain and the
+/// audio chain joined with `;`. Input 0 is the clip, input 1 the narration.
+pub fn clip_beat_filtergraph(
+    opts: &SegmentOpts,
+    clip_dur: f64,
+    beat_total: f64,
+    has_audio: bool,
+) -> String {
+    let mut graph = clip_video_filter(opts, clip_dur, beat_total);
+    graph.push(';');
+    graph.push_str(&clip_audio_filter(has_audio));
+    graph
+}
+
+/// Assemble the ffmpeg command line for one clip beat. Input 0 is the clip,
+/// read for at most `clip_dur` seconds from its start; input 1 is the
+/// narration. The clip filtergraph supplies `[v]` and `[a]`; the output is cut
+/// at `beat_total` seconds, forced to `opts.fps`, and carries AAC audio.
+pub fn build_clip_beat_args(
+    clip_path: &str,
+    audio_path: &str,
+    out_path: &str,
+    clip_dur: f64,
+    beat_total: f64,
+    has_audio: bool,
+    opts: &SegmentOpts,
+) -> Vec<String> {
+    let graph = clip_beat_filtergraph(opts, clip_dur, beat_total, has_audio);
+    let clip_window = format!("{clip_dur:.3}");
+    let beat_window = format!("{beat_total:.3}");
+    let frame_rate = opts.fps.to_string();
+    let mut args = vec![String::from("-y")];
+    if opts.nvenc {
+        args.push(String::from("-hwaccel"));
+        args.push(String::from("cuda"));
+    }
+    // Inputs. `-t` ahead of the first `-i` bounds how much of the clip is read;
+    // the narration input has no bound of its own (the graph pads it).
+    args.push(String::from("-t"));
+    args.push(clip_window);
+    for input in [clip_path, audio_path] {
+        args.push(String::from("-i"));
+        args.push(String::from(input));
+    }
+    // The filtergraph and the two pads it exposes.
+    args.push(String::from("-filter_complex"));
+    args.push(graph);
+    for pad in ["[v]", "[a]"] {
+        args.push(String::from("-map"));
+        args.push(String::from(pad));
+    }
+
+    // Output side: this `-t` caps the whole beat; encoders follow.
+    args.extend([String::from("-t"), beat_window, String::from("-r"), frame_rate]);
+    args.extend(video_encoder_args(opts.nvenc));
+    args.extend(["-c:a", "aac", "-b:a", "160k", "-ar", "48000"].map(String::from));
+    args.push(String::from(out_path));
+    args
+}
+
+/// Whether `path` has at least one audio stream, so a clip beat knows whether
+/// to mix in live audio. `false` if ffprobe can't run, fails, or lists none.
+pub async fn has_audio_stream(path: &str) -> bool {
+    Command::new("ffprobe")
+        .args([
+            "-v",
+            "error",
+            "-select_streams",
+            "a",
+            "-show_entries",
+            "stream=index",
+            "-of",
+            "csv=p=0",
+            path,
+        ])
+        .output()
+        .await
+        .map(|out| out.status.success() && !out.stdout.is_empty())
+        .unwrap_or(false)
+}
+
+/// Render one clip beat: the opening of `clip_path` — at most [`CLIP_SECONDS`],
+/// or the whole source when that is shorter — under the narration in
+/// `audio_path`. The beat lasts at least as long as both the clip window and
+/// the narration's segment duration.
+pub async fn render_clip_beat(
+    clip_path: &Path,
+    audio_path: &Path,
+    out_path: &Path,
+    narration_secs: f64,
+    opts: &SegmentOpts,
+) -> Result<()> {
+    let clip = clip_path.to_string_lossy().into_owned();
+
+    // Take the source's own length when it is known, positive and under the
+    // cap; a failed or empty probe falls back to the cap.
+    let clip_dur = crate::video::ffmpeg::get_duration_seconds(&clip)
+        .await
+        .ok()
+        .flatten()
+        .filter(|secs| *secs > 0.0 && *secs < CLIP_SECONDS)
+        .unwrap_or(CLIP_SECONDS);
+    let beat_total = clip_dur.max(segment_duration(narration_secs));
+    let has_audio = has_audio_stream(&clip).await;
+    let args = build_clip_beat_args(
+        &clip,
+        &audio_path.to_string_lossy(),
+        &out_path.to_string_lossy(),
+        clip_dur,
+        beat_total,
+        has_audio,
+        opts,
+    );
+    run_ffmpeg(&args, "clip beat render").await
+}
+
 /// Join rendered segments into the final reel. Writes the concat list into the
 /// same directory as the output so relative paths and cleanup stay local.
 pub async fn concat_segments(segment_paths: &[String], out_path: &Path) -> Result<()> {
@@ -397,8 +559,8 @@ mod tests {
        // Concatenated in order, audio is the 4th input (index 3).
        assert!(g.contains("[v0][v1][v2]concat=n=3:v=1:a=0[v]"));
        assert!(g.contains("[3:a]apad[a]"));
-        // Burst uses the snappier fade.
-        assert!(g.contains("d=0.15"));
+        // Burst uses the much snappier fade (vs 0.35 for a held shot).
+        assert!(g.contains("d=0.08"));
        assert!(!g.contains("d=0.35"));
    }

@@ -455,6 +617,54 @@ mod tests {
        assert!(!joined.contains("libx264"));
    }

+    #[test]
+    fn clip_filter_ducks_audio_and_holds_last_frame_when_narration_longer() {
+        // 5s clip under a 7s beat: 2s last-frame freeze, ducked mix, blurred fill.
+        let graph = clip_beat_filtergraph(&SegmentOpts::default(), 5.0, 7.0, true);
+        let has = |needle: &str| graph.contains(needle);
+        assert!(has("tpad=stop_mode=clone:stop_duration=2.000"));
+        assert!(has("volume=0.35"));
+        assert!(has("amix=inputs=2"));
+        assert!(has("[1:a]apad[narr]"));
+        assert!(has("boxblur"));
+        assert!(has("overlay=(W-w)/2:(H-h)/2"));
+    }
+
+    #[test]
+    fn clip_filter_no_tpad_when_clip_covers_the_beat() {
+        // Beat and clip are both 5s, so there is nothing left to freeze.
+        let opts = SegmentOpts::default();
+        assert!(!clip_beat_filtergraph(&opts, 5.0, 5.0, true).contains("tpad"));
+    }
+
+    #[test]
+    fn clip_filter_narration_only_without_clip_audio() {
+        // No clip audio: nothing to duck or mix, only the padded narration.
+        let graph = clip_beat_filtergraph(&SegmentOpts::default(), 5.0, 5.0, false);
+        assert!(graph.contains("[1:a]apad[a]"));
+        assert!(!(graph.contains("amix") || graph.contains("volume=")));
+    }
+
+    #[test]
+    fn clip_beat_args_bound_clip_and_output() {
+        let opts = SegmentOpts::default();
+        let args = build_clip_beat_args("/v.mp4", "/n.wav", "/out.mp4", 5.0, 6.6, true, &opts);
+        let cmdline = args.join(" ");
+
+        // The clip input carries its own `-t`, placed directly before its `-i`.
+        assert!(cmdline.contains("-t 5.000 -i /v.mp4"));
+
+        // The narration is passed as a further input.
+        assert!(cmdline.contains("-i /n.wav"));
+
+        // The output side caps the whole beat and pins the frame rate.
+        assert!(cmdline.contains("-t 6.600"));
+        assert!(cmdline.contains("-r 30"));
+
+        // The output path is the final argument.
+        assert!(cmdline.ends_with("/out.mp4"));
+    }
+
    #[test]
    fn concat_args_stream_copy_with_faststart_and_forced_muxer() {
        // Output goes to a .tmp path, so the muxer must be forced — ffmpeg
@@ -54,8 +54,10 @@ pub fn build_script_messages(meta: &ReelMeta, beats: &[PlannedBeat]) -> (String,
        if let Some(date) = beat.date_label() {
            user.push_str(&format!(" {date}"));
        }
-        if beat.photos.len() > 1 {
-            user.push_str(&format!(" (a burst of {} photos)", beat.photos.len()));
+        if beat.is_clip() {
+            user.push_str(" (a video clip)");
+        } else if beat.media.len() > 1 {
+            user.push_str(&format!(" (a burst of {} photos)", beat.media.len()));
        }
        user.push('\n');
        match (&beat.insight_title, &beat.insight_summary) {
@@ -211,7 +213,7 @@ mod tests {
    fn planned(n: usize) -> Vec<PlannedBeat> {
        (0..n)
            .map(|i| PlannedBeat {
-                photos: vec![super::super::SegmentMedia::Photo {
+                media: vec![super::super::SegmentMedia::Photo {
                    rel_path: format!("p{i}.jpg"),
                    library_id: 1,
                }],
@@ -236,7 +238,7 @@ mod tests {
    #[test]
    fn prompt_notes_burst_photo_count() {
        let mut p = planned(1);
-        p[0].photos = vec![
+        p[0].media = vec![
            super::super::SegmentMedia::Photo {
                rel_path: "a.jpg".into(),
                library_id: 1,
@@ -254,6 +256,17 @@ mod tests {
        assert!(user.contains("a burst of 3 photos"));
    }

+    #[test]
+    fn prompt_marks_clip_beats() {
+        let clip = super::super::SegmentMedia::Clip {
+            rel_path: String::from("v.mp4"),
+            library_id: 1,
+        };
+        let mut beats = planned(1);
+        beats[0].media = vec![clip];
+        assert!(build_script_messages(&meta(), &beats).1.contains("a video clip"));
+    }
+
    #[test]
    fn prompt_includes_insight_context_when_present() {
        let mut p = planned(1);
@@ -15,7 +15,7 @@ use chrono::{DateTime, Datelike, FixedOffset};

 use super::{PlannedBeat, ReelMeta, SegmentMedia};
 use crate::database::{ExifDao, InsightDao};
-use crate::file_types::is_image_file;
+use crate::file_types::{is_image_file, is_video_file};
 use crate::memories::{self, MemoriesSpan};
 use crate::state::AppState;

@@ -167,13 +167,13 @@ fn partition_into_groups(
        .collect()
 }

-/// Turn a span's photos into `n_beats` beats. Clusters photos into events by
+/// Turn photo items into `n_beats` photo beats. Clusters photos into events by
 /// time gap; if there are more events than beats, adjacent events are merged so
-/// the whole span is still covered. Each beat then flashes up to
-/// `max_burst` photos (an even spread of its group) under one narration line —
-/// so a week/month reel *shows* all its moments without a narrated (and timed)
+/// the whole span is still covered. Each beat then flashes up to `max_burst`
+/// photos (an even spread of its group) under one narration line — so a
+/// week/month reel *shows* all its moments without a narrated (and timed)
 /// segment per photo.
-pub fn form_beats(
+fn form_photo_beats(
    items: &[memories::MemoryItem],
    n_beats: usize,
    max_burst: usize,
@@ -197,7 +197,7 @@ pub fn form_beats(
            let shown = sample_evenly(&group, max_burst);
            let date = shown.first().and_then(|it| it.created);
            PlannedBeat {
-                photos: shown
+                media: shown
                    .into_iter()
                    .map(|it| SegmentMedia::Photo {
                        rel_path: it.path,
@@ -212,6 +212,62 @@ pub fn form_beats(
        .collect()
 }

+/// Split the beat budget between photo beats and video-clip beats, returning
+/// `(photo_beats, clip_beats)`. With one kind present it gets the whole budget
+/// (clips capped at one per video). Mixed: clips get up to half, at least one
+/// when the budget allows, and photos the rest. The sum never exceeds `n_beats`.
+fn split_beat_budget(n_photos: usize, n_videos: usize, n_beats: usize) -> (usize, usize) {
+    if n_videos == 0 {
+        return (n_beats, 0);
+    }
+    if n_photos == 0 {
+        return (0, n_beats.min(n_videos));
+    }
+    // The `max(1)` floor must not outgrow the budget itself (n_beats == 0).
+    let clip_beats = n_videos.min((n_beats / 2).max(1)).min(n_beats);
+    (n_beats - clip_beats, clip_beats)
+}
+
+/// Plan a reel's beats from a span's photos and videos within `n_beats`.
+/// The budget is first split between the two kinds. Photos are clustered
+/// into (burst) beats; an even sample of the videos becomes one single-clip
+/// beat each, dated by the video's `created` time. The combined list is
+/// returned in date order with undated beats at the end. A zero budget
+/// yields no beats.
+pub fn form_beats(
+    photos: &[memories::MemoryItem],
+    videos: &[memories::MemoryItem],
+    n_beats: usize,
+    max_burst: usize,
+) -> Vec<PlannedBeat> {
+    if n_beats == 0 {
+        return Vec::new();
+    }
+    let (photo_budget, clip_budget) = split_beat_budget(photos.len(), videos.len(), n_beats);
+
+    // Photo beats first, built by `form_photo_beats` under the photo share
+    // of the budget.
+    let mut beats = form_photo_beats(photos, photo_budget, max_burst);
+
+    // Then one single-clip beat per sampled video. No insight is attached
+    // here; both insight fields start out empty.
+    let clips = sample_evenly(videos, clip_budget);
+    beats.extend(clips.into_iter().map(|video| PlannedBeat {
+        media: vec![SegmentMedia::Clip {
+            rel_path: video.path,
+            library_id: video.library_id,
+        }],
+        date: video.created,
+        insight_title: None,
+        insight_summary: None,
+    }));
+
+    // Interleave both kinds by date. `false < true` puts dated beats ahead of
+    // undated ones, and the sort is stable, so ties keep their current order.
+    beats.sort_by_key(|beat| (beat.date.is_none(), beat.date));
+    beats
+}
+
 /// Cheap pass: resolve the selector into an ordered list of media (no insight
 /// lookups yet) plus reel metadata. `Err` only on an invalid library param.
 pub fn resolve(
@@ -238,23 +294,24 @@ pub fn resolve(
                library.as_deref(),
            )?;

-            // Phase 1 is photos-only: drop videos (a clip segment type lands
-            // in phase 2).
-            let items: Vec<memories::MemoryItem> = items
-                .into_iter()
-                .filter(|it| is_image_file(Path::new(&it.path)))
-                .collect();
-
-            // Years are derived from the whole span (what the reel represents),
-            // before the budget narrows it down to beats.
+            // Split into photos and video clips; anything that's neither is
+            // dropped. Years span both, computed before the budget narrows it.
            let years = distinct_years(&items, client_tz);
            let meta = ReelMeta { span: *span, years };

+            let (photos, videos): (Vec<_>, Vec<_>) = items
+                .into_iter()
+                .filter(|it| {
+                    is_image_file(Path::new(&it.path)) || is_video_file(Path::new(&it.path))
+                })
+                .partition(|it| is_image_file(Path::new(&it.path)));
+
            // The budget caps the number of narrated beats (≈ reel length);
-            // each beat then bursts through several photos, so the reel covers
-            // the span's moments without running minutes long.
+            // photo beats then burst through several photos and video beats
+            // play a short clip, so the reel covers the span without running
+            // minutes long.
            let n_beats = budget_segments(*max_segments);
-            let beats = form_beats(&items, n_beats, MAX_BURST_PHOTOS);
+            let beats = form_beats(&photos, &videos, n_beats, MAX_BURST_PHOTOS);
            Ok((beats, meta))
        }
    }
@@ -289,10 +346,13 @@ pub fn enrich(
        return;
    };
    for beat in beats.iter_mut() {
-        let Some(SegmentMedia::Photo { rel_path, .. }) = beat.photos.first() else {
-            continue;
+        let rel_path = match beat.media.first() {
+            Some(SegmentMedia::Photo { rel_path, .. } | SegmentMedia::Clip { rel_path, .. }) => {
+                rel_path.clone()
+            }
+            None => continue,
        };
-        if let Ok(Some(insight)) = dao.get_insight(span_context, rel_path) {
+        if let Ok(Some(insight)) = dao.get_insight(span_context, &rel_path) {
            beat.insight_title = Some(insight.title);
            beat.insight_summary = Some(insight.summary);
        }
@@ -372,15 +432,18 @@ mod tests {
        assert_eq!(distinct_years(&items, None), vec![2019, 2021]);
    }

-    // Build an item at a given unix timestamp (seconds).
-    fn item_at(ts: i64, name: &str) -> memories::MemoryItem {
+    // Build an item at a given unix timestamp (seconds) with a chosen extension.
+    fn item_ext(ts: i64, name: &str, ext: &str) -> memories::MemoryItem {
        memories::MemoryItem {
-            path: format!("{name}.jpg"),
+            path: format!("{name}.{ext}"),
            created: Some(ts),
            modified: None,
            library_id: 1,
        }
    }
+    fn item_at(ts: i64, name: &str) -> memories::MemoryItem {
+        item_ext(ts, name, "jpg")
+    }

    #[test]
    fn budget_segments_caps_to_duration_target() {
@@ -405,7 +468,7 @@ mod tests {
    }

    #[test]
-    fn form_beats_one_beat_per_event_when_they_fit() {
+    fn photo_beats_one_per_event_when_they_fit() {
        // Three well-separated events, budget of 10 → three beats, each holding
        // all of its (few) photos.
        let items = vec![
@@ -414,35 +477,70 @@ mod tests {
            item_at(1_000_000, "c"),
            item_at(2_000_000, "d"),
        ];
-        let beats = form_beats(&items, 10, MAX_BURST_PHOTOS);
+        let beats = form_photo_beats(&items, 10, MAX_BURST_PHOTOS);
        assert_eq!(beats.len(), 3);
-        assert_eq!(beats[0].photos.len(), 2); // burst of the first event
-        assert_eq!(beats[1].photos.len(), 1);
-        assert_eq!(beats[2].photos.len(), 1);
+        assert_eq!(beats[0].media.len(), 2); // burst of the first event
+        assert_eq!(beats[1].media.len(), 1);
+        assert_eq!(beats[2].media.len(), 1);
    }

    #[test]
-    fn form_beats_merges_events_when_over_budget() {
+    fn photo_beats_merge_events_when_over_budget() {
        // Six distinct events but only two beats → adjacent events fold in, and
        // every event's photos still appear (capped by the burst max).
        let items: Vec<memories::MemoryItem> = (0..6)
            .map(|i| item_at(i as i64 * 1_000_000, &format!("e{i}")))
            .collect();
-        let beats = form_beats(&items, 2, MAX_BURST_PHOTOS);
+        let beats = form_photo_beats(&items, 2, MAX_BURST_PHOTOS);
        assert_eq!(beats.len(), 2);
-        let shown: usize = beats.iter().map(|b| b.photos.len()).sum();
+        let shown: usize = beats.iter().map(|b| b.media.len()).sum();
        assert_eq!(shown, 6); // all six moments still shown across two beats
    }

    #[test]
-    fn form_beats_caps_burst_to_max() {
+    fn photo_beats_cap_burst_to_max() {
        // One dense event of 30 photos, generous budget → a single beat that
        // bursts at most MAX_BURST_PHOTOS, not all 30.
        let items: Vec<memories::MemoryItem> = (0..30)
            .map(|i| item_at(i as i64, &format!("p{i}")))
            .collect();
-        let beats = form_beats(&items, 18, MAX_BURST_PHOTOS);
+        let beats = form_photo_beats(&items, 18, MAX_BURST_PHOTOS);
        assert_eq!(beats.len(), 1);
-        assert_eq!(beats[0].photos.len(), MAX_BURST_PHOTOS);
+        assert_eq!(beats[0].media.len(), MAX_BURST_PHOTOS);
+    }
+
+    #[test]
+    fn split_beat_budget_handles_each_mix() {
+        // One kind → whole budget (clips capped per video); mixed → clips ≤ half.
+        let split = split_beat_budget;
+        assert_eq!(split(10, 0, 18), (18, 0));
+        assert_eq!(split(0, 10, 18), (0, 10));
+        assert_eq!(split(0, 30, 18), (0, 18));
+        assert_eq!(split(100, 100, 18), (9, 9));
+        assert_eq!(split(100, 1, 18), (17, 1));
+    }
+
+    #[test]
+    fn form_beats_mixes_clip_and_photo_beats_in_time_order() {
+        // Photo events at t=0 and t=2e6 with a single video between them at t=1e6.
+        let photos = [item_at(0, "p0"), item_at(2_000_000, "p1")];
+        let videos = [item_ext(1_000_000, "v0", "mp4")];
+        let beats = form_beats(&photos, &videos, 10, MAX_BURST_PHOTOS);
+
+        // Two photo beats plus one clip beat, interleaved chronologically.
+        let kinds: Vec<bool> = beats.iter().map(|beat| beat.is_clip()).collect();
+        assert_eq!(kinds, [false, true, false]);
+        // The middle beat really holds a `Clip` item.
+        assert!(matches!(beats[1].media[0], SegmentMedia::Clip { .. }));
+    }
+
+    #[test]
+    fn form_beats_videos_only_become_clip_beats() {
+        // Three videos a million seconds apart and no photos at all.
+        let videos = [0_i64, 1, 2].map(|n| item_ext(n * 1_000_000, &format!("v{n}"), "mov"));
+        let beats = form_beats(&[], &videos, 10, MAX_BURST_PHOTOS);
+
+        let clip_count = beats.iter().filter(|beat| beat.is_clip()).count();
+        assert_eq!((beats.len(), clip_count), (3, 3));
+    }
 }