From 7715a7a905015faa977eb0a45c0670f022120f20 Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Fri, 12 Jun 2026 23:10:26 -0400
Subject: [PATCH] Reels: portrait canvas with blurred fill, fade transitions,
 warmer TTS
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes the "image is tiny" problem: a 1920x1080 landscape reel letterboxes
to a ~25%-height band on a portrait phone. Switch to a portrait 1080x1920
canvas and fill it per photo with a blurred, zoomed copy of the image
behind the sharp fitted photo — so the frame is always full regardless of
the photo's orientation, with no black bars and no cropping of the subject.

Add a quick 0.35s video fade in/out baked into each segment so concatenated
photos dip smoothly instead of hard-cutting (the fade-out lands in the
narration's silent tail, so the picture never dims over speech). Drop the
unused Ken Burns branch — motion can return deliberately later.

Warm up the narration a touch: thread Chatterbox's `exaggeration` through
synthesize_serialized and default reels to 0.7 (tunable via
REEL_TTS_EXAGGERATION). Bump RENDER_VERSION so existing landscape reels
re-render.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/ai/tts.rs       |   6 ++-
 src/reels/mod.rs    |  38 ++++++++++----
 src/reels/render.rs | 123 +++++++++++++++++++++++++-------------------
 3 files changed, 101 insertions(+), 66 deletions(-)
diff --git a/src/ai/tts.rs b/src/ai/tts.rs
index 4e7544c..a9a610a 100644
--- a/src/ai/tts.rs
+++ b/src/ai/tts.rs
@@ -486,11 +486,15 @@ pub async fn synthesize_serialized(
     text: &str,
     voice: Option<&str>,
     format: &str,
+    exaggeration: Option<f32>,
 ) -> anyhow::Result<Vec<u8>> {
     let prepared = prepare_for_tts(text);
     if prepared.is_empty() {
         anyhow::bail!("nothing to synthesize after cleanup");
     }
+    // Clamp to Chatterbox's documented range, matching the HTTP handlers
+    // (which clamp before forwarding; this path bypasses them).
+    let exaggeration = exaggeration.map(|x| x.clamp(0.25, 2.0));
     // Queue rather than fast-fail (mirrors create_speech_job_handler).
     let _permit = TTS_PERMIT
         .acquire()
@@ -500,7 +504,7 @@ pub async fn synthesize_serialized(
     // starts (see ai::gpu).
     let _gpu = crate::ai::gpu::tts_lease().await;
     client
-        .text_to_speech(&prepared, voice, format, None, None, None)
+        .text_to_speech(&prepared, voice, format, exaggeration, None, None)
         .await
 }
 
diff --git a/src/reels/mod.rs b/src/reels/mod.rs
index fe270f8..9956984 100644
--- a/src/reels/mod.rs
+++ b/src/reels/mod.rs
@@ -180,7 +180,18 @@ fn finish_job(
 
 /// Render version: bump to invalidate every cached reel after a rendering /
 /// scripting change that should produce a fresh result.
-const RENDER_VERSION: u32 = 1;
+const RENDER_VERSION: u32 = 2;
+
+/// Narration expressiveness — Chatterbox's `exaggeration` knob. A modest bump
+/// over the ~0.5 default warms up otherwise-flat narration; tune via
+/// `REEL_TTS_EXAGGERATION` (the TTS call clamps to 0.25–2.0; invalid values → 0.7).
+fn reel_tts_exaggeration() -> f32 {
+    std::env::var("REEL_TTS_EXAGGERATION")
+        .ok()
+        .and_then(|s| s.trim().parse::<f32>().ok())
+        .filter(|x| x.is_finite())
+        .unwrap_or(0.7)
+}
 
 /// Cache key over everything that determines *which* media and *how* it's
 /// voiced — but not the (non-deterministic) narration text. Same inputs → same
@@ -470,16 +481,21 @@ async fn run_reel_job(
             }
         };
 
-        let audio_bytes =
-            match crate::ai::tts::synthesize_serialized(&client, line, voice.as_deref(), "wav")
-                .await
-            {
-                Ok(b) => b,
-                Err(e) => {
-                    log::warn!("reel {job_id}: skipping segment {i}, TTS failed: {e}");
-                    continue;
-                }
-            };
+        let audio_bytes = match crate::ai::tts::synthesize_serialized(
+            &client,
+            line,
+            voice.as_deref(),
+            "wav",
+            Some(reel_tts_exaggeration()),
+        )
+        .await
+        {
+            Ok(b) => b,
+            Err(e) => {
+                log::warn!("reel {job_id}: skipping segment {i}, TTS failed: {e}");
+                continue;
+            }
+        };
         let audio_path = work.path().join(format!("narration_{i:03}.wav"));
         if let Err(e) = tokio::fs::write(&audio_path, &audio_bytes).await {
             log::warn!("reel {job_id}: skipping segment {i}, writing audio failed: {e}");
diff --git a/src/reels/render.rs b/src/reels/render.rs
index 9643309..e40fc3d 100644
--- a/src/reels/render.rs
+++ b/src/reels/render.rs
@@ -19,11 +19,13 @@ use tokio::process::Command;
 /// rather than depending on `video::ffmpeg` directly.
 pub use crate::video::ffmpeg::is_nvenc_available;
 
-/// Reel canvas. Landscape matches the majority of camera photos; portrait
-/// shots are letterboxed by the `pad` in [`segment_filter`] rather than
-/// cropped, so faces never get cut off.
-pub const REEL_WIDTH: u32 = 1920;
-pub const REEL_HEIGHT: u32 = 1080;
+/// Reel canvas. Portrait, because reels are watched on a phone held upright —
+/// a landscape canvas letterboxes to a thin ~25%-height band there. Each photo
+/// is fitted sharp and centered over a blurred, zoomed copy of itself (see
+/// [`segment_filtergraph`]) so the frame is always filled regardless of the
+/// photo's orientation, without cropping the subject.
+pub const REEL_WIDTH: u32 = 1080;
+pub const REEL_HEIGHT: u32 = 1920;
 pub const REEL_FPS: u32 = 30;
 
 /// A still's screen time is its narration length plus a short breath, with a
@@ -33,6 +35,11 @@ pub const REEL_FPS: u32 = 30;
 pub const MIN_SEGMENT_SECONDS: f64 = 2.5;
 const NARRATION_TAIL_SECONDS: f64 = 0.6;
 
+/// Quick video fade in/out baked into each segment so concatenated photos dip
+/// smoothly instead of hard-cutting. The fade-out fits inside the narration's
+/// silent tail (`FADE_SECONDS` < `NARRATION_TAIL_SECONDS`); audio is untouched.
+const FADE_SECONDS: f64 = 0.35;
+
 /// Screen time for a photo segment given its narration audio length.
 pub fn segment_duration(narration_secs: f64) -> f64 {
     let d = narration_secs + NARRATION_TAIL_SECONDS;
@@ -43,16 +50,13 @@ pub fn segment_duration(narration_secs: f64) -> f64 {
     }
 }
 
-/// Options controlling per-segment rendering. `ken_burns` adds a slow zoom for
-/// motion; it's defaulted off until the effect is eyeballed on the GPU box,
-/// since a wrong zoompan expression reads as jitter and can't be verified here.
+/// Options controlling per-segment rendering.
 #[derive(Debug, Clone, Copy)]
 pub struct SegmentOpts {
     pub width: u32,
     pub height: u32,
     pub fps: u32,
     pub nvenc: bool,
-    pub ken_burns: bool,
 }
 
 impl Default for SegmentOpts {
@@ -62,35 +66,38 @@ impl Default for SegmentOpts {
             height: REEL_HEIGHT,
             fps: REEL_FPS,
             nvenc: false,
-            ken_burns: false,
         }
     }
 }
 
-/// Video filter for a photo segment: fit the image inside the canvas
-/// (preserving aspect, padding the rest), normalize SAR/fps/pixel format, and
-/// optionally apply a gentle Ken Burns zoom.
-pub fn segment_filter(opts: &SegmentOpts, duration: f64) -> String {
+/// Full `filter_complex` for one photo segment, producing labelled `[v]` (video)
+/// and `[a]` (audio) outputs. Input 0 is the looped still, input 1 the
+/// narration.
+///
+/// Video: split the still into a background and foreground. The background is
+/// scaled to *cover* the canvas and heavily blurred; the foreground is scaled to
+/// *fit* inside it and overlaid centered. This fills the portrait frame for any
+/// photo orientation — no black bars, no cropping of the subject — then a quick
+/// fade in/out softens the cut to the next segment.
+///
+/// Audio: pad the narration with trailing silence so a short line doesn't end
+/// the segment early; `-t` bounds it to the segment duration.
+pub fn segment_filtergraph(opts: &SegmentOpts, duration: f64) -> String {
     let (w, h, fps) = (opts.width, opts.height, opts.fps);
-    if opts.ken_burns {
-        // Upscale first so zoompan samples from a larger frame (avoids
-        // shimmer), drift the zoom from 1.0→~1.12 across the segment, hold the
-        // crop centered, then settle to the canvas.
-        let frames = (duration * fps as f64).round().max(1.0) as u64;
-        format!(
-            "scale={w}*2:{h}*2:force_original_aspect_ratio=increase,\
-             crop={w}*2:{h}*2,\
-             zoompan=z='min(zoom+0.0009,1.12)':d={frames}:\
-             x='iw/2-(iw/zoom/2)':y='ih/2-(ih/zoom/2)':s={w}x{h}:fps={fps},\
-             setsar=1,format=yuv420p"
-        )
-    } else {
-        format!(
-            "scale={w}:{h}:force_original_aspect_ratio=decrease,\
-             pad={w}:{h}:(ow-iw)/2:(oh-ih)/2,\
-             setsar=1,fps={fps},format=yuv420p"
-        )
-    }
+    // Fade-out begins one fade-length before the end; clamp so a duration
+    // shorter than the fade still gets a valid (non-negative) start time.
+    let fade_out_start = (duration - FADE_SECONDS).max(0.0);
+    format!(
+        "[0:v]split=2[bg][fg];\
+         [bg]scale={w}:{h}:force_original_aspect_ratio=increase,\
+         crop={w}:{h},boxblur=20:2[bgb];\
+         [fg]scale={w}:{h}:force_original_aspect_ratio=decrease[fgs];\
+         [bgb][fgs]overlay=(W-w)/2:(H-h)/2,\
+         fade=t=in:st=0:d={FADE_SECONDS},\
+         fade=t=out:st={fade_out_start:.3}:d={FADE_SECONDS},\
+         setsar=1,fps={fps},format=yuv420p[v];\
+         [1:a]apad[a]"
+    )
 }
 
 fn video_encoder_args(nvenc: bool) -> Vec<String> {
@@ -117,9 +124,9 @@ fn video_encoder_args(nvenc: bool) -> Vec<String> {
 }
 
 /// Build the ffmpeg args that render one photo segment: a still looped for
-/// `duration` seconds with its narration muxed in. The narration is padded
-/// with trailing silence (`apad`) so short lines don't end the segment early;
-/// `-t` bounds both streams to the segment length.
+/// `duration` seconds, filled to the portrait canvas with a blurred backdrop
+/// (see [`segment_filtergraph`]) and the narration muxed in. `-t` bounds both
+/// streams to the segment length.
 pub fn build_segment_args(
     image_path: &str,
     audio_path: &str,
@@ -139,7 +146,7 @@ pub fn build_segment_args(
         "-i".into(),
         audio_path.into(),
         "-filter_complex".into(),
-        format!("[0:v]{}[v];[1:a]apad[a]", segment_filter(opts, duration)),
+        segment_filtergraph(opts, duration),
         "-map".into(),
         "[v]".into(),
         "-map".into(),
@@ -267,26 +274,34 @@ mod tests {
     }
 
     #[test]
-    fn static_filter_fits_and_pads_without_cropping() {
-        let f = segment_filter(&SegmentOpts::default(), 4.0);
-        assert!(f.contains("force_original_aspect_ratio=decrease"));
-        assert!(f.contains("pad=1920:1080"));
-        assert!(f.contains("format=yuv420p"));
-        // No zoompan when ken_burns is off.
-        assert!(!f.contains("zoompan"));
+    fn filtergraph_fills_portrait_with_blurred_bg_and_fitted_fg() {
+        let g = segment_filtergraph(&SegmentOpts::default(), 4.0);
+        // Background covers + blurs; foreground fits and is centered over it.
+        assert!(g.contains("split=2[bg][fg]"));
+        assert!(g.contains("scale=1080:1920:force_original_aspect_ratio=increase"));
+        assert!(g.contains("crop=1080:1920"));
+        assert!(g.contains("boxblur"));
+        assert!(g.contains("scale=1080:1920:force_original_aspect_ratio=decrease"));
+        assert!(g.contains("overlay=(W-w)/2:(H-h)/2"));
+        // Produces the labelled outputs build_segment_args maps.
+        assert!(g.contains("[v]"));
+        assert!(g.contains("[1:a]apad[a]"));
+        assert!(g.contains("format=yuv420p"));
     }
 
     #[test]
-    fn ken_burns_filter_uses_duration_scaled_frame_count() {
-        let opts = SegmentOpts {
-            ken_burns: true,
-            ..SegmentOpts::default()
-        };
-        // 4s * 30fps = 120 frames in the zoompan d= term.
-        let f = segment_filter(&opts, 4.0);
-        assert!(f.contains("zoompan"));
-        assert!(f.contains("d=120:"));
-        assert!(f.contains("s=1920x1080"));
+    fn filtergraph_fades_in_and_out_within_duration() {
+        // 4s segment, 0.35s fade → fade-out starts at 3.65s.
+        let g = segment_filtergraph(&SegmentOpts::default(), 4.0);
+        assert!(g.contains("fade=t=in:st=0:d=0.35"));
+        assert!(g.contains("fade=t=out:st=3.650:d=0.35"));
+    }
+
+    #[test]
+    fn filtergraph_fade_out_start_never_negative_at_floor() {
+        // A duration shorter than one fade (0.2s < 0.35s) still yields st >= 0.
+        let g = segment_filtergraph(&SegmentOpts::default(), 0.2);
+        assert!(g.contains("fade=t=out:st=0.000:d=0.35"));
     }
 
     #[test]