From 7715a7a905015faa977eb0a45c0670f022120f20 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Fri, 12 Jun 2026 23:10:26 -0400 Subject: [PATCH] Reels: portrait canvas with blurred fill, fade transitions, warmer TTS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes the "image is tiny" problem: a 1920x1080 landscape reel letterboxes to a ~25%-height band on a portrait phone. Switch to a portrait 1080x1920 canvas and fill it per photo with a blurred, zoomed copy of the image behind the sharp fitted photo — so the frame is always full regardless of the photo's orientation, with no black bars and no cropping of the subject. Add a quick 0.35s fade in/out baked into each segment so concatenated photos dip smoothly instead of hard-cutting (fade-out lands in the narration's silent tail, so speech isn't clipped). Drop the unused Ken Burns branch — motion can return deliberately later. Warm up the narration a touch: thread Chatterbox's `exaggeration` through synthesize_serialized and default reels to 0.7 (tunable via REEL_TTS_EXAGGERATION). Bump RENDER_VERSION so existing landscape reels re-render. Co-Authored-By: Claude Fable 5 --- src/ai/tts.rs | 6 ++- src/reels/mod.rs | 38 ++++++++++---- src/reels/render.rs | 123 +++++++++++++++++++++++++------------------- 3 files changed, 101 insertions(+), 66 deletions(-) diff --git a/src/ai/tts.rs b/src/ai/tts.rs index 4e7544c..a9a610a 100644 --- a/src/ai/tts.rs +++ b/src/ai/tts.rs @@ -486,11 +486,15 @@ pub async fn synthesize_serialized( text: &str, voice: Option<&str>, format: &str, + exaggeration: Option, ) -> anyhow::Result> { let prepared = prepare_for_tts(text); if prepared.is_empty() { anyhow::bail!("nothing to synthesize after cleanup"); } + // Clamp to Chatterbox's documented range, matching the HTTP handlers + // (which clamp before forwarding; this path bypasses them). 
+ let exaggeration = exaggeration.map(|x| x.clamp(0.25, 2.0)); // Queue rather than fast-fail (mirrors create_speech_job_handler). let _permit = TTS_PERMIT .acquire() @@ -500,7 +504,7 @@ pub async fn synthesize_serialized( // starts (see ai::gpu). let _gpu = crate::ai::gpu::tts_lease().await; client - .text_to_speech(&prepared, voice, format, None, None, None) + .text_to_speech(&prepared, voice, format, exaggeration, None, None) .await } diff --git a/src/reels/mod.rs b/src/reels/mod.rs index fe270f8..9956984 100644 --- a/src/reels/mod.rs +++ b/src/reels/mod.rs @@ -180,7 +180,18 @@ fn finish_job( /// Render version: bump to invalidate every cached reel after a rendering / /// scripting change that should produce a fresh result. -const RENDER_VERSION: u32 = 1; +const RENDER_VERSION: u32 = 2; + +/// Narration expressiveness — Chatterbox's `exaggeration` knob. A modest bump +/// over the ~0.5 default warms up otherwise-flat narration; tune via +/// `REEL_TTS_EXAGGERATION` (0.25–2.0). +fn reel_tts_exaggeration() -> f32 { + std::env::var("REEL_TTS_EXAGGERATION") + .ok() + .and_then(|s| s.trim().parse::<f32>().ok()) + .filter(|x| x.is_finite()) + .unwrap_or(0.7) +} /// Cache key over everything that determines *which* media and *how* it's /// voiced — but not the (non-deterministic) narration text. 
Same inputs → same @@ -470,16 +481,21 @@ async fn run_reel_job( } }; - let audio_bytes = - match crate::ai::tts::synthesize_serialized(&client, line, voice.as_deref(), "wav") - .await - { - Ok(b) => b, - Err(e) => { - log::warn!("reel {job_id}: skipping segment {i}, TTS failed: {e}"); - continue; - } - }; + let audio_bytes = match crate::ai::tts::synthesize_serialized( + &client, + line, + voice.as_deref(), + "wav", + Some(reel_tts_exaggeration()), + ) + .await + { + Ok(b) => b, + Err(e) => { + log::warn!("reel {job_id}: skipping segment {i}, TTS failed: {e}"); + continue; + } + }; let audio_path = work.path().join(format!("narration_{i:03}.wav")); if let Err(e) = tokio::fs::write(&audio_path, &audio_bytes).await { log::warn!("reel {job_id}: skipping segment {i}, writing audio failed: {e}"); diff --git a/src/reels/render.rs b/src/reels/render.rs index 9643309..e40fc3d 100644 --- a/src/reels/render.rs +++ b/src/reels/render.rs @@ -19,11 +19,13 @@ use tokio::process::Command; /// rather than depending on `video::ffmpeg` directly. pub use crate::video::ffmpeg::is_nvenc_available; -/// Reel canvas. Landscape matches the majority of camera photos; portrait -/// shots are letterboxed by the `pad` in [`segment_filter`] rather than -/// cropped, so faces never get cut off. -pub const REEL_WIDTH: u32 = 1920; -pub const REEL_HEIGHT: u32 = 1080; +/// Reel canvas. Portrait, because reels are watched on a phone held upright — +/// a landscape canvas letterboxes to a thin ~25%-height band there. Each photo +/// is fitted sharp and centered over a blurred, zoomed copy of itself (see +/// [`segment_filtergraph`]) so the frame is always filled regardless of the +/// photo's orientation, without cropping the subject. 
+pub const REEL_WIDTH: u32 = 1080; +pub const REEL_HEIGHT: u32 = 1920; pub const REEL_FPS: u32 = 30; /// A still's screen time is its narration length plus a short breath, with a @@ -33,6 +35,11 @@ pub const REEL_FPS: u32 = 30; pub const MIN_SEGMENT_SECONDS: f64 = 2.5; const NARRATION_TAIL_SECONDS: f64 = 0.6; +/// Quick fade in/out baked into each segment so concatenated photos dip +/// smoothly instead of hard-cutting. The fade-out lands inside the narration's +/// silent tail, so speech is never clipped. +const FADE_SECONDS: f64 = 0.35; + /// Screen time for a photo segment given its narration audio length. pub fn segment_duration(narration_secs: f64) -> f64 { let d = narration_secs + NARRATION_TAIL_SECONDS; @@ -43,16 +50,13 @@ pub fn segment_duration(narration_secs: f64) -> f64 { } } -/// Options controlling per-segment rendering. `ken_burns` adds a slow zoom for -/// motion; it's defaulted off until the effect is eyeballed on the GPU box, -/// since a wrong zoompan expression reads as jitter and can't be verified here. +/// Options controlling per-segment rendering. #[derive(Debug, Clone, Copy)] pub struct SegmentOpts { pub width: u32, pub height: u32, pub fps: u32, pub nvenc: bool, - pub ken_burns: bool, } impl Default for SegmentOpts { @@ -62,35 +66,38 @@ impl Default for SegmentOpts { height: REEL_HEIGHT, fps: REEL_FPS, nvenc: false, - ken_burns: false, } } } -/// Video filter for a photo segment: fit the image inside the canvas -/// (preserving aspect, padding the rest), normalize SAR/fps/pixel format, and -/// optionally apply a gentle Ken Burns zoom. -pub fn segment_filter(opts: &SegmentOpts, duration: f64) -> String { +/// Full `filter_complex` for one photo segment, producing labelled `[v]` (video) +/// and `[a]` (audio) outputs. Input 0 is the looped still, input 1 the +/// narration. +/// +/// Video: split the still into a background and foreground. 
The background is +/// scaled to *cover* the canvas and heavily blurred; the foreground is scaled to +/// *fit* inside it and overlaid centered. This fills the portrait frame for any +/// photo orientation — no black bars, no cropping of the subject — then a quick +/// fade in/out softens the cut to the next segment. +/// +/// Audio: pad the narration with trailing silence so a short line doesn't end +/// the segment early; `-t` bounds it to the segment duration. +pub fn segment_filtergraph(opts: &SegmentOpts, duration: f64) -> String { let (w, h, fps) = (opts.width, opts.height, opts.fps); - if opts.ken_burns { - // Upscale first so zoompan samples from a larger frame (avoids - // shimmer), drift the zoom from 1.0→~1.12 across the segment, hold the - // crop centered, then settle to the canvas. - let frames = (duration * fps as f64).round().max(1.0) as u64; - format!( - "scale={w}*2:{h}*2:force_original_aspect_ratio=increase,\ - crop={w}*2:{h}*2,\ - zoompan=z='min(zoom+0.0009,1.12)':d={frames}:\ - x='iw/2-(iw/zoom/2)':y='ih/2-(ih/zoom/2)':s={w}x{h}:fps={fps},\ - setsar=1,format=yuv420p" - ) - } else { - format!( - "scale={w}:{h}:force_original_aspect_ratio=decrease,\ - pad={w}:{h}:(ow-iw)/2:(oh-ih)/2,\ - setsar=1,fps={fps},format=yuv420p" - ) - } + // Fade-out begins one fade-length before the end; clamp so a floor-length + // segment still gets a valid (non-negative) start time. 
+ let fade_out_start = (duration - FADE_SECONDS).max(0.0); + format!( + "[0:v]split=2[bg][fg];\ + [bg]scale={w}:{h}:force_original_aspect_ratio=increase,\ + crop={w}:{h},boxblur=20:2[bgb];\ + [fg]scale={w}:{h}:force_original_aspect_ratio=decrease[fgs];\ + [bgb][fgs]overlay=(W-w)/2:(H-h)/2,\ + fade=t=in:st=0:d={FADE_SECONDS},\ + fade=t=out:st={fade_out_start:.3}:d={FADE_SECONDS},\ + setsar=1,fps={fps},format=yuv420p[v];\ + [1:a]apad[a]" + ) } fn video_encoder_args(nvenc: bool) -> Vec { @@ -117,9 +124,9 @@ fn video_encoder_args(nvenc: bool) -> Vec { } /// Build the ffmpeg args that render one photo segment: a still looped for -/// `duration` seconds with its narration muxed in. The narration is padded -/// with trailing silence (`apad`) so short lines don't end the segment early; -/// `-t` bounds both streams to the segment length. +/// `duration` seconds, filled to the portrait canvas with a blurred backdrop +/// (see [`segment_filtergraph`]) and the narration muxed in. `-t` bounds both +/// streams to the segment length. pub fn build_segment_args( image_path: &str, audio_path: &str, @@ -139,7 +146,7 @@ pub fn build_segment_args( "-i".into(), audio_path.into(), "-filter_complex".into(), - format!("[0:v]{}[v];[1:a]apad[a]", segment_filter(opts, duration)), + segment_filtergraph(opts, duration), "-map".into(), "[v]".into(), "-map".into(), @@ -267,26 +274,34 @@ mod tests { } #[test] - fn static_filter_fits_and_pads_without_cropping() { - let f = segment_filter(&SegmentOpts::default(), 4.0); - assert!(f.contains("force_original_aspect_ratio=decrease")); - assert!(f.contains("pad=1920:1080")); - assert!(f.contains("format=yuv420p")); - // No zoompan when ken_burns is off. - assert!(!f.contains("zoompan")); + fn filtergraph_fills_portrait_with_blurred_bg_and_fitted_fg() { + let g = segment_filtergraph(&SegmentOpts::default(), 4.0); + // Background covers + blurs; foreground fits and is centered over it. 
+ assert!(g.contains("split=2[bg][fg]")); + assert!(g.contains("scale=1080:1920:force_original_aspect_ratio=increase")); + assert!(g.contains("crop=1080:1920")); + assert!(g.contains("boxblur")); + assert!(g.contains("scale=1080:1920:force_original_aspect_ratio=decrease")); + assert!(g.contains("overlay=(W-w)/2:(H-h)/2")); + // Produces the labelled outputs build_segment_args maps. + assert!(g.contains("[v]")); + assert!(g.contains("[1:a]apad[a]")); + assert!(g.contains("format=yuv420p")); } #[test] - fn ken_burns_filter_uses_duration_scaled_frame_count() { - let opts = SegmentOpts { - ken_burns: true, - ..SegmentOpts::default() - }; - // 4s * 30fps = 120 frames in the zoompan d= term. - let f = segment_filter(&opts, 4.0); - assert!(f.contains("zoompan")); - assert!(f.contains("d=120:")); - assert!(f.contains("s=1920x1080")); + fn filtergraph_fades_in_and_out_within_duration() { + // 4s segment, 0.35s fade → fade-out starts at 3.65s. + let g = segment_filtergraph(&SegmentOpts::default(), 4.0); + assert!(g.contains("fade=t=in:st=0:d=0.35")); + assert!(g.contains("fade=t=out:st=3.650:d=0.35")); + } + + #[test] + fn filtergraph_fade_out_start_never_negative_at_floor() { + // A floor-length segment shorter than a fade still yields st >= 0. + let g = segment_filtergraph(&SegmentOpts::default(), 0.2); + assert!(g.contains("fade=t=out:st=0.000:d=0.35")); } #[test]