Reels: portrait canvas with blurred fill, fade transitions, warmer TTS
Fixes the "image is tiny" problem: a 1920x1080 landscape reel letterboxes to a ~25%-height band on a portrait phone. Switch to a portrait 1080x1920 canvas and fill it per photo with a blurred, zoomed copy of the image behind the sharp fitted photo — so the frame is always full regardless of the photo's orientation, with no black bars and no cropping of the subject. Add a quick 0.35s fade in/out baked into each segment so concatenated photos dip smoothly instead of hard-cutting (fade-out lands in the narration's silent tail, so speech isn't clipped). Drop the unused Ken Burns branch — motion can return deliberately later. Warm up the narration a touch: thread Chatterbox's `exaggeration` through synthesize_serialized and default reels to 0.7 (tunable via REEL_TTS_EXAGGERATION). Bump RENDER_VERSION so existing landscape reels re-render. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
+5
-1
@@ -486,11 +486,15 @@ pub async fn synthesize_serialized(
|
|||||||
text: &str,
|
text: &str,
|
||||||
voice: Option<&str>,
|
voice: Option<&str>,
|
||||||
format: &str,
|
format: &str,
|
||||||
|
exaggeration: Option<f32>,
|
||||||
) -> anyhow::Result<Vec<u8>> {
|
) -> anyhow::Result<Vec<u8>> {
|
||||||
let prepared = prepare_for_tts(text);
|
let prepared = prepare_for_tts(text);
|
||||||
if prepared.is_empty() {
|
if prepared.is_empty() {
|
||||||
anyhow::bail!("nothing to synthesize after cleanup");
|
anyhow::bail!("nothing to synthesize after cleanup");
|
||||||
}
|
}
|
||||||
|
// Clamp to Chatterbox's documented range, matching the HTTP handlers
|
||||||
|
// (which clamp before forwarding; this path bypasses them).
|
||||||
|
let exaggeration = exaggeration.map(|x| x.clamp(0.25, 2.0));
|
||||||
// Queue rather than fast-fail (mirrors create_speech_job_handler).
|
// Queue rather than fast-fail (mirrors create_speech_job_handler).
|
||||||
let _permit = TTS_PERMIT
|
let _permit = TTS_PERMIT
|
||||||
.acquire()
|
.acquire()
|
||||||
@@ -500,7 +504,7 @@ pub async fn synthesize_serialized(
|
|||||||
// starts (see ai::gpu).
|
// starts (see ai::gpu).
|
||||||
let _gpu = crate::ai::gpu::tts_lease().await;
|
let _gpu = crate::ai::gpu::tts_lease().await;
|
||||||
client
|
client
|
||||||
.text_to_speech(&prepared, voice, format, None, None, None)
|
.text_to_speech(&prepared, voice, format, exaggeration, None, None)
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
+19
-3
@@ -180,7 +180,18 @@ fn finish_job(
|
|||||||
|
|
||||||
/// Render version: bump to invalidate every cached reel after a rendering /
|
/// Render version: bump to invalidate every cached reel after a rendering /
|
||||||
/// scripting change that should produce a fresh result.
|
/// scripting change that should produce a fresh result.
|
||||||
const RENDER_VERSION: u32 = 1;
|
const RENDER_VERSION: u32 = 2;
|
||||||
|
|
||||||
|
/// Narration expressiveness — Chatterbox's `exaggeration` knob. A modest bump
|
||||||
|
/// over the ~0.5 default warms up otherwise-flat narration; tune via
|
||||||
|
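/// `REEL_TTS_EXAGGERATION` (0.25–2.0; out-of-range values are clamped by `synthesize_serialized`). Falls back to 0.7 when the variable is unset, unparsable, or non-finite.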
|
||||||
|
fn reel_tts_exaggeration() -> f32 {
|
||||||
|
std::env::var("REEL_TTS_EXAGGERATION")
|
||||||
|
.ok()
|
||||||
|
.and_then(|s| s.trim().parse::<f32>().ok())
|
||||||
|
.filter(|x| x.is_finite())
|
||||||
|
.unwrap_or(0.7)
|
||||||
|
}
|
||||||
|
|
||||||
/// Cache key over everything that determines *which* media and *how* it's
|
/// Cache key over everything that determines *which* media and *how* it's
|
||||||
/// voiced — but not the (non-deterministic) narration text. Same inputs → same
|
/// voiced — but not the (non-deterministic) narration text. Same inputs → same
|
||||||
@@ -470,8 +481,13 @@ async fn run_reel_job(
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let audio_bytes =
|
let audio_bytes = match crate::ai::tts::synthesize_serialized(
|
||||||
match crate::ai::tts::synthesize_serialized(&client, line, voice.as_deref(), "wav")
|
&client,
|
||||||
|
line,
|
||||||
|
voice.as_deref(),
|
||||||
|
"wav",
|
||||||
|
Some(reel_tts_exaggeration()),
|
||||||
|
)
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
Ok(b) => b,
|
Ok(b) => b,
|
||||||
|
|||||||
+67
-52
@@ -19,11 +19,13 @@ use tokio::process::Command;
|
|||||||
/// rather than depending on `video::ffmpeg` directly.
|
/// rather than depending on `video::ffmpeg` directly.
|
||||||
pub use crate::video::ffmpeg::is_nvenc_available;
|
pub use crate::video::ffmpeg::is_nvenc_available;
|
||||||
|
|
||||||
/// Reel canvas. Landscape matches the majority of camera photos; portrait
|
/// Reel canvas. Portrait, because reels are watched on a phone held upright —
|
||||||
/// shots are letterboxed by the `pad` in [`segment_filter`] rather than
|
/// a landscape canvas letterboxes to a thin ~25%-height band there. Each photo
|
||||||
/// cropped, so faces never get cut off.
|
/// is fitted sharp and centered over a blurred, zoomed copy of itself (see
|
||||||
pub const REEL_WIDTH: u32 = 1920;
|
/// [`segment_filtergraph`]) so the frame is always filled regardless of the
|
||||||
pub const REEL_HEIGHT: u32 = 1080;
|
/// photo's orientation, without cropping the subject.
|
||||||
|
pub const REEL_WIDTH: u32 = 1080;
|
||||||
|
pub const REEL_HEIGHT: u32 = 1920;
|
||||||
pub const REEL_FPS: u32 = 30;
|
pub const REEL_FPS: u32 = 30;
|
||||||
|
|
||||||
/// A still's screen time is its narration length plus a short breath, with a
|
/// A still's screen time is its narration length plus a short breath, with a
|
||||||
@@ -33,6 +35,11 @@ pub const REEL_FPS: u32 = 30;
|
|||||||
pub const MIN_SEGMENT_SECONDS: f64 = 2.5;
|
pub const MIN_SEGMENT_SECONDS: f64 = 2.5;
|
||||||
const NARRATION_TAIL_SECONDS: f64 = 0.6;
|
const NARRATION_TAIL_SECONDS: f64 = 0.6;
|
||||||
|
|
||||||
|
/// Quick fade in/out baked into each segment so concatenated photos dip
|
||||||
|
/// smoothly instead of hard-cutting. The fade is video-only and shorter than
|
||||||
|
/// `NARRATION_TAIL_SECONDS`, so the picture only dims after speech has ended.
|
||||||
|
const FADE_SECONDS: f64 = 0.35;
|
||||||
|
|
||||||
/// Screen time for a photo segment given its narration audio length.
|
/// Screen time for a photo segment given its narration audio length.
|
||||||
pub fn segment_duration(narration_secs: f64) -> f64 {
|
pub fn segment_duration(narration_secs: f64) -> f64 {
|
||||||
let d = narration_secs + NARRATION_TAIL_SECONDS;
|
let d = narration_secs + NARRATION_TAIL_SECONDS;
|
||||||
@@ -43,16 +50,13 @@ pub fn segment_duration(narration_secs: f64) -> f64 {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Options controlling per-segment rendering. `ken_burns` adds a slow zoom for
|
/// Options controlling per-segment rendering.
|
||||||
/// motion; it's defaulted off until the effect is eyeballed on the GPU box,
|
|
||||||
/// since a wrong zoompan expression reads as jitter and can't be verified here.
|
|
||||||
#[derive(Debug, Clone, Copy)]
|
#[derive(Debug, Clone, Copy)]
|
||||||
pub struct SegmentOpts {
|
pub struct SegmentOpts {
|
||||||
pub width: u32,
|
pub width: u32,
|
||||||
pub height: u32,
|
pub height: u32,
|
||||||
pub fps: u32,
|
pub fps: u32,
|
||||||
pub nvenc: bool,
|
pub nvenc: bool,
|
||||||
pub ken_burns: bool,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
impl Default for SegmentOpts {
|
impl Default for SegmentOpts {
|
||||||
@@ -62,35 +66,38 @@ impl Default for SegmentOpts {
|
|||||||
height: REEL_HEIGHT,
|
height: REEL_HEIGHT,
|
||||||
fps: REEL_FPS,
|
fps: REEL_FPS,
|
||||||
nvenc: false,
|
nvenc: false,
|
||||||
ken_burns: false,
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Video filter for a photo segment: fit the image inside the canvas
|
/// Full `filter_complex` for one photo segment, producing labelled `[v]` (video)
|
||||||
/// (preserving aspect, padding the rest), normalize SAR/fps/pixel format, and
|
/// and `[a]` (audio) outputs. Input 0 is the looped still, input 1 the
|
||||||
/// optionally apply a gentle Ken Burns zoom.
|
/// narration.
|
||||||
pub fn segment_filter(opts: &SegmentOpts, duration: f64) -> String {
|
///
|
||||||
|
/// Video: split the still into a background and foreground. The background is
|
||||||
|
/// scaled to *cover* the canvas and heavily blurred; the foreground is scaled to
|
||||||
|
/// *fit* inside it and overlaid centered. This fills the portrait frame for any
|
||||||
|
/// photo orientation — no black bars, no cropping of the subject — then a quick
|
||||||
|
/// fade in/out softens the cut to the next segment.
|
||||||
|
///
|
||||||
|
/// Audio: pad the narration with trailing silence so a short line doesn't end
|
||||||
|
/// the segment early; `-t` bounds it to the segment duration.
|
||||||
|
pub fn segment_filtergraph(opts: &SegmentOpts, duration: f64) -> String {
|
||||||
let (w, h, fps) = (opts.width, opts.height, opts.fps);
|
let (w, h, fps) = (opts.width, opts.height, opts.fps);
|
||||||
if opts.ken_burns {
|
// Fade-out begins one fade-length before the end; clamp at zero so a duration
|
||||||
// Upscale first so zoompan samples from a larger frame (avoids
|
// shorter than one fade still gets a valid (non-negative) start time.
|
||||||
// shimmer), drift the zoom from 1.0→~1.12 across the segment, hold the
|
let fade_out_start = (duration - FADE_SECONDS).max(0.0);
|
||||||
// crop centered, then settle to the canvas.
|
|
||||||
let frames = (duration * fps as f64).round().max(1.0) as u64;
|
|
||||||
format!(
|
format!(
|
||||||
"scale={w}*2:{h}*2:force_original_aspect_ratio=increase,\
|
"[0:v]split=2[bg][fg];\
|
||||||
crop={w}*2:{h}*2,\
|
[bg]scale={w}:{h}:force_original_aspect_ratio=increase,\
|
||||||
zoompan=z='min(zoom+0.0009,1.12)':d={frames}:\
|
crop={w}:{h},boxblur=20:2[bgb];\
|
||||||
x='iw/2-(iw/zoom/2)':y='ih/2-(ih/zoom/2)':s={w}x{h}:fps={fps},\
|
[fg]scale={w}:{h}:force_original_aspect_ratio=decrease[fgs];\
|
||||||
setsar=1,format=yuv420p"
|
[bgb][fgs]overlay=(W-w)/2:(H-h)/2,\
|
||||||
|
fade=t=in:st=0:d={FADE_SECONDS},\
|
||||||
|
fade=t=out:st={fade_out_start:.3}:d={FADE_SECONDS},\
|
||||||
|
setsar=1,fps={fps},format=yuv420p[v];\
|
||||||
|
[1:a]apad[a]"
|
||||||
)
|
)
|
||||||
} else {
|
|
||||||
format!(
|
|
||||||
"scale={w}:{h}:force_original_aspect_ratio=decrease,\
|
|
||||||
pad={w}:{h}:(ow-iw)/2:(oh-ih)/2,\
|
|
||||||
setsar=1,fps={fps},format=yuv420p"
|
|
||||||
)
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn video_encoder_args(nvenc: bool) -> Vec<String> {
|
fn video_encoder_args(nvenc: bool) -> Vec<String> {
|
||||||
@@ -117,9 +124,9 @@ fn video_encoder_args(nvenc: bool) -> Vec<String> {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Build the ffmpeg args that render one photo segment: a still looped for
|
/// Build the ffmpeg args that render one photo segment: a still looped for
|
||||||
/// `duration` seconds with its narration muxed in. The narration is padded
|
/// `duration` seconds, filled to the portrait canvas with a blurred backdrop
|
||||||
/// with trailing silence (`apad`) so short lines don't end the segment early;
|
/// (see [`segment_filtergraph`]) and the narration muxed in. `-t` bounds both
|
||||||
/// `-t` bounds both streams to the segment length.
|
/// streams to the segment length.
|
||||||
pub fn build_segment_args(
|
pub fn build_segment_args(
|
||||||
image_path: &str,
|
image_path: &str,
|
||||||
audio_path: &str,
|
audio_path: &str,
|
||||||
@@ -139,7 +146,7 @@ pub fn build_segment_args(
|
|||||||
"-i".into(),
|
"-i".into(),
|
||||||
audio_path.into(),
|
audio_path.into(),
|
||||||
"-filter_complex".into(),
|
"-filter_complex".into(),
|
||||||
format!("[0:v]{}[v];[1:a]apad[a]", segment_filter(opts, duration)),
|
segment_filtergraph(opts, duration),
|
||||||
"-map".into(),
|
"-map".into(),
|
||||||
"[v]".into(),
|
"[v]".into(),
|
||||||
"-map".into(),
|
"-map".into(),
|
||||||
@@ -267,26 +274,34 @@ mod tests {
|
|||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn static_filter_fits_and_pads_without_cropping() {
|
fn filtergraph_fills_portrait_with_blurred_bg_and_fitted_fg() {
|
||||||
let f = segment_filter(&SegmentOpts::default(), 4.0);
|
let g = segment_filtergraph(&SegmentOpts::default(), 4.0);
|
||||||
assert!(f.contains("force_original_aspect_ratio=decrease"));
|
// Background covers + blurs; foreground fits and is centered over it.
|
||||||
assert!(f.contains("pad=1920:1080"));
|
assert!(g.contains("split=2[bg][fg]"));
|
||||||
assert!(f.contains("format=yuv420p"));
|
assert!(g.contains("scale=1080:1920:force_original_aspect_ratio=increase"));
|
||||||
// No zoompan when ken_burns is off.
|
assert!(g.contains("crop=1080:1920"));
|
||||||
assert!(!f.contains("zoompan"));
|
assert!(g.contains("boxblur"));
|
||||||
|
assert!(g.contains("scale=1080:1920:force_original_aspect_ratio=decrease"));
|
||||||
|
assert!(g.contains("overlay=(W-w)/2:(H-h)/2"));
|
||||||
|
// Produces the labelled outputs build_segment_args maps.
|
||||||
|
assert!(g.contains("[v]"));
|
||||||
|
assert!(g.contains("[1:a]apad[a]"));
|
||||||
|
assert!(g.contains("format=yuv420p"));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn ken_burns_filter_uses_duration_scaled_frame_count() {
|
fn filtergraph_fades_in_and_out_within_duration() {
|
||||||
let opts = SegmentOpts {
|
// 4s segment, 0.35s fade → fade-out starts at 3.65s.
|
||||||
ken_burns: true,
|
let g = segment_filtergraph(&SegmentOpts::default(), 4.0);
|
||||||
..SegmentOpts::default()
|
assert!(g.contains("fade=t=in:st=0:d=0.35"));
|
||||||
};
|
assert!(g.contains("fade=t=out:st=3.650:d=0.35"));
|
||||||
// 4s * 30fps = 120 frames in the zoompan d= term.
|
}
|
||||||
let f = segment_filter(&opts, 4.0);
|
|
||||||
assert!(f.contains("zoompan"));
|
#[test]
|
||||||
assert!(f.contains("d=120:"));
|
fn filtergraph_fade_out_start_never_negative_at_floor() {
|
||||||
assert!(f.contains("s=1920x1080"));
|
// A segment shorter than one fade (0.2s < 0.35s) still yields st >= 0.
|
||||||
|
let g = segment_filtergraph(&SegmentOpts::default(), 0.2);
|
||||||
|
assert!(g.contains("fade=t=out:st=0.000:d=0.35"));
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|||||||
Reference in New Issue
Block a user