From 1dec34540d89265def1908d1de48a6fe0f9af053 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Fri, 12 Jun 2026 16:09:03 -0400 Subject: [PATCH] Add start/duration window selection for voice-clone reference clips Both voice creation endpoints (upload + from-library) now accept optional start_seconds/duration_seconds, threaded to ffmpeg as -ss/-t, so the reference window can target clean speech anywhere in a long recording instead of always the first N seconds. Duration is clamped to the LLAMA_SWAP_TTS_REF_SECONDS cap and the voice-name tag reflects the actual window length. Co-Authored-By: Claude Fable 5 --- src/ai/tts.rs | 181 ++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 147 insertions(+), 34 deletions(-) diff --git a/src/ai/tts.rs b/src/ai/tts.rs index a0b9bd2..415dcbf 100644 --- a/src/ai/tts.rs +++ b/src/ai/tts.rs @@ -193,6 +193,27 @@ fn append_ref_seconds(name: &str, secs: u32) -> String { format!("{base}{suffix}") } +/// Resolve a caller-supplied reference window into concrete `(start, duration)` +/// seconds for ffmpeg. Start defaults to 0; duration defaults to the +/// `tts_ref_seconds` cap and is clamped to it (the cap is the most audio the +/// TTS backend benefits from, so longer requests are quietly bounded rather +/// than rejected). Non-finite or negative values are the caller's bug → Err. +fn resolve_ref_window( + start_seconds: Option, + duration_seconds: Option, +) -> Result<(f64, f64), String> { + let cap = f64::from(tts_ref_seconds()); + let start = start_seconds.unwrap_or(0.0); + if !start.is_finite() || start < 0.0 { + return Err("start_seconds must be a non-negative number".to_string()); + } + let duration = duration_seconds.unwrap_or(cap); + if !duration.is_finite() || duration <= 0.0 { + return Err("duration_seconds must be a positive number".to_string()); + } + Ok((start, duration.min(cap))) +} + /// Optional default voice for synthesis when the request doesn't name one. /// Set `LLAMA_SWAP_TTS_VOICE=m` to read insights in a cloned voice by default. fn default_voice() -> Option { @@ -265,24 +286,31 @@ fn prepare_for_tts(input: &str) -> String { /// Decode an audio/video file to mono 24 kHz WAV via ffmpeg, returning the WAV /// bytes. Chatterbox validates the reference clip by file *extension* and /// rejects several formats (e.g. `.aac`, `.opus`), so we always normalize to -/// WAV regardless of the source container. Capped at 30s — references only need -/// a few seconds of clean speech. -async fn run_ffmpeg_to_wav(input_path: &str) -> anyhow::Result> { +/// WAV regardless of the source container. Extracts `duration` seconds starting +/// at `start` (see resolve_ref_window) — references only need a few seconds of +/// clean speech, which may sit anywhere in a long recording. +async fn run_ffmpeg_to_wav(input_path: &str, start: f64, duration: f64) -> anyhow::Result> { let out = tempfile::Builder::new() .suffix(".wav") .tempfile() .context("creating temp wav")?; let out_s = out.path().to_string_lossy().to_string(); - // Cap the reference clip length — we use the first N seconds (see - // tts_ref_seconds). - let secs = tts_ref_seconds().to_string(); + let start_s = format!("{start}"); + let secs = format!("{duration}"); + + // -ss before -i is input seeking: fast, and frame accuracy doesn't matter + // for picking a speech window. + let mut args: Vec<&str> = vec!["-y"]; + if start > 0.0 { + args.extend(["-ss", &start_s]); + } + args.extend([ + "-i", input_path, "-vn", "-ac", "1", "-ar", "24000", "-t", &secs, "-f", "wav", &out_s, + ]); let output = tokio::process::Command::new("ffmpeg") - .args([ - "-y", "-i", input_path, "-vn", "-ac", "1", "-ar", "24000", "-t", &secs, "-f", "wav", - &out_s, - ]) + .args(&args) .output() .await .context("spawning ffmpeg")?; @@ -295,7 +323,12 @@ async fn run_ffmpeg_to_wav(input_path: &str) -> anyhow::Result> { /// Normalize in-memory upload bytes to WAV: write to a temp file (keeping the /// source extension as an ffmpeg probe hint) then transcode. -async fn transcode_bytes_to_wav(input: &[u8], src_ext: Option<&str>) -> anyhow::Result> { +async fn transcode_bytes_to_wav( + input: &[u8], + src_ext: Option<&str>, + start: f64, + duration: f64, +) -> anyhow::Result> { let suffix = src_ext .filter(|e| !e.is_empty()) .map(|e| format!(".{e}")) @@ -305,7 +338,7 @@ async fn transcode_bytes_to_wav(input: &[u8], src_ext: Option<&str>) -> anyhow:: .tempfile() .context("creating temp input")?; std::fs::write(in_tmp.path(), input).context("writing temp input")?; - run_ffmpeg_to_wav(&in_tmp.path().to_string_lossy()).await + run_ffmpeg_to_wav(&in_tmp.path().to_string_lossy(), start, duration).await } #[derive(Debug, Deserialize)] @@ -751,7 +784,9 @@ pub async fn delete_voice_handler( } /// POST /tts/voices/upload — register a cloned voice from an uploaded audio -/// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`). +/// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`), +/// plus optional `start_seconds` / `duration_seconds` (text) selecting which +/// window of a longer recording becomes the reference clip. #[post("/tts/voices/upload")] pub async fn create_voice_upload_handler( http_request: HttpRequest, @@ -769,6 +804,8 @@ pub async fn create_voice_upload_handler( }; let mut voice_name: Option = None; + let mut start_field: Option = None; + let mut duration_field: Option = None; let mut file_bytes = BytesMut::new(); let mut filename = "voice.wav".to_string(); @@ -793,25 +830,57 @@ pub async fn create_voice_upload_handler( } file_bytes.put(data); } - } else if name_opt.as_deref() == Some("voice_name") { + } else if matches!( + name_opt.as_deref(), + Some("voice_name" | "start_seconds" | "duration_seconds") + ) { + let field = name_opt.as_deref().unwrap().to_string(); let mut buf = BytesMut::new(); while let Some(Ok(data)) = part.next().await { buf.put(data); } - voice_name = Some(String::from_utf8_lossy(&buf).trim().to_string()); + let text = String::from_utf8_lossy(&buf).trim().to_string(); + match field.as_str() { + "voice_name" => voice_name = Some(text), + "start_seconds" => start_field = Some(text), + _ => duration_field = Some(text), + } } else { while let Some(Ok(_)) = part.next().await {} } } + // Empty text parts are treated as absent; anything else must parse, so a + // client bug ("abc") fails loudly instead of silently cloning from 0s. + let parse_secs = |field: Option<&String>, name: &str| -> Result, String> { + match field.map(|s| s.as_str()).filter(|s| !s.is_empty()) { + None => Ok(None), + Some(s) => s + .parse::() + .map(Some) + .map_err(|_| format!("{name} must be a number of seconds")), + } + }; + let window = parse_secs(start_field.as_ref(), "start_seconds").and_then(|start| { + parse_secs(duration_field.as_ref(), "duration_seconds") + .and_then(|dur| resolve_ref_window(start, dur)) + }); + let (ref_start, ref_duration) = match window { + Ok(w) => w, + Err(msg) => { + span.set_status(Status::error("invalid reference window")); + return HttpResponse::BadRequest().json(json!({ "error": msg })); + } + }; + let Some(name) = voice_name.as_deref().and_then(sanitize_voice_name) else { span.set_status(Status::error("voice_name is required")); return HttpResponse::BadRequest() .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" })); }; - // Tag the name with the ref-clip cap (e.g. `grandma-30s`) so the library - // shows which reference length produced each clone. - let name = append_ref_seconds(&name, tts_ref_seconds()); + // Tag the name with the ref-clip length (e.g. `grandma-30s`) so the + // library shows which reference length produced each clone. + let name = append_ref_seconds(&name, ref_duration.round().max(1.0) as u32); if file_bytes.is_empty() { span.set_status(Status::error("voice_file is required")); return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" })); @@ -822,15 +891,16 @@ pub async fn create_voice_upload_handler( // Normalize to WAV so any device format (e.g. .aac / .opus, which Chatterbox // rejects by extension) is accepted. let src_ext = Path::new(&filename).extension().and_then(|e| e.to_str()); - let wav = match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext).await { - Ok(w) => w, - Err(e) => { - span.set_status(Status::error("audio decode failed")); - log::error!("voice upload transcode failed: {:?}", e); - return HttpResponse::BadRequest() - .json(json!({ "error": "couldn't decode that audio file" })); - } - }; + let wav = + match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext, ref_start, ref_duration).await { + Ok(w) => w, + Err(e) => { + span.set_status(Status::error("audio decode failed")); + log::error!("voice upload transcode failed: {:?}", e); + return HttpResponse::BadRequest() + .json(json!({ "error": "couldn't decode that audio file" })); + } + }; match client .create_voice(&name, wav, "reference.wav", "audio/wav") @@ -856,11 +926,19 @@ pub struct CreateVoiceFromLibraryRequest { pub path: String, #[serde(default)] pub library: Option, + /// Offset into the source where the reference window begins (default 0) — + /// lets the client pick the clean-speech section of a long recording. + #[serde(default)] + pub start_seconds: Option, + /// Reference window length; clamped to LLAMA_SWAP_TTS_REF_SECONDS. + #[serde(default)] + pub duration_seconds: Option, } /// POST /tts/voices/from-library — register a cloned voice from a file already /// in a library. Audio and video alike are ffmpeg-normalized to a mono 24 kHz -/// WAV reference clip (length capped by LLAMA_SWAP_TTS_REF_SECONDS). +/// WAV reference clip (window selected by start/duration_seconds, length +/// capped by LLAMA_SWAP_TTS_REF_SECONDS). #[post("/tts/voices/from-library")] pub async fn create_voice_from_library_handler( http_request: HttpRequest, @@ -882,9 +960,17 @@ pub async fn create_voice_from_library_handler( return HttpResponse::BadRequest() .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" })); }; - // Tag the name with the ref-clip cap (e.g. `grandma-30s`) so the library - // shows which reference length produced each clone. - let voice_name = append_ref_seconds(&voice_name, tts_ref_seconds()); + let (ref_start, ref_duration) = + match resolve_ref_window(req.start_seconds, req.duration_seconds) { + Ok(w) => w, + Err(msg) => { + span.set_status(Status::error("invalid reference window")); + return HttpResponse::BadRequest().json(json!({ "error": msg })); + } + }; + // Tag the name with the ref-clip length (e.g. `grandma-30s`) so the + // library shows which reference length produced each clone. + let voice_name = append_ref_seconds(&voice_name, ref_duration.round().max(1.0) as u32); let library = match libraries::resolve_library_param(&app_state, req.library.as_deref()) { Ok(Some(l)) => l, @@ -913,7 +999,7 @@ pub async fn create_voice_from_library_handler( } span.set_attribute(KeyValue::new("tts.voice_name", voice_name.clone())); - let wav = match prepare_reference_audio(&abs).await { + let wav = match prepare_reference_audio(&abs, ref_start, ref_duration).await { Ok(b) => b, Err(e) => { span.set_status(Status::error("audio decode failed")); @@ -943,8 +1029,8 @@ pub async fn create_voice_from_library_handler( /// Read a library file (audio or video) as a Chatterbox-ready reference: ffmpeg /// decodes/extracts its audio to mono 24 kHz WAV. Reading straight from the /// library path avoids slurping a (possibly large) video into memory. -async fn prepare_reference_audio(abs: &Path) -> anyhow::Result> { - run_ffmpeg_to_wav(&abs.to_string_lossy()).await +async fn prepare_reference_audio(abs: &Path, start: f64, duration: f64) -> anyhow::Result> { + run_ffmpeg_to_wav(&abs.to_string_lossy(), start, duration).await } #[cfg(test)] @@ -1009,6 +1095,33 @@ mod tests { assert!(tagged.ends_with("-30s")); } + #[test] + fn resolve_ref_window_defaults_to_start_of_clip_at_cap_length() { + // Reads the live cap rather than mutating LLAMA_SWAP_TTS_REF_SECONDS: + // env mutation flakes under the parallel suite (see env_dispatch). + let cap = f64::from(tts_ref_seconds()); + assert_eq!(resolve_ref_window(None, None), Ok((0.0, cap))); + } + + #[test] + fn resolve_ref_window_accepts_offset_and_clamps_duration() { + let cap = f64::from(tts_ref_seconds()); + assert_eq!(resolve_ref_window(Some(92.5), None), Ok((92.5, cap))); + assert_eq!(resolve_ref_window(Some(10.0), Some(12.0)), Ok((10.0, 12.0))); + // Longer-than-cap windows are bounded, not rejected. + assert_eq!(resolve_ref_window(None, Some(cap + 100.0)), Ok((0.0, cap))); + } + + #[test] + fn resolve_ref_window_rejects_garbage() { + assert!(resolve_ref_window(Some(-1.0), None).is_err()); + assert!(resolve_ref_window(Some(f64::NAN), None).is_err()); + assert!(resolve_ref_window(Some(f64::INFINITY), None).is_err()); + assert!(resolve_ref_window(None, Some(0.0)).is_err()); + assert!(resolve_ref_window(None, Some(-5.0)).is_err()); + assert!(resolve_ref_window(None, Some(f64::NAN)).is_err()); + } + #[test] fn sweep_drops_expired_results_and_keeps_live_jobs() { let now = Instant::now();