Add start/duration window selection for voice-clone reference clips

Both voice creation endpoints (upload + from-library) now accept optional
start_seconds/duration_seconds, threaded to ffmpeg as -ss/-t, so the
reference window can target clean speech anywhere in a long recording
instead of always the first N seconds. Duration is clamped to the
LLAMA_SWAP_TTS_REF_SECONDS cap and the voice-name tag reflects the
actual window length.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
Cameron Cordes
2026-06-12 16:09:03 -04:00
parent 2e0f78aa1b
commit 1dec34540d
+139 -26
View File
@@ -193,6 +193,27 @@ fn append_ref_seconds(name: &str, secs: u32) -> String {
format!("{base}{suffix}") format!("{base}{suffix}")
} }
/// Turn an optional, caller-provided reference window into the concrete
/// `(start, duration)` pair (in seconds) that is handed to ffmpeg.
///
/// * `start_seconds` — offset into the source; `None` means 0.
/// * `duration_seconds` — window length; `None` means the `tts_ref_seconds`
///   cap. Values above the cap are silently reduced to it rather than
///   rejected.
///
/// # Errors
///
/// Returns a human-readable message when the start is negative or non-finite,
/// or when the duration is non-positive or non-finite. The start is validated
/// before the duration, so a request with both wrong reports the start.
fn resolve_ref_window(
    start_seconds: Option<f64>,
    duration_seconds: Option<f64>,
) -> Result<(f64, f64), String> {
    let max_len = f64::from(tts_ref_seconds());

    let offset = match start_seconds {
        None => 0.0,
        Some(s) if s.is_finite() && s >= 0.0 => s,
        Some(_) => return Err(String::from("start_seconds must be a non-negative number")),
    };

    // The fallback (the cap itself) goes through the same positivity check as
    // a caller-supplied value.
    let length = match duration_seconds {
        Some(d) => d,
        None => max_len,
    };
    if !(length.is_finite() && length > 0.0) {
        return Err(String::from("duration_seconds must be a positive number"));
    }

    Ok((offset, length.min(max_len)))
}
/// Optional default voice for synthesis when the request doesn't name one. /// Optional default voice for synthesis when the request doesn't name one.
/// Set `LLAMA_SWAP_TTS_VOICE=m` to read insights in a cloned voice by default. /// Set `LLAMA_SWAP_TTS_VOICE=m` to read insights in a cloned voice by default.
fn default_voice() -> Option<String> { fn default_voice() -> Option<String> {
@@ -265,24 +286,31 @@ fn prepare_for_tts(input: &str) -> String {
/// Decode an audio/video file to mono 24 kHz WAV via ffmpeg, returning the WAV /// Decode an audio/video file to mono 24 kHz WAV via ffmpeg, returning the WAV
/// bytes. Chatterbox validates the reference clip by file *extension* and /// bytes. Chatterbox validates the reference clip by file *extension* and
/// rejects several formats (e.g. `.aac`, `.opus`), so we always normalize to /// rejects several formats (e.g. `.aac`, `.opus`), so we always normalize to
/// WAV regardless of the source container. Capped at 30s — references only need /// WAV regardless of the source container. Extracts `duration` seconds starting
/// a few seconds of clean speech. /// at `start` (see resolve_ref_window) — references only need a few seconds of
async fn run_ffmpeg_to_wav(input_path: &str) -> anyhow::Result<Vec<u8>> { /// clean speech, which may sit anywhere in a long recording.
async fn run_ffmpeg_to_wav(input_path: &str, start: f64, duration: f64) -> anyhow::Result<Vec<u8>> {
let out = tempfile::Builder::new() let out = tempfile::Builder::new()
.suffix(".wav") .suffix(".wav")
.tempfile() .tempfile()
.context("creating temp wav")?; .context("creating temp wav")?;
let out_s = out.path().to_string_lossy().to_string(); let out_s = out.path().to_string_lossy().to_string();
// Cap the reference clip length — we use the first N seconds (see let start_s = format!("{start}");
// tts_ref_seconds). let secs = format!("{duration}");
let secs = tts_ref_seconds().to_string();
// -ss before -i is input seeking: fast, and frame accuracy doesn't matter
// for picking a speech window.
let mut args: Vec<&str> = vec!["-y"];
if start > 0.0 {
args.extend(["-ss", &start_s]);
}
args.extend([
"-i", input_path, "-vn", "-ac", "1", "-ar", "24000", "-t", &secs, "-f", "wav", &out_s,
]);
let output = tokio::process::Command::new("ffmpeg") let output = tokio::process::Command::new("ffmpeg")
.args([ .args(&args)
"-y", "-i", input_path, "-vn", "-ac", "1", "-ar", "24000", "-t", &secs, "-f", "wav",
&out_s,
])
.output() .output()
.await .await
.context("spawning ffmpeg")?; .context("spawning ffmpeg")?;
@@ -295,7 +323,12 @@ async fn run_ffmpeg_to_wav(input_path: &str) -> anyhow::Result<Vec<u8>> {
/// Normalize in-memory upload bytes to WAV: write to a temp file (keeping the /// Normalize in-memory upload bytes to WAV: write to a temp file (keeping the
/// source extension as an ffmpeg probe hint) then transcode. /// source extension as an ffmpeg probe hint) then transcode.
async fn transcode_bytes_to_wav(input: &[u8], src_ext: Option<&str>) -> anyhow::Result<Vec<u8>> { async fn transcode_bytes_to_wav(
input: &[u8],
src_ext: Option<&str>,
start: f64,
duration: f64,
) -> anyhow::Result<Vec<u8>> {
let suffix = src_ext let suffix = src_ext
.filter(|e| !e.is_empty()) .filter(|e| !e.is_empty())
.map(|e| format!(".{e}")) .map(|e| format!(".{e}"))
@@ -305,7 +338,7 @@ async fn transcode_bytes_to_wav(input: &[u8], src_ext: Option<&str>) -> anyhow::
.tempfile() .tempfile()
.context("creating temp input")?; .context("creating temp input")?;
std::fs::write(in_tmp.path(), input).context("writing temp input")?; std::fs::write(in_tmp.path(), input).context("writing temp input")?;
run_ffmpeg_to_wav(&in_tmp.path().to_string_lossy()).await run_ffmpeg_to_wav(&in_tmp.path().to_string_lossy(), start, duration).await
} }
#[derive(Debug, Deserialize)] #[derive(Debug, Deserialize)]
@@ -751,7 +784,9 @@ pub async fn delete_voice_handler(
} }
/// POST /tts/voices/upload — register a cloned voice from an uploaded audio /// POST /tts/voices/upload — register a cloned voice from an uploaded audio
/// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`). /// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`),
/// plus optional `start_seconds` / `duration_seconds` (text) selecting which
/// window of a longer recording becomes the reference clip.
#[post("/tts/voices/upload")] #[post("/tts/voices/upload")]
pub async fn create_voice_upload_handler( pub async fn create_voice_upload_handler(
http_request: HttpRequest, http_request: HttpRequest,
@@ -769,6 +804,8 @@ pub async fn create_voice_upload_handler(
}; };
let mut voice_name: Option<String> = None; let mut voice_name: Option<String> = None;
let mut start_field: Option<String> = None;
let mut duration_field: Option<String> = None;
let mut file_bytes = BytesMut::new(); let mut file_bytes = BytesMut::new();
let mut filename = "voice.wav".to_string(); let mut filename = "voice.wav".to_string();
@@ -793,25 +830,57 @@ pub async fn create_voice_upload_handler(
} }
file_bytes.put(data); file_bytes.put(data);
} }
} else if name_opt.as_deref() == Some("voice_name") { } else if matches!(
name_opt.as_deref(),
Some("voice_name" | "start_seconds" | "duration_seconds")
) {
let field = name_opt.as_deref().unwrap().to_string();
let mut buf = BytesMut::new(); let mut buf = BytesMut::new();
while let Some(Ok(data)) = part.next().await { while let Some(Ok(data)) = part.next().await {
buf.put(data); buf.put(data);
} }
voice_name = Some(String::from_utf8_lossy(&buf).trim().to_string()); let text = String::from_utf8_lossy(&buf).trim().to_string();
match field.as_str() {
"voice_name" => voice_name = Some(text),
"start_seconds" => start_field = Some(text),
_ => duration_field = Some(text),
}
} else { } else {
while let Some(Ok(_)) = part.next().await {} while let Some(Ok(_)) = part.next().await {}
} }
} }
// Empty text parts are treated as absent; anything else must parse, so a
// client bug ("abc") fails loudly instead of silently cloning from 0s.
let parse_secs = |field: Option<&String>, name: &str| -> Result<Option<f64>, String> {
match field.map(|s| s.as_str()).filter(|s| !s.is_empty()) {
None => Ok(None),
Some(s) => s
.parse::<f64>()
.map(Some)
.map_err(|_| format!("{name} must be a number of seconds")),
}
};
let window = parse_secs(start_field.as_ref(), "start_seconds").and_then(|start| {
parse_secs(duration_field.as_ref(), "duration_seconds")
.and_then(|dur| resolve_ref_window(start, dur))
});
let (ref_start, ref_duration) = match window {
Ok(w) => w,
Err(msg) => {
span.set_status(Status::error("invalid reference window"));
return HttpResponse::BadRequest().json(json!({ "error": msg }));
}
};
let Some(name) = voice_name.as_deref().and_then(sanitize_voice_name) else { let Some(name) = voice_name.as_deref().and_then(sanitize_voice_name) else {
span.set_status(Status::error("voice_name is required")); span.set_status(Status::error("voice_name is required"));
return HttpResponse::BadRequest() return HttpResponse::BadRequest()
.json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" })); .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
}; };
// Tag the name with the ref-clip cap (e.g. `grandma-30s`) so the library // Tag the name with the ref-clip length (e.g. `grandma-30s`) so the
// shows which reference length produced each clone. // library shows which reference length produced each clone.
let name = append_ref_seconds(&name, tts_ref_seconds()); let name = append_ref_seconds(&name, ref_duration.round().max(1.0) as u32);
if file_bytes.is_empty() { if file_bytes.is_empty() {
span.set_status(Status::error("voice_file is required")); span.set_status(Status::error("voice_file is required"));
return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" })); return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" }));
@@ -822,7 +891,8 @@ pub async fn create_voice_upload_handler(
// Normalize to WAV so any device format (e.g. .aac / .opus, which Chatterbox // Normalize to WAV so any device format (e.g. .aac / .opus, which Chatterbox
// rejects by extension) is accepted. // rejects by extension) is accepted.
let src_ext = Path::new(&filename).extension().and_then(|e| e.to_str()); let src_ext = Path::new(&filename).extension().and_then(|e| e.to_str());
let wav = match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext).await { let wav =
match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext, ref_start, ref_duration).await {
Ok(w) => w, Ok(w) => w,
Err(e) => { Err(e) => {
span.set_status(Status::error("audio decode failed")); span.set_status(Status::error("audio decode failed"));
@@ -856,11 +926,19 @@ pub struct CreateVoiceFromLibraryRequest {
pub path: String, pub path: String,
#[serde(default)] #[serde(default)]
pub library: Option<String>, pub library: Option<String>,
/// Offset into the source where the reference window begins (default 0) —
/// lets the client pick the clean-speech section of a long recording.
#[serde(default)]
pub start_seconds: Option<f64>,
/// Reference window length; clamped to LLAMA_SWAP_TTS_REF_SECONDS.
#[serde(default)]
pub duration_seconds: Option<f64>,
} }
/// POST /tts/voices/from-library — register a cloned voice from a file already /// POST /tts/voices/from-library — register a cloned voice from a file already
/// in a library. Audio and video alike are ffmpeg-normalized to a mono 24 kHz /// in a library. Audio and video alike are ffmpeg-normalized to a mono 24 kHz
/// WAV reference clip (length capped by LLAMA_SWAP_TTS_REF_SECONDS). /// WAV reference clip (window selected by start/duration_seconds, length
/// capped by LLAMA_SWAP_TTS_REF_SECONDS).
#[post("/tts/voices/from-library")] #[post("/tts/voices/from-library")]
pub async fn create_voice_from_library_handler( pub async fn create_voice_from_library_handler(
http_request: HttpRequest, http_request: HttpRequest,
@@ -882,9 +960,17 @@ pub async fn create_voice_from_library_handler(
return HttpResponse::BadRequest() return HttpResponse::BadRequest()
.json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" })); .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
}; };
// Tag the name with the ref-clip cap (e.g. `grandma-30s`) so the library let (ref_start, ref_duration) =
// shows which reference length produced each clone. match resolve_ref_window(req.start_seconds, req.duration_seconds) {
let voice_name = append_ref_seconds(&voice_name, tts_ref_seconds()); Ok(w) => w,
Err(msg) => {
span.set_status(Status::error("invalid reference window"));
return HttpResponse::BadRequest().json(json!({ "error": msg }));
}
};
// Tag the name with the ref-clip length (e.g. `grandma-30s`) so the
// library shows which reference length produced each clone.
let voice_name = append_ref_seconds(&voice_name, ref_duration.round().max(1.0) as u32);
let library = match libraries::resolve_library_param(&app_state, req.library.as_deref()) { let library = match libraries::resolve_library_param(&app_state, req.library.as_deref()) {
Ok(Some(l)) => l, Ok(Some(l)) => l,
@@ -913,7 +999,7 @@ pub async fn create_voice_from_library_handler(
} }
span.set_attribute(KeyValue::new("tts.voice_name", voice_name.clone())); span.set_attribute(KeyValue::new("tts.voice_name", voice_name.clone()));
let wav = match prepare_reference_audio(&abs).await { let wav = match prepare_reference_audio(&abs, ref_start, ref_duration).await {
Ok(b) => b, Ok(b) => b,
Err(e) => { Err(e) => {
span.set_status(Status::error("audio decode failed")); span.set_status(Status::error("audio decode failed"));
@@ -943,8 +1029,8 @@ pub async fn create_voice_from_library_handler(
/// Read a library file (audio or video) as a Chatterbox-ready reference: ffmpeg /// Read a library file (audio or video) as a Chatterbox-ready reference: ffmpeg
/// decodes/extracts its audio to mono 24 kHz WAV. Reading straight from the /// decodes/extracts its audio to mono 24 kHz WAV. Reading straight from the
/// library path avoids slurping a (possibly large) video into memory. /// library path avoids slurping a (possibly large) video into memory.
async fn prepare_reference_audio(abs: &Path) -> anyhow::Result<Vec<u8>> { async fn prepare_reference_audio(abs: &Path, start: f64, duration: f64) -> anyhow::Result<Vec<u8>> {
run_ffmpeg_to_wav(&abs.to_string_lossy()).await run_ffmpeg_to_wav(&abs.to_string_lossy(), start, duration).await
} }
#[cfg(test)] #[cfg(test)]
@@ -1009,6 +1095,33 @@ mod tests {
assert!(tagged.ends_with("-30s")); assert!(tagged.ends_with("-30s"));
} }
#[test]
fn resolve_ref_window_defaults_to_start_of_clip_at_cap_length() {
    // The expected length is read from the live cap instead of setting
    // LLAMA_SWAP_TTS_REF_SECONDS here: env mutation flakes under the parallel
    // suite (see env_dispatch).
    let cap = f64::from(tts_ref_seconds());
    let window = resolve_ref_window(None, None);
    assert_eq!(window, Ok((0.0, cap)));
}
#[test]
fn resolve_ref_window_accepts_offset_and_clamps_duration() {
    let cap = f64::from(tts_ref_seconds());

    // An offset on its own keeps the default, cap-length window.
    let offset_only = resolve_ref_window(Some(92.5), None);
    assert_eq!(offset_only, Ok((92.5, cap)));

    // An explicit start and duration pass through unchanged.
    let explicit = resolve_ref_window(Some(10.0), Some(12.0));
    assert_eq!(explicit, Ok((10.0, 12.0)));

    // Longer-than-cap windows are bounded, not rejected.
    let oversized = resolve_ref_window(None, Some(cap + 100.0));
    assert_eq!(oversized, Ok((0.0, cap)));
}
#[test]
fn resolve_ref_window_rejects_garbage() {
    // Starts that must be refused: negative, NaN, infinite.
    for start in [-1.0, f64::NAN, f64::INFINITY] {
        assert!(resolve_ref_window(Some(start), None).is_err());
    }

    // Durations that must be refused: zero, negative, NaN.
    for duration in [0.0, -5.0, f64::NAN] {
        assert!(resolve_ref_window(None, Some(duration)).is_err());
    }
}
#[test] #[test]
fn sweep_drops_expired_results_and_keeps_live_jobs() { fn sweep_drops_expired_results_and_keeps_live_jobs() {
let now = Instant::now(); let now = Instant::now();