Add start/duration window selection for voice-clone reference clips
Both voice-creation endpoints (upload and from-library) now accept optional start_seconds/duration_seconds, which are passed to ffmpeg as -ss/-t, so the reference window can target clean speech anywhere in a long recording instead of always the first N seconds. Duration is clamped to the LLAMA_SWAP_TTS_REF_SECONDS cap, and the voice-name tag reflects the actual window length (rounded to whole seconds).

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
+147
-34
@@ -193,6 +193,27 @@ fn append_ref_seconds(name: &str, secs: u32) -> String {
|
|||||||
format!("{base}{suffix}")
|
format!("{base}{suffix}")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Resolve a caller-supplied reference window into concrete `(start, duration)`
|
||||||
|
/// seconds for ffmpeg. Start defaults to 0; duration defaults to the
|
||||||
|
/// `tts_ref_seconds` cap and is clamped to it (the cap is the most audio the
|
||||||
|
/// TTS backend benefits from, so longer requests are quietly bounded rather
|
||||||
|
/// than rejected). Non-finite or negative values are the caller's bug → Err.
|
||||||
|
fn resolve_ref_window(
|
||||||
|
start_seconds: Option<f64>,
|
||||||
|
duration_seconds: Option<f64>,
|
||||||
|
) -> Result<(f64, f64), String> {
|
||||||
|
let cap = f64::from(tts_ref_seconds());
|
||||||
|
let start = start_seconds.unwrap_or(0.0);
|
||||||
|
if !start.is_finite() || start < 0.0 {
|
||||||
|
return Err("start_seconds must be a non-negative number".to_string());
|
||||||
|
}
|
||||||
|
let duration = duration_seconds.unwrap_or(cap);
|
||||||
|
if !duration.is_finite() || duration <= 0.0 {
|
||||||
|
return Err("duration_seconds must be a positive number".to_string());
|
||||||
|
}
|
||||||
|
Ok((start, duration.min(cap)))
|
||||||
|
}
|
||||||
|
|
||||||
/// Optional default voice for synthesis when the request doesn't name one.
|
/// Optional default voice for synthesis when the request doesn't name one.
|
||||||
/// Set `LLAMA_SWAP_TTS_VOICE=m` to read insights in a cloned voice by default.
|
/// Set `LLAMA_SWAP_TTS_VOICE=m` to read insights in a cloned voice by default.
|
||||||
fn default_voice() -> Option<String> {
|
fn default_voice() -> Option<String> {
|
||||||
@@ -265,24 +286,31 @@ fn prepare_for_tts(input: &str) -> String {
|
|||||||
/// Decode an audio/video file to mono 24 kHz WAV via ffmpeg, returning the WAV
|
/// Decode an audio/video file to mono 24 kHz WAV via ffmpeg, returning the WAV
|
||||||
/// bytes. Chatterbox validates the reference clip by file *extension* and
|
/// bytes. Chatterbox validates the reference clip by file *extension* and
|
||||||
/// rejects several formats (e.g. `.aac`, `.opus`), so we always normalize to
|
/// rejects several formats (e.g. `.aac`, `.opus`), so we always normalize to
|
||||||
/// WAV regardless of the source container. Capped at 30s — references only need
|
/// WAV regardless of the source container. Extracts `duration` seconds starting
|
||||||
/// a few seconds of clean speech.
|
/// at `start` (see resolve_ref_window) — references only need a few seconds of
|
||||||
async fn run_ffmpeg_to_wav(input_path: &str) -> anyhow::Result<Vec<u8>> {
|
/// clean speech, which may sit anywhere in a long recording.
|
||||||
|
async fn run_ffmpeg_to_wav(input_path: &str, start: f64, duration: f64) -> anyhow::Result<Vec<u8>> {
|
||||||
let out = tempfile::Builder::new()
|
let out = tempfile::Builder::new()
|
||||||
.suffix(".wav")
|
.suffix(".wav")
|
||||||
.tempfile()
|
.tempfile()
|
||||||
.context("creating temp wav")?;
|
.context("creating temp wav")?;
|
||||||
let out_s = out.path().to_string_lossy().to_string();
|
let out_s = out.path().to_string_lossy().to_string();
|
||||||
|
|
||||||
// Cap the reference clip length — we use the first N seconds (see
|
let start_s = format!("{start}");
|
||||||
// tts_ref_seconds).
|
let secs = format!("{duration}");
|
||||||
let secs = tts_ref_seconds().to_string();
|
|
||||||
|
// -ss before -i is input seeking: fast, and frame accuracy doesn't matter
|
||||||
|
// for picking a speech window.
|
||||||
|
let mut args: Vec<&str> = vec!["-y"];
|
||||||
|
if start > 0.0 {
|
||||||
|
args.extend(["-ss", &start_s]);
|
||||||
|
}
|
||||||
|
args.extend([
|
||||||
|
"-i", input_path, "-vn", "-ac", "1", "-ar", "24000", "-t", &secs, "-f", "wav", &out_s,
|
||||||
|
]);
|
||||||
|
|
||||||
let output = tokio::process::Command::new("ffmpeg")
|
let output = tokio::process::Command::new("ffmpeg")
|
||||||
.args([
|
.args(&args)
|
||||||
"-y", "-i", input_path, "-vn", "-ac", "1", "-ar", "24000", "-t", &secs, "-f", "wav",
|
|
||||||
&out_s,
|
|
||||||
])
|
|
||||||
.output()
|
.output()
|
||||||
.await
|
.await
|
||||||
.context("spawning ffmpeg")?;
|
.context("spawning ffmpeg")?;
|
||||||
@@ -295,7 +323,12 @@ async fn run_ffmpeg_to_wav(input_path: &str) -> anyhow::Result<Vec<u8>> {
|
|||||||
|
|
||||||
/// Normalize in-memory upload bytes to WAV: write to a temp file (keeping the
|
/// Normalize in-memory upload bytes to WAV: write to a temp file (keeping the
|
||||||
/// source extension as an ffmpeg probe hint) then transcode.
|
/// source extension as an ffmpeg probe hint) then transcode.
|
||||||
async fn transcode_bytes_to_wav(input: &[u8], src_ext: Option<&str>) -> anyhow::Result<Vec<u8>> {
|
async fn transcode_bytes_to_wav(
|
||||||
|
input: &[u8],
|
||||||
|
src_ext: Option<&str>,
|
||||||
|
start: f64,
|
||||||
|
duration: f64,
|
||||||
|
) -> anyhow::Result<Vec<u8>> {
|
||||||
let suffix = src_ext
|
let suffix = src_ext
|
||||||
.filter(|e| !e.is_empty())
|
.filter(|e| !e.is_empty())
|
||||||
.map(|e| format!(".{e}"))
|
.map(|e| format!(".{e}"))
|
||||||
@@ -305,7 +338,7 @@ async fn transcode_bytes_to_wav(input: &[u8], src_ext: Option<&str>) -> anyhow::
|
|||||||
.tempfile()
|
.tempfile()
|
||||||
.context("creating temp input")?;
|
.context("creating temp input")?;
|
||||||
std::fs::write(in_tmp.path(), input).context("writing temp input")?;
|
std::fs::write(in_tmp.path(), input).context("writing temp input")?;
|
||||||
run_ffmpeg_to_wav(&in_tmp.path().to_string_lossy()).await
|
run_ffmpeg_to_wav(&in_tmp.path().to_string_lossy(), start, duration).await
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
#[derive(Debug, Deserialize)]
|
||||||
@@ -751,7 +784,9 @@ pub async fn delete_voice_handler(
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// POST /tts/voices/upload — register a cloned voice from an uploaded audio
|
/// POST /tts/voices/upload — register a cloned voice from an uploaded audio
|
||||||
/// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`).
|
/// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`),
|
||||||
|
/// plus optional `start_seconds` / `duration_seconds` (text) selecting which
|
||||||
|
/// window of a longer recording becomes the reference clip.
|
||||||
#[post("/tts/voices/upload")]
|
#[post("/tts/voices/upload")]
|
||||||
pub async fn create_voice_upload_handler(
|
pub async fn create_voice_upload_handler(
|
||||||
http_request: HttpRequest,
|
http_request: HttpRequest,
|
||||||
@@ -769,6 +804,8 @@ pub async fn create_voice_upload_handler(
|
|||||||
};
|
};
|
||||||
|
|
||||||
let mut voice_name: Option<String> = None;
|
let mut voice_name: Option<String> = None;
|
||||||
|
let mut start_field: Option<String> = None;
|
||||||
|
let mut duration_field: Option<String> = None;
|
||||||
let mut file_bytes = BytesMut::new();
|
let mut file_bytes = BytesMut::new();
|
||||||
let mut filename = "voice.wav".to_string();
|
let mut filename = "voice.wav".to_string();
|
||||||
|
|
||||||
@@ -793,25 +830,57 @@ pub async fn create_voice_upload_handler(
|
|||||||
}
|
}
|
||||||
file_bytes.put(data);
|
file_bytes.put(data);
|
||||||
}
|
}
|
||||||
} else if name_opt.as_deref() == Some("voice_name") {
|
} else if matches!(
|
||||||
|
name_opt.as_deref(),
|
||||||
|
Some("voice_name" | "start_seconds" | "duration_seconds")
|
||||||
|
) {
|
||||||
|
let field = name_opt.as_deref().unwrap().to_string();
|
||||||
let mut buf = BytesMut::new();
|
let mut buf = BytesMut::new();
|
||||||
while let Some(Ok(data)) = part.next().await {
|
while let Some(Ok(data)) = part.next().await {
|
||||||
buf.put(data);
|
buf.put(data);
|
||||||
}
|
}
|
||||||
voice_name = Some(String::from_utf8_lossy(&buf).trim().to_string());
|
let text = String::from_utf8_lossy(&buf).trim().to_string();
|
||||||
|
match field.as_str() {
|
||||||
|
"voice_name" => voice_name = Some(text),
|
||||||
|
"start_seconds" => start_field = Some(text),
|
||||||
|
_ => duration_field = Some(text),
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
while let Some(Ok(_)) = part.next().await {}
|
while let Some(Ok(_)) = part.next().await {}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Empty text parts are treated as absent; anything else must parse, so a
|
||||||
|
// client bug ("abc") fails loudly instead of silently cloning from 0s.
|
||||||
|
let parse_secs = |field: Option<&String>, name: &str| -> Result<Option<f64>, String> {
|
||||||
|
match field.map(|s| s.as_str()).filter(|s| !s.is_empty()) {
|
||||||
|
None => Ok(None),
|
||||||
|
Some(s) => s
|
||||||
|
.parse::<f64>()
|
||||||
|
.map(Some)
|
||||||
|
.map_err(|_| format!("{name} must be a number of seconds")),
|
||||||
|
}
|
||||||
|
};
|
||||||
|
let window = parse_secs(start_field.as_ref(), "start_seconds").and_then(|start| {
|
||||||
|
parse_secs(duration_field.as_ref(), "duration_seconds")
|
||||||
|
.and_then(|dur| resolve_ref_window(start, dur))
|
||||||
|
});
|
||||||
|
let (ref_start, ref_duration) = match window {
|
||||||
|
Ok(w) => w,
|
||||||
|
Err(msg) => {
|
||||||
|
span.set_status(Status::error("invalid reference window"));
|
||||||
|
return HttpResponse::BadRequest().json(json!({ "error": msg }));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
let Some(name) = voice_name.as_deref().and_then(sanitize_voice_name) else {
|
let Some(name) = voice_name.as_deref().and_then(sanitize_voice_name) else {
|
||||||
span.set_status(Status::error("voice_name is required"));
|
span.set_status(Status::error("voice_name is required"));
|
||||||
return HttpResponse::BadRequest()
|
return HttpResponse::BadRequest()
|
||||||
.json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
|
.json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
|
||||||
};
|
};
|
||||||
// Tag the name with the ref-clip cap (e.g. `grandma-30s`) so the library
|
// Tag the name with the ref-clip length (e.g. `grandma-30s`) so the
|
||||||
// shows which reference length produced each clone.
|
// library shows which reference length produced each clone.
|
||||||
let name = append_ref_seconds(&name, tts_ref_seconds());
|
let name = append_ref_seconds(&name, ref_duration.round().max(1.0) as u32);
|
||||||
if file_bytes.is_empty() {
|
if file_bytes.is_empty() {
|
||||||
span.set_status(Status::error("voice_file is required"));
|
span.set_status(Status::error("voice_file is required"));
|
||||||
return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" }));
|
return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" }));
|
||||||
@@ -822,15 +891,16 @@ pub async fn create_voice_upload_handler(
|
|||||||
// Normalize to WAV so any device format (e.g. .aac / .opus, which Chatterbox
|
// Normalize to WAV so any device format (e.g. .aac / .opus, which Chatterbox
|
||||||
// rejects by extension) is accepted.
|
// rejects by extension) is accepted.
|
||||||
let src_ext = Path::new(&filename).extension().and_then(|e| e.to_str());
|
let src_ext = Path::new(&filename).extension().and_then(|e| e.to_str());
|
||||||
let wav = match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext).await {
|
let wav =
|
||||||
Ok(w) => w,
|
match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext, ref_start, ref_duration).await {
|
||||||
Err(e) => {
|
Ok(w) => w,
|
||||||
span.set_status(Status::error("audio decode failed"));
|
Err(e) => {
|
||||||
log::error!("voice upload transcode failed: {:?}", e);
|
span.set_status(Status::error("audio decode failed"));
|
||||||
return HttpResponse::BadRequest()
|
log::error!("voice upload transcode failed: {:?}", e);
|
||||||
.json(json!({ "error": "couldn't decode that audio file" }));
|
return HttpResponse::BadRequest()
|
||||||
}
|
.json(json!({ "error": "couldn't decode that audio file" }));
|
||||||
};
|
}
|
||||||
|
};
|
||||||
|
|
||||||
match client
|
match client
|
||||||
.create_voice(&name, wav, "reference.wav", "audio/wav")
|
.create_voice(&name, wav, "reference.wav", "audio/wav")
|
||||||
@@ -856,11 +926,19 @@ pub struct CreateVoiceFromLibraryRequest {
|
|||||||
pub path: String,
|
pub path: String,
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub library: Option<String>,
|
pub library: Option<String>,
|
||||||
|
/// Offset into the source where the reference window begins (default 0) —
|
||||||
|
/// lets the client pick the clean-speech section of a long recording.
|
||||||
|
#[serde(default)]
|
||||||
|
pub start_seconds: Option<f64>,
|
||||||
|
/// Reference window length; clamped to LLAMA_SWAP_TTS_REF_SECONDS.
|
||||||
|
#[serde(default)]
|
||||||
|
pub duration_seconds: Option<f64>,
|
||||||
}
|
}
|
||||||
|
|
||||||
/// POST /tts/voices/from-library — register a cloned voice from a file already
|
/// POST /tts/voices/from-library — register a cloned voice from a file already
|
||||||
/// in a library. Audio and video alike are ffmpeg-normalized to a mono 24 kHz
|
/// in a library. Audio and video alike are ffmpeg-normalized to a mono 24 kHz
|
||||||
/// WAV reference clip (length capped by LLAMA_SWAP_TTS_REF_SECONDS).
|
/// WAV reference clip (window selected by start/duration_seconds, length
|
||||||
|
/// capped by LLAMA_SWAP_TTS_REF_SECONDS).
|
||||||
#[post("/tts/voices/from-library")]
|
#[post("/tts/voices/from-library")]
|
||||||
pub async fn create_voice_from_library_handler(
|
pub async fn create_voice_from_library_handler(
|
||||||
http_request: HttpRequest,
|
http_request: HttpRequest,
|
||||||
@@ -882,9 +960,17 @@ pub async fn create_voice_from_library_handler(
|
|||||||
return HttpResponse::BadRequest()
|
return HttpResponse::BadRequest()
|
||||||
.json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
|
.json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
|
||||||
};
|
};
|
||||||
// Tag the name with the ref-clip cap (e.g. `grandma-30s`) so the library
|
let (ref_start, ref_duration) =
|
||||||
// shows which reference length produced each clone.
|
match resolve_ref_window(req.start_seconds, req.duration_seconds) {
|
||||||
let voice_name = append_ref_seconds(&voice_name, tts_ref_seconds());
|
Ok(w) => w,
|
||||||
|
Err(msg) => {
|
||||||
|
span.set_status(Status::error("invalid reference window"));
|
||||||
|
return HttpResponse::BadRequest().json(json!({ "error": msg }));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
// Tag the name with the ref-clip length (e.g. `grandma-30s`) so the
|
||||||
|
// library shows which reference length produced each clone.
|
||||||
|
let voice_name = append_ref_seconds(&voice_name, ref_duration.round().max(1.0) as u32);
|
||||||
|
|
||||||
let library = match libraries::resolve_library_param(&app_state, req.library.as_deref()) {
|
let library = match libraries::resolve_library_param(&app_state, req.library.as_deref()) {
|
||||||
Ok(Some(l)) => l,
|
Ok(Some(l)) => l,
|
||||||
@@ -913,7 +999,7 @@ pub async fn create_voice_from_library_handler(
|
|||||||
}
|
}
|
||||||
span.set_attribute(KeyValue::new("tts.voice_name", voice_name.clone()));
|
span.set_attribute(KeyValue::new("tts.voice_name", voice_name.clone()));
|
||||||
|
|
||||||
let wav = match prepare_reference_audio(&abs).await {
|
let wav = match prepare_reference_audio(&abs, ref_start, ref_duration).await {
|
||||||
Ok(b) => b,
|
Ok(b) => b,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
span.set_status(Status::error("audio decode failed"));
|
span.set_status(Status::error("audio decode failed"));
|
||||||
@@ -943,8 +1029,8 @@ pub async fn create_voice_from_library_handler(
|
|||||||
/// Read a library file (audio or video) as a Chatterbox-ready reference: ffmpeg
|
/// Read a library file (audio or video) as a Chatterbox-ready reference: ffmpeg
|
||||||
/// decodes/extracts its audio to mono 24 kHz WAV. Reading straight from the
|
/// decodes/extracts its audio to mono 24 kHz WAV. Reading straight from the
|
||||||
/// library path avoids slurping a (possibly large) video into memory.
|
/// library path avoids slurping a (possibly large) video into memory.
|
||||||
async fn prepare_reference_audio(abs: &Path) -> anyhow::Result<Vec<u8>> {
|
async fn prepare_reference_audio(abs: &Path, start: f64, duration: f64) -> anyhow::Result<Vec<u8>> {
|
||||||
run_ffmpeg_to_wav(&abs.to_string_lossy()).await
|
run_ffmpeg_to_wav(&abs.to_string_lossy(), start, duration).await
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -1009,6 +1095,33 @@ mod tests {
|
|||||||
assert!(tagged.ends_with("-30s"));
|
assert!(tagged.ends_with("-30s"));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn resolve_ref_window_defaults_to_start_of_clip_at_cap_length() {
    // The expected length is taken from the live cap: overriding
    // LLAMA_SWAP_TTS_REF_SECONDS here would flake when the suite runs in
    // parallel (see env_dispatch).
    let expected = (0.0, f64::from(tts_ref_seconds()));
    let resolved = resolve_ref_window(None, None);
    assert_eq!(resolved, Ok(expected));
}
|
||||||
|
|
||||||
|
#[test]
fn resolve_ref_window_accepts_offset_and_clamps_duration() {
    let limit = f64::from(tts_ref_seconds());
    // (start, duration, expected window)
    let cases = [
        // Offset only: duration falls back to the cap.
        (Some(92.5), None, (92.5, limit)),
        // Both supplied: passed through unchanged.
        (Some(10.0), Some(12.0), (10.0, 12.0)),
        // Longer-than-cap windows are bounded, not rejected.
        (None, Some(limit + 100.0), (0.0, limit)),
    ];
    for (start, duration, expected) in cases {
        assert_eq!(
            resolve_ref_window(start, duration),
            Ok(expected),
            "start={start:?} duration={duration:?}"
        );
    }
}
|
||||||
|
|
||||||
|
#[test]
fn resolve_ref_window_rejects_garbage() {
    // (start, duration) pairs that must each be refused.
    let rejected: [(Option<f64>, Option<f64>); 6] = [
        (Some(-1.0), None),
        (Some(f64::NAN), None),
        (Some(f64::INFINITY), None),
        (None, Some(0.0)),
        (None, Some(-5.0)),
        (None, Some(f64::NAN)),
    ];
    for (start, duration) in rejected {
        assert!(
            resolve_ref_window(start, duration).is_err(),
            "expected rejection for start={start:?} duration={duration:?}"
        );
    }
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn sweep_drops_expired_results_and_keeps_live_jobs() {
|
fn sweep_drops_expired_results_and_keeps_live_jobs() {
|
||||||
let now = Instant::now();
|
let now = Instant::now();
|
||||||
|
|||||||
Reference in New Issue
Block a user