Normalize voice-clone reference audio to WAV via ffmpeg
Chatterbox validates the reference clip by file extension and rejects formats like .aac/.opus. Always transcode the reference (upload bytes and library files alike) to mono 24 kHz WAV with ffmpeg before forwarding, so any source format is accepted and the from-library audio/video paths are unified. The reference length cap is now configurable via LLAMA_SWAP_TTS_REF_SECONDS (default 30) — Chatterbox is zero-shot, so a clean ~10-20s clip is the sweet spot. Drops the now-unused mime guesser. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -87,6 +87,7 @@ AGENTIC_CHAT_MAX_ITERATIONS=6
|
|||||||
# + voice cloning in the mobile app).
|
# + voice cloning in the mobile app).
|
||||||
# LLAMA_SWAP_TTS_MODEL=chatterbox # TTS model id in config.yaml
|
# LLAMA_SWAP_TTS_MODEL=chatterbox # TTS model id in config.yaml
|
||||||
# LLAMA_SWAP_TTS_VOICE=m # default voice when a request omits one
|
# LLAMA_SWAP_TTS_VOICE=m # default voice when a request omits one
|
||||||
|
# LLAMA_SWAP_TTS_REF_SECONDS=30 # max voice-clone reference clip length (s)
|
||||||
|
|
||||||
# ── AI Insights — sibling services (optional) ───────────────────────────
|
# ── AI Insights — sibling services (optional) ───────────────────────────
|
||||||
# Apollo (places, face inference, CLIP encoders). Single-Apollo deploys
|
# Apollo (places, face inference, CLIP encoders). Single-Apollo deploys
|
||||||
|
|||||||
@@ -165,6 +165,10 @@ is built whenever that's set — independent of `LLM_BACKEND`). Endpoints:
|
|||||||
Env:
|
Env:
|
||||||
- `LLAMA_SWAP_TTS_MODEL` - TTS model id in llama-swap's `config.yaml` [default: `chatterbox`]
|
- `LLAMA_SWAP_TTS_MODEL` - TTS model id in llama-swap's `config.yaml` [default: `chatterbox`]
|
||||||
- `LLAMA_SWAP_TTS_VOICE` - default voice used when a `/tts/speech` request omits `voice` (optional)
|
- `LLAMA_SWAP_TTS_VOICE` - default voice used when a `/tts/speech` request omits `voice` (optional)
|
||||||
|
- `LLAMA_SWAP_TTS_REF_SECONDS` - max voice-clone reference clip length in seconds
|
||||||
|
[default: `30`]. Reference audio is ffmpeg-normalized to mono 24 kHz WAV (so any
|
||||||
|
source format works); Chatterbox is zero-shot, so a clean ~10–20s sample is the
|
||||||
|
sweet spot — longer clips rarely help.
|
||||||
|
|
||||||
#### Fallback Behavior
|
#### Fallback Behavior
|
||||||
- Primary server is tried first with 5-second connection timeout
|
- Primary server is tried first with 5-second connection timeout
|
||||||
|
|||||||
+74
-78
@@ -110,21 +110,56 @@ fn clean_for_tts(input: &str) -> String {
|
|||||||
s.trim().to_string()
|
s.trim().to_string()
|
||||||
}
|
}
|
||||||
|
|
||||||
fn guess_audio_mime(path: &Path) -> String {
|
/// Decode an audio/video file to mono 24 kHz WAV via ffmpeg, returning the WAV
|
||||||
match path
|
/// bytes. Chatterbox validates the reference clip by file *extension* and
|
||||||
.extension()
|
/// rejects several formats (e.g. `.aac`, `.opus`), so we always normalize to
|
||||||
.and_then(|e| e.to_str())
|
/// WAV regardless of the source container. Length is capped via `LLAMA_SWAP_TTS_REF_SECONDS` (default 30s) — references only need
|
||||||
.map(|e| e.to_lowercase())
|
/// a few seconds of clean speech.
|
||||||
.as_deref()
|
async fn run_ffmpeg_to_wav(input_path: &str) -> anyhow::Result<Vec<u8>> {
|
||||||
{
|
let out = tempfile::Builder::new()
|
||||||
Some("wav") => "audio/wav",
|
.suffix(".wav")
|
||||||
Some("mp3") => "audio/mpeg",
|
.tempfile()
|
||||||
Some("m4a") | Some("mp4") | Some("aac") => "audio/mp4",
|
.context("creating temp wav")?;
|
||||||
Some("flac") => "audio/flac",
|
let out_s = out.path().to_string_lossy().to_string();
|
||||||
Some("ogg") | Some("oga") => "audio/ogg",
|
|
||||||
_ => "application/octet-stream",
|
// Cap the reference clip length. Chatterbox is zero-shot — a clean ~10–20s
|
||||||
|
// sample is the sweet spot and longer clips rarely help — so we use the first N
|
||||||
|
// seconds. Tune via LLAMA_SWAP_TTS_REF_SECONDS (default 30).
|
||||||
|
let secs = std::env::var("LLAMA_SWAP_TTS_REF_SECONDS")
|
||||||
|
.ok()
|
||||||
|
.and_then(|s| s.trim().parse::<u32>().ok())
|
||||||
|
.filter(|n| *n > 0)
|
||||||
|
.unwrap_or(30)
|
||||||
|
.to_string();
|
||||||
|
|
||||||
|
let output = tokio::process::Command::new("ffmpeg")
|
||||||
|
.args([
|
||||||
|
"-y", "-i", input_path, "-vn", "-ac", "1", "-ar", "24000", "-t", &secs, "-f", "wav",
|
||||||
|
&out_s,
|
||||||
|
])
|
||||||
|
.output()
|
||||||
|
.await
|
||||||
|
.context("spawning ffmpeg")?;
|
||||||
|
|
||||||
|
if !output.status.success() {
|
||||||
|
anyhow::bail!("ffmpeg failed: {}", String::from_utf8_lossy(&output.stderr));
|
||||||
}
|
}
|
||||||
.to_string()
|
std::fs::read(&out_s).context("reading transcoded audio")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Normalize in-memory upload bytes to WAV: write to a temp file (keeping the
|
||||||
|
/// source extension as an ffmpeg probe hint) then transcode.
|
||||||
|
async fn transcode_bytes_to_wav(input: &[u8], src_ext: Option<&str>) -> anyhow::Result<Vec<u8>> {
|
||||||
|
let suffix = src_ext
|
||||||
|
.filter(|e| !e.is_empty())
|
||||||
|
.map(|e| format!(".{e}"))
|
||||||
|
.unwrap_or_else(|| ".bin".to_string());
|
||||||
|
let in_tmp = tempfile::Builder::new()
|
||||||
|
.suffix(&suffix)
|
||||||
|
.tempfile()
|
||||||
|
.context("creating temp input")?;
|
||||||
|
std::fs::write(in_tmp.path(), input).context("writing temp input")?;
|
||||||
|
run_ffmpeg_to_wav(&in_tmp.path().to_string_lossy()).await
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Deserialize)]
|
#[derive(Debug, Deserialize)]
|
||||||
@@ -239,7 +274,6 @@ pub async fn create_voice_upload_handler(
|
|||||||
let mut voice_name: Option<String> = None;
|
let mut voice_name: Option<String> = None;
|
||||||
let mut file_bytes = BytesMut::new();
|
let mut file_bytes = BytesMut::new();
|
||||||
let mut filename = "voice.wav".to_string();
|
let mut filename = "voice.wav".to_string();
|
||||||
let mut mime = "application/octet-stream".to_string();
|
|
||||||
|
|
||||||
while let Some(Ok(mut part)) = payload.next().await {
|
while let Some(Ok(mut part)) = payload.next().await {
|
||||||
// Capture disposition fields up front so the immutable borrow ends
|
// Capture disposition fields up front so the immutable borrow ends
|
||||||
@@ -254,9 +288,6 @@ pub async fn create_voice_upload_handler(
|
|||||||
|
|
||||||
if let Some(fname) = fname_opt {
|
if let Some(fname) = fname_opt {
|
||||||
filename = fname;
|
filename = fname;
|
||||||
if let Some(ct) = part.content_type() {
|
|
||||||
mime = ct.to_string();
|
|
||||||
}
|
|
||||||
while let Some(Ok(data)) = part.next().await {
|
while let Some(Ok(data)) = part.next().await {
|
||||||
if file_bytes.len() + data.len() > MAX_VOICE_UPLOAD_BYTES {
|
if file_bytes.len() + data.len() > MAX_VOICE_UPLOAD_BYTES {
|
||||||
return HttpResponse::PayloadTooLarge()
|
return HttpResponse::PayloadTooLarge()
|
||||||
@@ -282,12 +313,21 @@ pub async fn create_voice_upload_handler(
|
|||||||
if file_bytes.is_empty() {
|
if file_bytes.is_empty() {
|
||||||
return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" }));
|
return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" }));
|
||||||
}
|
}
|
||||||
if !mime.starts_with("audio") {
|
|
||||||
mime = guess_audio_mime(Path::new(&filename));
|
// Normalize to WAV so any device format (e.g. .aac / .opus, which Chatterbox
|
||||||
}
|
// rejects by extension) is accepted.
|
||||||
|
let src_ext = Path::new(&filename).extension().and_then(|e| e.to_str());
|
||||||
|
let wav = match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext).await {
|
||||||
|
Ok(w) => w,
|
||||||
|
Err(e) => {
|
||||||
|
log::error!("voice upload transcode failed: {:?}", e);
|
||||||
|
return HttpResponse::BadRequest()
|
||||||
|
.json(json!({ "error": "couldn't decode that audio file" }));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
match client
|
match client
|
||||||
.create_voice(&name, file_bytes.to_vec(), &filename, &mime)
|
.create_voice(&name, wav, "reference.wav", "audio/wav")
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
Ok(v) => HttpResponse::Ok().json(v),
|
Ok(v) => HttpResponse::Ok().json(v),
|
||||||
@@ -308,8 +348,8 @@ pub struct CreateVoiceFromLibraryRequest {
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// POST /tts/voices/from-library — register a cloned voice from a file already
|
/// POST /tts/voices/from-library — register a cloned voice from a file already
|
||||||
/// in a library. Audio files are forwarded as-is; video files have up to 30s
|
/// in a library. Audio and video alike are ffmpeg-normalized to a mono 24 kHz
|
||||||
/// of their audio track extracted (mono, 24 kHz) via ffmpeg.
|
/// WAV reference clip (length capped by LLAMA_SWAP_TTS_REF_SECONDS).
|
||||||
#[post("/tts/voices/from-library")]
|
#[post("/tts/voices/from-library")]
|
||||||
pub async fn create_voice_from_library_handler(
|
pub async fn create_voice_from_library_handler(
|
||||||
_claims: Claims,
|
_claims: Claims,
|
||||||
@@ -346,16 +386,17 @@ pub async fn create_voice_from_library_handler(
|
|||||||
.json(json!({ "error": "file is not an audio or video file" }));
|
.json(json!({ "error": "file is not an audio or video file" }));
|
||||||
}
|
}
|
||||||
|
|
||||||
let (bytes, filename, mime) = match prepare_reference_audio(&abs).await {
|
let wav = match prepare_reference_audio(&abs).await {
|
||||||
Ok(t) => t,
|
Ok(b) => b,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
log::error!("voice reference prep failed for {:?}: {:?}", abs, e);
|
log::error!("voice reference prep failed for {:?}: {:?}", abs, e);
|
||||||
return HttpResponse::BadRequest().json(json!({ "error": format!("{e}") }));
|
return HttpResponse::BadRequest()
|
||||||
|
.json(json!({ "error": "couldn't decode that file's audio" }));
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
match client
|
match client
|
||||||
.create_voice(&voice_name, bytes, &filename, &mime)
|
.create_voice(&voice_name, wav, "reference.wav", "audio/wav")
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
Ok(v) => HttpResponse::Ok().json(v),
|
Ok(v) => HttpResponse::Ok().json(v),
|
||||||
@@ -366,44 +407,11 @@ pub async fn create_voice_from_library_handler(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Read a library file as reference audio. Audio is returned verbatim; video
|
/// Read a library file (audio or video) as a Chatterbox-ready reference: ffmpeg
|
||||||
/// has up to 30s of audio extracted to mono 24 kHz WAV via ffmpeg.
|
/// decodes/extracts its audio to mono 24 kHz WAV. Reading straight from the
|
||||||
async fn prepare_reference_audio(abs: &Path) -> anyhow::Result<(Vec<u8>, String, String)> {
|
/// library path avoids slurping a (possibly large) video into memory.
|
||||||
if is_video_file(abs) {
|
async fn prepare_reference_audio(abs: &Path) -> anyhow::Result<Vec<u8>> {
|
||||||
let tmp = tempfile::Builder::new()
|
run_ffmpeg_to_wav(&abs.to_string_lossy()).await
|
||||||
.suffix(".wav")
|
|
||||||
.tempfile()
|
|
||||||
.context("creating temp wav")?;
|
|
||||||
let out = tmp.path().to_path_buf();
|
|
||||||
let abs_s = abs.to_string_lossy().to_string();
|
|
||||||
let out_s = out.to_string_lossy().to_string();
|
|
||||||
|
|
||||||
let output = tokio::process::Command::new("ffmpeg")
|
|
||||||
.args([
|
|
||||||
"-y", "-i", &abs_s, "-vn", "-ac", "1", "-ar", "24000", "-t", "30", "-f", "wav",
|
|
||||||
&out_s,
|
|
||||||
])
|
|
||||||
.output()
|
|
||||||
.await
|
|
||||||
.context("spawning ffmpeg")?;
|
|
||||||
|
|
||||||
if !output.status.success() {
|
|
||||||
anyhow::bail!(
|
|
||||||
"ffmpeg audio extraction failed: {}",
|
|
||||||
String::from_utf8_lossy(&output.stderr)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
let bytes = std::fs::read(&out).context("reading extracted audio")?;
|
|
||||||
Ok((bytes, "reference.wav".to_string(), "audio/wav".to_string()))
|
|
||||||
} else {
|
|
||||||
let bytes = std::fs::read(abs).context("reading audio file")?;
|
|
||||||
let filename = abs
|
|
||||||
.file_name()
|
|
||||||
.and_then(|f| f.to_str())
|
|
||||||
.unwrap_or("reference")
|
|
||||||
.to_string();
|
|
||||||
Ok((bytes, filename, guess_audio_mime(abs)))
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
@@ -447,18 +455,6 @@ mod tests {
|
|||||||
assert_eq!(sanitize_voice_name(&long).unwrap().len(), 64);
|
assert_eq!(sanitize_voice_name(&long).unwrap().len(), 64);
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
|
||||||
fn guess_audio_mime_maps_known_extensions() {
|
|
||||||
assert_eq!(guess_audio_mime(Path::new("clip.wav")), "audio/wav");
|
|
||||||
assert_eq!(guess_audio_mime(Path::new("clip.MP3")), "audio/mpeg");
|
|
||||||
assert_eq!(guess_audio_mime(Path::new("clip.m4a")), "audio/mp4");
|
|
||||||
assert_eq!(guess_audio_mime(Path::new("clip.flac")), "audio/flac");
|
|
||||||
assert_eq!(
|
|
||||||
guess_audio_mime(Path::new("clip.xyz")),
|
|
||||||
"application/octet-stream"
|
|
||||||
);
|
|
||||||
}
|
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn clean_for_tts_strips_markdown() {
|
fn clean_for_tts_strips_markdown() {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
|
|||||||
Reference in New Issue
Block a user