From 62d517dcdaba390ceb3db6a53aa86cd37569369a Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Tue, 2 Jun 2026 22:50:08 -0400 Subject: [PATCH] Normalize voice-clone reference audio to WAV via ffmpeg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Chatterbox validates the reference clip by file extension and rejects formats like .aac/.opus. Always transcode the reference (upload bytes and library files alike) to mono 24 kHz WAV with ffmpeg before forwarding, so any source format is accepted and the from-library audio/video paths are unified. The reference length cap is now configurable via LLAMA_SWAP_TTS_REF_SECONDS (default 30) — Chatterbox is zero-shot, so a clean ~10-20s clip is the sweet spot. Drops the now-unused mime guesser. Co-Authored-By: Claude Opus 4.8 (1M context) --- .env.example | 1 + README.md | 4 ++ src/ai/tts.rs | 152 ++++++++++++++++++++++++-------------------------- 3 files changed, 79 insertions(+), 78 deletions(-) diff --git a/.env.example b/.env.example index 835bef5..2b6cff0 100644 --- a/.env.example +++ b/.env.example @@ -87,6 +87,7 @@ AGENTIC_CHAT_MAX_ITERATIONS=6 # + voice cloning in the mobile app). # LLAMA_SWAP_TTS_MODEL=chatterbox # TTS model id in config.yaml # LLAMA_SWAP_TTS_VOICE=m # default voice when a request omits one +# LLAMA_SWAP_TTS_REF_SECONDS=30 # max voice-clone reference clip length (s) # ── AI Insights — sibling services (optional) ─────────────────────────── # Apollo (places, face inference, CLIP encoders). Single-Apollo deploys diff --git a/README.md b/README.md index 12c220f..0b678df 100644 --- a/README.md +++ b/README.md @@ -165,6 +165,10 @@ is built whenever that's set — independent of `LLM_BACKEND`). 
Endpoints: Env: - `LLAMA_SWAP_TTS_MODEL` - TTS model id in llama-swap's `config.yaml` [default: `chatterbox`] - `LLAMA_SWAP_TTS_VOICE` - default voice used when a `/tts/speech` request omits `voice` (optional) +- `LLAMA_SWAP_TTS_REF_SECONDS` - max voice-clone reference clip length in seconds + [default: `30`]. Reference audio is ffmpeg-normalized to mono 24 kHz WAV (so any + source format works); Chatterbox is zero-shot, so a clean ~10–20s sample is the + sweet spot — more rarely helps. #### Fallback Behavior - Primary server is tried first with 5-second connection timeout diff --git a/src/ai/tts.rs b/src/ai/tts.rs index 8078132..59b4a80 100644 --- a/src/ai/tts.rs +++ b/src/ai/tts.rs @@ -110,21 +110,56 @@ fn clean_for_tts(input: &str) -> String { s.trim().to_string() } -fn guess_audio_mime(path: &Path) -> String { - match path - .extension() - .and_then(|e| e.to_str()) - .map(|e| e.to_lowercase()) - .as_deref() - { - Some("wav") => "audio/wav", - Some("mp3") => "audio/mpeg", - Some("m4a") | Some("mp4") | Some("aac") => "audio/mp4", - Some("flac") => "audio/flac", - Some("ogg") | Some("oga") => "audio/ogg", - _ => "application/octet-stream", +/// Decode an audio/video file to mono 24 kHz WAV via ffmpeg, returning the WAV +/// bytes. Chatterbox validates the reference clip by file *extension* and +/// rejects several formats (e.g. `.aac`, `.opus`), so we always normalize to +/// WAV regardless of the source container. Capped at LLAMA_SWAP_TTS_REF_SECONDS +/// (default 30s) — references only need a few seconds of clean speech. +async fn run_ffmpeg_to_wav(input_path: &str) -> anyhow::Result<Vec<u8>> { + let out = tempfile::Builder::new() + .suffix(".wav") + .tempfile() + .context("creating temp wav")?; + let out_s = out.path().to_string_lossy().to_string(); + + // Cap the reference clip length. Chatterbox is zero-shot — a clean ~10–20s + // sample is the sweet spot and more rarely helps — so we use the first N + // seconds. Tune via LLAMA_SWAP_TTS_REF_SECONDS (default 30).
+ let secs = std::env::var("LLAMA_SWAP_TTS_REF_SECONDS") + .ok() + .and_then(|s| s.trim().parse::<u32>().ok()) + .filter(|n| *n > 0) + .unwrap_or(30) + .to_string(); + + let output = tokio::process::Command::new("ffmpeg") + .args([ + "-y", "-i", input_path, "-vn", "-ac", "1", "-ar", "24000", "-t", &secs, "-f", "wav", + &out_s, + ]) + .output() + .await + .context("spawning ffmpeg")?; + + if !output.status.success() { + anyhow::bail!("ffmpeg failed: {}", String::from_utf8_lossy(&output.stderr)); } - .to_string() + std::fs::read(&out_s).context("reading transcoded audio") +} + +/// Normalize in-memory upload bytes to WAV: write to a temp file (keeping the +/// source extension as an ffmpeg probe hint) then transcode. +async fn transcode_bytes_to_wav(input: &[u8], src_ext: Option<&str>) -> anyhow::Result<Vec<u8>> { + let suffix = src_ext + .filter(|e| !e.is_empty()) + .map(|e| format!(".{e}")) + .unwrap_or_else(|| ".bin".to_string()); + let in_tmp = tempfile::Builder::new() + .suffix(&suffix) + .tempfile() + .context("creating temp input")?; + std::fs::write(in_tmp.path(), input).context("writing temp input")?; + run_ffmpeg_to_wav(&in_tmp.path().to_string_lossy()).await } #[derive(Debug, Deserialize)] @@ -239,7 +274,6 @@ pub async fn create_voice_upload_handler( let mut voice_name: Option<String> = None; let mut file_bytes = BytesMut::new(); let mut filename = "voice.wav".to_string(); - let mut mime = "application/octet-stream".to_string(); while let Some(Ok(mut part)) = payload.next().await { // Capture disposition fields up front so the immutable borrow ends @@ -254,9 +288,6 @@ pub async fn create_voice_upload_handler( if let Some(fname) = fname_opt { filename = fname; - if let Some(ct) = part.content_type() { - mime = ct.to_string(); - } while let Some(Ok(data)) = part.next().await { if file_bytes.len() + data.len() > MAX_VOICE_UPLOAD_BYTES { return HttpResponse::PayloadTooLarge() @@ -282,12 +313,21 @@ pub async fn create_voice_upload_handler( if file_bytes.is_empty() { return
HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" })); } - if !mime.starts_with("audio") { - mime = guess_audio_mime(Path::new(&filename)); - } + + // Normalize to WAV so any device format (e.g. .aac / .opus, which Chatterbox + // rejects by extension) is accepted. + let src_ext = Path::new(&filename).extension().and_then(|e| e.to_str()); + let wav = match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext).await { + Ok(w) => w, + Err(e) => { + log::error!("voice upload transcode failed: {:?}", e); + return HttpResponse::BadRequest() + .json(json!({ "error": "couldn't decode that audio file" })); + } + }; match client - .create_voice(&name, file_bytes.to_vec(), &filename, &mime) + .create_voice(&name, wav, "reference.wav", "audio/wav") .await { Ok(v) => HttpResponse::Ok().json(v), @@ -308,8 +348,8 @@ pub struct CreateVoiceFromLibraryRequest { } /// POST /tts/voices/from-library — register a cloned voice from a file already -/// in a library. Audio files are forwarded as-is; video files have up to 30s -/// of their audio track extracted (mono, 24 kHz) via ffmpeg. +/// in a library. Audio and video alike are ffmpeg-normalized to a mono 24 kHz +/// WAV reference clip (length capped by LLAMA_SWAP_TTS_REF_SECONDS). 
#[post("/tts/voices/from-library")] pub async fn create_voice_from_library_handler( _claims: Claims, @@ -346,16 +386,17 @@ pub async fn create_voice_from_library_handler( .json(json!({ "error": "file is not an audio or video file" })); } - let (bytes, filename, mime) = match prepare_reference_audio(&abs).await { - Ok(t) => t, + let wav = match prepare_reference_audio(&abs).await { + Ok(b) => b, Err(e) => { log::error!("voice reference prep failed for {:?}: {:?}", abs, e); - return HttpResponse::BadRequest().json(json!({ "error": format!("{e}") })); + return HttpResponse::BadRequest() + .json(json!({ "error": "couldn't decode that file's audio" })); } }; match client - .create_voice(&voice_name, bytes, &filename, &mime) + .create_voice(&voice_name, wav, "reference.wav", "audio/wav") .await { Ok(v) => HttpResponse::Ok().json(v), @@ -366,44 +407,11 @@ pub async fn create_voice_from_library_handler( } } -/// Read a library file as reference audio. Audio is returned verbatim; video -/// has up to 30s of audio extracted to mono 24 kHz WAV via ffmpeg. 
-async fn prepare_reference_audio(abs: &Path) -> anyhow::Result<(Vec<u8>, String, String)> { - if is_video_file(abs) { - let tmp = tempfile::Builder::new() - .suffix(".wav") - .tempfile() - .context("creating temp wav")?; - let out = tmp.path().to_path_buf(); - let abs_s = abs.to_string_lossy().to_string(); - let out_s = out.to_string_lossy().to_string(); - - let output = tokio::process::Command::new("ffmpeg") - .args([ - "-y", "-i", &abs_s, "-vn", "-ac", "1", "-ar", "24000", "-t", "30", "-f", "wav", - &out_s, - ]) - .output() - .await - .context("spawning ffmpeg")?; - - if !output.status.success() { - anyhow::bail!( - "ffmpeg audio extraction failed: {}", - String::from_utf8_lossy(&output.stderr) - ); - } - let bytes = std::fs::read(&out).context("reading extracted audio")?; - Ok((bytes, "reference.wav".to_string(), "audio/wav".to_string())) - } else { - let bytes = std::fs::read(abs).context("reading audio file")?; - let filename = abs - .file_name() - .and_then(|f| f.to_str()) - .unwrap_or("reference") - .to_string(); - Ok((bytes, filename, guess_audio_mime(abs))) - } +/// Read a library file (audio or video) as a Chatterbox-ready reference: ffmpeg +/// decodes/extracts its audio to mono 24 kHz WAV. Reading straight from the +/// library path avoids slurping a (possibly large) video into memory.
+async fn prepare_reference_audio(abs: &Path) -> anyhow::Result<Vec<u8>> { + run_ffmpeg_to_wav(&abs.to_string_lossy()).await } #[cfg(test)] @@ -447,18 +455,6 @@ mod tests { assert_eq!(sanitize_voice_name(&long).unwrap().len(), 64); } - #[test] - fn guess_audio_mime_maps_known_extensions() { - assert_eq!(guess_audio_mime(Path::new("clip.wav")), "audio/wav"); - assert_eq!(guess_audio_mime(Path::new("clip.MP3")), "audio/mpeg"); - assert_eq!(guess_audio_mime(Path::new("clip.m4a")), "audio/mp4"); - assert_eq!(guess_audio_mime(Path::new("clip.flac")), "audio/flac"); - assert_eq!( - guess_audio_mime(Path::new("clip.xyz")), - "application/octet-stream" - ); - } - #[test] fn clean_for_tts_strips_markdown() { assert_eq!(