From 62d517dcdaba390ceb3db6a53aa86cd37569369a Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Tue, 2 Jun 2026 22:50:08 -0400
Subject: [PATCH] Normalize voice-clone reference audio to WAV via ffmpeg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Chatterbox validates the reference clip by file extension and rejects formats
like .aac/.opus. Always transcode the reference (upload bytes and library
files alike) to mono 24 kHz WAV with ffmpeg before forwarding, so any source
format is accepted and the from-library audio/video paths are unified.

The reference length cap is now configurable via LLAMA_SWAP_TTS_REF_SECONDS
(default 30) — Chatterbox is zero-shot, so a clean ~10-20s clip is the sweet
spot. Drops the now-unused mime guesser.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .env.example  |   1 +
 README.md     |   4 ++
 src/ai/tts.rs | 152 ++++++++++++++++++++++++--------------------------
 3 files changed, 79 insertions(+), 78 deletions(-)
diff --git a/.env.example b/.env.example
index 835bef5..2b6cff0 100644
--- a/.env.example
+++ b/.env.example
@@ -87,6 +87,7 @@ AGENTIC_CHAT_MAX_ITERATIONS=6
 # + voice cloning in the mobile app).
 # LLAMA_SWAP_TTS_MODEL=chatterbox        # TTS model id in config.yaml
 # LLAMA_SWAP_TTS_VOICE=m                 # default voice when a request omits one
+# LLAMA_SWAP_TTS_REF_SECONDS=30          # max voice-clone reference clip length (s)
 
 # ── AI Insights — sibling services (optional) ───────────────────────────
 # Apollo (places, face inference, CLIP encoders). Single-Apollo deploys
diff --git a/README.md b/README.md
index 12c220f..0b678df 100644
--- a/README.md
+++ b/README.md
@@ -165,6 +165,10 @@ is built whenever that's set — independent of `LLM_BACKEND`). Endpoints:
 Env:
 - `LLAMA_SWAP_TTS_MODEL` - TTS model id in llama-swap's `config.yaml` [default: `chatterbox`]
 - `LLAMA_SWAP_TTS_VOICE` - default voice used when a `/tts/speech` request omits `voice` (optional)
+- `LLAMA_SWAP_TTS_REF_SECONDS` - max voice-clone reference clip length in seconds
+  [default: `30`]. Reference audio is ffmpeg-normalized to mono 24 kHz WAV (so any
+  source format works); Chatterbox is zero-shot, so a clean ~10–20s sample is the
+  sweet spot — longer clips rarely help.
 
 #### Fallback Behavior
 - Primary server is tried first with 5-second connection timeout
diff --git a/src/ai/tts.rs b/src/ai/tts.rs
index 8078132..59b4a80 100644
--- a/src/ai/tts.rs
+++ b/src/ai/tts.rs
@@ -110,21 +110,56 @@ fn clean_for_tts(input: &str) -> String {
     s.trim().to_string()
 }
 
-fn guess_audio_mime(path: &Path) -> String {
-    match path
-        .extension()
-        .and_then(|e| e.to_str())
-        .map(|e| e.to_lowercase())
-        .as_deref()
-    {
-        Some("wav") => "audio/wav",
-        Some("mp3") => "audio/mpeg",
-        Some("m4a") | Some("mp4") | Some("aac") => "audio/mp4",
-        Some("flac") => "audio/flac",
-        Some("ogg") | Some("oga") => "audio/ogg",
-        _ => "application/octet-stream",
+/// Decode an audio/video file to mono 24 kHz WAV via ffmpeg, returning the WAV
+/// bytes. Chatterbox validates the reference clip by file *extension* and
+/// rejects several formats (e.g. `.aac`, `.opus`), so we always normalize to
+/// WAV regardless of the source container. Capped at LLAMA_SWAP_TTS_REF_SECONDS
+/// (default 30s) — references only need a few seconds of clean speech.
+async fn run_ffmpeg_to_wav(input_path: &str) -> anyhow::Result<Vec<u8>> {
+    let out = tempfile::Builder::new()
+        .suffix(".wav")
+        .tempfile()
+        .context("creating temp wav")?;
+    let out_s = out.path().to_string_lossy().to_string();
+
+    // Cap the reference clip length. Chatterbox is zero-shot — a clean ~10–20s
+    // sample is the sweet spot and longer clips rarely help — so we keep the first N
+    // seconds. Tune via LLAMA_SWAP_TTS_REF_SECONDS (default 30; 0/invalid → 30).
+    let secs = std::env::var("LLAMA_SWAP_TTS_REF_SECONDS")
+        .ok()
+        .and_then(|s| s.trim().parse::<u32>().ok())
+        .filter(|n| *n > 0)
+        .unwrap_or(30)
+        .to_string();
+
+    let output = tokio::process::Command::new("ffmpeg")
+        .args([
+            "-y", "-i", input_path, "-vn", "-ac", "1", "-ar", "24000", "-t", &secs, "-f", "wav",
+            &out_s,
+        ])
+        .output()
+        .await
+        .context("spawning ffmpeg")?;
+
+    if !output.status.success() {
+        anyhow::bail!("ffmpeg failed: {}", String::from_utf8_lossy(&output.stderr));
     }
-    .to_string()
+    std::fs::read(&out_s).context("reading transcoded audio")
+}
+
+/// Normalize in-memory upload bytes to WAV: write to a temp file (keeping the
+/// source extension as an ffmpeg probe hint) then transcode.
+async fn transcode_bytes_to_wav(input: &[u8], src_ext: Option<&str>) -> anyhow::Result<Vec<u8>> {
+    let suffix = src_ext
+        .filter(|e| !e.is_empty())
+        .map(|e| format!(".{e}"))
+        .unwrap_or_else(|| ".bin".to_string());
+    let in_tmp = tempfile::Builder::new()
+        .suffix(&suffix)
+        .tempfile()
+        .context("creating temp input")?;
+    std::fs::write(in_tmp.path(), input).context("writing temp input")?;
+    run_ffmpeg_to_wav(&in_tmp.path().to_string_lossy()).await
 }
 
 #[derive(Debug, Deserialize)]
@@ -239,7 +274,6 @@ pub async fn create_voice_upload_handler(
     let mut voice_name: Option<String> = None;
     let mut file_bytes = BytesMut::new();
     let mut filename = "voice.wav".to_string();
-    let mut mime = "application/octet-stream".to_string();
 
     while let Some(Ok(mut part)) = payload.next().await {
         // Capture disposition fields up front so the immutable borrow ends
@@ -254,9 +288,6 @@ pub async fn create_voice_upload_handler(
 
         if let Some(fname) = fname_opt {
             filename = fname;
-            if let Some(ct) = part.content_type() {
-                mime = ct.to_string();
-            }
             while let Some(Ok(data)) = part.next().await {
                 if file_bytes.len() + data.len() > MAX_VOICE_UPLOAD_BYTES {
                     return HttpResponse::PayloadTooLarge()
@@ -282,12 +313,21 @@ pub async fn create_voice_upload_handler(
     if file_bytes.is_empty() {
         return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" }));
     }
-    if !mime.starts_with("audio") {
-        mime = guess_audio_mime(Path::new(&filename));
-    }
+
+    // Normalize to WAV so any device format (e.g. .aac / .opus, which Chatterbox
+    // rejects by extension) is accepted.
+    let src_ext = Path::new(&filename).extension().and_then(|e| e.to_str());
+    let wav = match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext).await {
+        Ok(w) => w,
+        Err(e) => {
+            log::error!("voice upload transcode failed: {:?}", e);
+            return HttpResponse::BadRequest()
+                .json(json!({ "error": "couldn't decode that audio file" }));
+        }
+    };
 
     match client
-        .create_voice(&name, file_bytes.to_vec(), &filename, &mime)
+        .create_voice(&name, wav, "reference.wav", "audio/wav")
         .await
     {
         Ok(v) => HttpResponse::Ok().json(v),
@@ -308,8 +348,8 @@ pub struct CreateVoiceFromLibraryRequest {
 }
 
 /// POST /tts/voices/from-library — register a cloned voice from a file already
-/// in a library. Audio files are forwarded as-is; video files have up to 30s
-/// of their audio track extracted (mono, 24 kHz) via ffmpeg.
+/// in a library. Audio and video alike are ffmpeg-normalized to a mono 24 kHz
+/// WAV reference clip (length capped by LLAMA_SWAP_TTS_REF_SECONDS).
 #[post("/tts/voices/from-library")]
 pub async fn create_voice_from_library_handler(
     _claims: Claims,
@@ -346,16 +386,17 @@ pub async fn create_voice_from_library_handler(
             .json(json!({ "error": "file is not an audio or video file" }));
     }
 
-    let (bytes, filename, mime) = match prepare_reference_audio(&abs).await {
-        Ok(t) => t,
+    let wav = match prepare_reference_audio(&abs).await {
+        Ok(b) => b,
         Err(e) => {
             log::error!("voice reference prep failed for {:?}: {:?}", abs, e);
-            return HttpResponse::BadRequest().json(json!({ "error": format!("{e}") }));
+            return HttpResponse::BadRequest()
+                .json(json!({ "error": "couldn't decode that file's audio" }));
         }
     };
 
     match client
-        .create_voice(&voice_name, bytes, &filename, &mime)
+        .create_voice(&voice_name, wav, "reference.wav", "audio/wav")
         .await
     {
         Ok(v) => HttpResponse::Ok().json(v),
@@ -366,44 +407,11 @@ pub async fn create_voice_from_library_handler(
     }
 }
 
-/// Read a library file as reference audio. Audio is returned verbatim; video
-/// has up to 30s of audio extracted to mono 24 kHz WAV via ffmpeg.
-async fn prepare_reference_audio(abs: &Path) -> anyhow::Result<(Vec<u8>, String, String)> {
-    if is_video_file(abs) {
-        let tmp = tempfile::Builder::new()
-            .suffix(".wav")
-            .tempfile()
-            .context("creating temp wav")?;
-        let out = tmp.path().to_path_buf();
-        let abs_s = abs.to_string_lossy().to_string();
-        let out_s = out.to_string_lossy().to_string();
-
-        let output = tokio::process::Command::new("ffmpeg")
-            .args([
-                "-y", "-i", &abs_s, "-vn", "-ac", "1", "-ar", "24000", "-t", "30", "-f", "wav",
-                &out_s,
-            ])
-            .output()
-            .await
-            .context("spawning ffmpeg")?;
-
-        if !output.status.success() {
-            anyhow::bail!(
-                "ffmpeg audio extraction failed: {}",
-                String::from_utf8_lossy(&output.stderr)
-            );
-        }
-        let bytes = std::fs::read(&out).context("reading extracted audio")?;
-        Ok((bytes, "reference.wav".to_string(), "audio/wav".to_string()))
-    } else {
-        let bytes = std::fs::read(abs).context("reading audio file")?;
-        let filename = abs
-            .file_name()
-            .and_then(|f| f.to_str())
-            .unwrap_or("reference")
-            .to_string();
-        Ok((bytes, filename, guess_audio_mime(abs)))
-    }
+/// Read a library file (audio or video) as a Chatterbox-ready reference: ffmpeg
+/// decodes/extracts its audio to mono 24 kHz WAV. Reading straight from the
+/// library path avoids slurping a (possibly large) video into memory.
+async fn prepare_reference_audio(abs: &Path) -> anyhow::Result<Vec<u8>> {
+    run_ffmpeg_to_wav(&abs.to_string_lossy()).await
 }
 
 #[cfg(test)]
@@ -447,18 +455,6 @@ mod tests {
         assert_eq!(sanitize_voice_name(&long).unwrap().len(), 64);
     }
 
-    #[test]
-    fn guess_audio_mime_maps_known_extensions() {
-        assert_eq!(guess_audio_mime(Path::new("clip.wav")), "audio/wav");
-        assert_eq!(guess_audio_mime(Path::new("clip.MP3")), "audio/mpeg");
-        assert_eq!(guess_audio_mime(Path::new("clip.m4a")), "audio/mp4");
-        assert_eq!(guess_audio_mime(Path::new("clip.flac")), "audio/flac");
-        assert_eq!(
-            guess_audio_mime(Path::new("clip.xyz")),
-            "application/octet-stream"
-        );
-    }
-
     #[test]
     fn clean_for_tts_strips_markdown() {
         assert_eq!(