From 69268d03fe18f6b222ba75906798b5cfe258bfa0 Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Tue, 2 Jun 2026 22:04:42 -0400
Subject: [PATCH 01/10] Add TTS endpoints backed by Chatterbox via llama-swap

LlamaCppClient gains text_to_speech (OpenAI /audio/speech), list_voices and
create_voice (voice library at the swap-root /upstream/<model>/voices
passthrough), plus a tts_model slot configured via LLAMA_SWAP_TTS_MODEL
(default "chatterbox").

New Claims-gated routes:
- POST /tts/speech        -> { audio_base64, format } for data: URI playback
- GET  /tts/voices        -> voice library passthrough
- POST /tts/voices/upload -> clone a voice from an uploaded clip (multipart)
- POST /tts/voices/from-library -> clone from a library file (ffmpeg-extracts
  audio from video; audio forwarded as-is)

Security: voice_name sanitized to [A-Za-z0-9_-] (it becomes an upstream
filename), 25 MB cap on uploaded clips, library refs restricted to files with
an audio/video extension, path confined via is_valid_full_path. Adds
is_audio_file + unit tests for the sanitizer, mime guesser, and swap-root
derivation.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/ai/llamacpp.rs | 136 ++++++++++++++++
 src/ai/mod.rs      |   5 +
 src/ai/tts.rs      | 393 +++++++++++++++++++++++++++++++++++++++++++++
 src/file_types.rs  |  17 ++
 src/main.rs        |   4 +
 src/state.rs       |   3 +
 6 files changed, 558 insertions(+)
 create mode 100644 src/ai/tts.rs
diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs
index e2ba00d..afd7f1b 100644
--- a/src/ai/llamacpp.rs
+++ b/src/ai/llamacpp.rs
@@ -36,6 +36,7 @@ const DEFAULT_BASE_URL: &str = "http://localhost:9292/v1";
 const DEFAULT_PRIMARY_MODEL: &str = "chat";
 const DEFAULT_VISION_MODEL: &str = "vision";
 const DEFAULT_EMBEDDING_MODEL: &str = "embed";
+const DEFAULT_TTS_MODEL: &str = "chatterbox";
 const DEFAULT_REQUEST_TIMEOUT_SECS: u64 = 180;
 
 /// OpenAI-compatible client targeting a llama-swap proxy in front of one or
@@ -54,6 +55,10 @@ pub struct LlamaCppClient {
     /// to `primary_model` so describe_image works out of the box; override
     /// via `LLAMA_SWAP_VISION_MODEL` for a dedicated vision slot.
     pub vision_model: String,
+    /// TTS model slot id (e.g. `"chatterbox"`). Routes `text_to_speech` and
+    /// is the `/upstream/<id>/voices` path segment for the voice library.
+    /// Override via `LLAMA_SWAP_TTS_MODEL`.
+    pub tts_model: String,
     num_ctx: Option<i32>,
     temperature: Option<f32>,
     top_p: Option<f32>,
@@ -78,6 +83,7 @@ impl LlamaCppClient {
             primary_model: pm.clone(),
             embedding_model: DEFAULT_EMBEDDING_MODEL.to_string(),
             vision_model: pm,
+            tts_model: DEFAULT_TTS_MODEL.to_string(),
             num_ctx: None,
             temperature: None,
             top_p: None,
@@ -111,6 +117,116 @@ impl LlamaCppClient {
         self.min_p = min_p;
     }
 
+    pub fn set_tts_model(&mut self, model: String) {
+        self.tts_model = model;
+    }
+
+    // --- TTS (Chatterbox behind llama-swap) ---------------------------------
+    //
+    // Speech synthesis uses the OpenAI-compatible `{base_url}/audio/speech`
+    // endpoint (llama-swap routes by the `model` field). The voice *library*
+    // (list / create cloned voices) is NOT an OpenAI endpoint — it lives on the
+    // upstream server directly, reached via llama-swap's passthrough at
+    // `{swap_root}/upstream/<tts_model>/voices`.
+
+    /// Root of the llama-swap proxy: `base_url` with a trailing `/v1` removed.
+    /// The `/upstream/...` passthrough lives here, not under `/v1`.
+    fn swap_root(&self) -> &str {
+        let b = self.base_url.trim_end_matches('/');
+        b.strip_suffix("/v1").unwrap_or(b)
+    }
+
+    /// Synthesize speech for `input` in an optional named `voice`, returning
+    /// the raw audio bytes (format per `response_format`, e.g. `"mp3"`/`"wav"`).
+    pub async fn text_to_speech(
+        &self,
+        input: &str,
+        voice: Option<&str>,
+        response_format: &str,
+    ) -> Result<Vec<u8>> {
+        let url = format!("{}/audio/speech", self.base_url);
+        let mut body = json!({
+            "model": self.tts_model,
+            "input": input,
+            "response_format": response_format,
+        });
+        if let Some(v) = voice {
+            body["voice"] = Value::String(v.to_string());
+        }
+
+        let resp = self
+            .client
+            .post(&url)
+            .json(&body)
+            .send()
+            .await
+            .with_context(|| format!("POST {} failed", url))?;
+
+        if !resp.status().is_success() {
+            let status = resp.status();
+            let text = resp.text().await.unwrap_or_default();
+            bail!("llama-swap TTS request failed: {} — {}", status, text);
+        }
+
+        Ok(resp
+            .bytes()
+            .await
+            .context("reading TTS audio bytes")?
+            .to_vec())
+    }
+
+    /// List voices in the Chatterbox voice library (raw JSON passthrough).
+    pub async fn list_voices(&self) -> Result<Value> {
+        let url = format!("{}/upstream/{}/voices", self.swap_root(), self.tts_model);
+        let resp = self
+            .client
+            .get(&url)
+            .send()
+            .await
+            .with_context(|| format!("GET {} failed", url))?;
+
+        if !resp.status().is_success() {
+            let status = resp.status();
+            let text = resp.text().await.unwrap_or_default();
+            bail!("llama-swap list_voices failed: {} — {}", status, text);
+        }
+        resp.json().await.context("parsing voices response")
+    }
+
+    /// Register a cloned voice from raw audio bytes (multipart `voice_name` +
+    /// `voice_file`). Returns the upstream JSON response.
+    pub async fn create_voice(
+        &self,
+        voice_name: &str,
+        audio_bytes: Vec<u8>,
+        filename: &str,
+        mime: &str,
+    ) -> Result<Value> {
+        let url = format!("{}/upstream/{}/voices", self.swap_root(), self.tts_model);
+        let part = reqwest::multipart::Part::bytes(audio_bytes)
+            .file_name(filename.to_string())
+            .mime_str(mime)
+            .context("invalid audio mime type")?;
+        let form = reqwest::multipart::Form::new()
+            .text("voice_name", voice_name.to_string())
+            .part("voice_file", part);
+
+        let resp = self
+            .client
+            .post(&url)
+            .multipart(form)
+            .send()
+            .await
+            .with_context(|| format!("POST {} (multipart) failed", url))?;
+
+        if !resp.status().is_success() {
+            let status = resp.status();
+            let text = resp.text().await.unwrap_or_default();
+            bail!("llama-swap create_voice failed: {} — {}", status, text);
+        }
+        resp.json().await.context("parsing create_voice response")
+    }
+
     /// Translate canonical messages to the OpenAI-compatible wire shape.
     /// Behaviorally identical to `OpenRouterClient::messages_to_openai` —
     /// stringify tool-call arguments, rewrite images into content-parts, attach
@@ -1140,4 +1256,24 @@ mod tests {
         let wire = LlamaCppClient::messages_to_openai(&[msg]);
         assert_eq!(wire[0]["content"], "");
     }
+
+    #[test]
+    fn swap_root_strips_v1_suffix() {
+        let c = LlamaCppClient::new(Some("http://localhost:9292/v1".to_string()), None);
+        assert_eq!(c.swap_root(), "http://localhost:9292");
+
+        // Tolerates a trailing slash on the base URL.
+        let c2 = LlamaCppClient::new(Some("http://localhost:9292/v1/".to_string()), None);
+        assert_eq!(c2.swap_root(), "http://localhost:9292");
+
+        // No /v1 suffix → returned unchanged.
+        let c3 = LlamaCppClient::new(Some("http://host:1234".to_string()), None);
+        assert_eq!(c3.swap_root(), "http://host:1234");
+    }
+
+    #[test]
+    fn tts_model_defaults_to_chatterbox() {
+        let c = LlamaCppClient::new(None, None);
+        assert_eq!(c.tts_model, "chatterbox");
+    }
 }
diff --git a/src/ai/mod.rs b/src/ai/mod.rs
index e9bec09..e61eace 100644
--- a/src/ai/mod.rs
+++ b/src/ai/mod.rs
@@ -11,6 +11,7 @@ pub mod llm_client;
 pub mod ollama;
 pub mod openrouter;
 pub mod sms_client;
+pub mod tts;
 pub mod turn_registry;
 
 // strip_summary_boilerplate is used by binaries (test_daily_summary), not the library
@@ -34,6 +35,10 @@ pub use llm_client::{
 };
 pub use ollama::{EMBEDDING_MODEL, OllamaClient};
 pub use sms_client::{SmsApiClient, SmsMessage};
+pub use tts::{
+    create_voice_from_library_handler, create_voice_upload_handler, list_voices_handler,
+    tts_speech_handler,
+};
 
 /// Display name used for the user in message transcripts and first-person
 /// prompt text. Reads the `USER_NAME` env var; defaults to `"Me"`. Models
diff --git a/src/ai/tts.rs b/src/ai/tts.rs
new file mode 100644
index 0000000..b2bd675
--- /dev/null
+++ b/src/ai/tts.rs
@@ -0,0 +1,393 @@
+// TTS endpoints: proxy text-to-speech + voice-library management to the
+// Chatterbox server that sits behind llama-swap (via LlamaCppClient). Speech
+// synthesis returns audio as base64-in-JSON so the mobile app can play it as a
+// `data:` URI without a binary-fetch path. Voice cloning registers a named
+// voice from either an uploaded clip (device) or an existing library file
+// (audio read directly; video has its audio track extracted via ffmpeg).
+
+use actix_multipart::Multipart;
+use actix_web::{HttpResponse, Responder, get, post, web};
+use anyhow::Context;
+use base64::Engine;
+use bytes::{BufMut, BytesMut};
+use futures::StreamExt;
+use serde::{Deserialize, Serialize};
+use serde_json::json;
+use std::path::Path;
+
+use crate::data::Claims;
+use crate::file_types::{is_audio_file, is_video_file};
+use crate::files::is_valid_full_path;
+use crate::libraries;
+use crate::state::AppState;
+
+/// Hard cap on an uploaded voice-reference clip. Chatterbox itself caps the
+/// payload (~60s clip); this is a defensive ceiling so a hostile/oversized
+/// upload can't balloon ImageApi memory before we ever forward it.
+const MAX_VOICE_UPLOAD_BYTES: usize = 25 * 1024 * 1024; // 25 MB
+
+/// Sanitize a user-supplied voice name. The name is forwarded to Chatterbox
+/// where it becomes a filename in the voice-library directory, so we restrict
+/// it to a safe charset (alphanumerics, dash, underscore) — no path
+/// separators, dots, or whitespace — and bound its length. Returns `None`
+/// when nothing usable remains.
+fn sanitize_voice_name(raw: &str) -> Option<String> {
+    let cleaned: String = raw
+        .trim()
+        .chars()
+        .map(|c| {
+            if c.is_ascii_alphanumeric() || c == '-' || c == '_' {
+                c
+            } else {
+                '-'
+            }
+        })
+        .collect();
+    let cleaned = cleaned.trim_matches('-').to_string();
+    if cleaned.is_empty() {
+        return None;
+    }
+    Some(cleaned.chars().take(64).collect())
+}
+
+/// Optional default voice for synthesis when the request doesn't name one.
+/// Set `LLAMA_SWAP_TTS_VOICE=m` to read insights in a cloned voice by default.
+fn default_voice() -> Option<String> {
+    std::env::var("LLAMA_SWAP_TTS_VOICE")
+        .ok()
+        .map(|s| s.trim().to_string())
+        .filter(|s| !s.is_empty())
+}
+
+fn guess_audio_mime(path: &Path) -> String {
+    match path
+        .extension()
+        .and_then(|e| e.to_str())
+        .map(|e| e.to_lowercase())
+        .as_deref()
+    {
+        Some("wav") => "audio/wav",
+        Some("mp3") => "audio/mpeg",
+        Some("m4a") | Some("mp4") | Some("aac") => "audio/mp4",
+        Some("flac") => "audio/flac",
+        Some("ogg") | Some("oga") | Some("opus") => "audio/ogg",
+        _ => "application/octet-stream",
+    }
+    .to_string()
+}
+
+#[derive(Debug, Deserialize)]
+pub struct TtsSpeechRequest {
+    pub text: String,
+    #[serde(default)]
+    pub voice: Option<String>,
+    /// Audio container, e.g. `"mp3"` (default) or `"wav"`.
+    #[serde(default)]
+    pub format: Option<String>,
+}
+
+#[derive(Debug, Serialize)]
+pub struct TtsSpeechResponse {
+    pub audio_base64: String,
+    pub format: String,
+}
+
+/// POST /tts/speech — synthesize `text` (optionally in a named `voice`) and
+/// return base64-encoded audio for `data:` URI playback on the client.
+#[post("/tts/speech")]
+pub async fn tts_speech_handler(
+    _claims: Claims,
+    req: web::Json<TtsSpeechRequest>,
+    app_state: web::Data<AppState>,
+) -> impl Responder {
+    let text = req.text.trim();
+    if text.is_empty() {
+        return HttpResponse::BadRequest().json(json!({ "error": "text is required" }));
+    }
+    let Some(client) = app_state.llamacpp.as_ref() else {
+        return HttpResponse::ServiceUnavailable()
+            .json(json!({ "error": "TTS backend not configured (set LLAMA_SWAP_URL)" }));
+    };
+
+    let format = req
+        .format
+        .as_deref()
+        .filter(|s| !s.is_empty())
+        .unwrap_or("mp3");
+    let dv = default_voice();
+    let voice = req
+        .voice
+        .as_deref()
+        .filter(|s| !s.is_empty())
+        .or(dv.as_deref());
+
+    match client.text_to_speech(text, voice, format).await {
+        Ok(bytes) => {
+            let audio_base64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
+            HttpResponse::Ok().json(TtsSpeechResponse {
+                audio_base64,
+                format: format.to_string(),
+            })
+        }
+        Err(e) => {
+            log::error!("TTS synth failed: {:?}", e);
+            HttpResponse::BadGateway().json(json!({ "error": format!("TTS failed: {e}") }))
+        }
+    }
+}
+
+/// GET /tts/voices — list the Chatterbox voice library (raw passthrough).
+#[get("/tts/voices")]
+pub async fn list_voices_handler(
+    _claims: Claims,
+    app_state: web::Data<AppState>,
+) -> impl Responder {
+    let Some(client) = app_state.llamacpp.as_ref() else {
+        return HttpResponse::ServiceUnavailable()
+            .json(json!({ "error": "TTS backend not configured" }));
+    };
+    match client.list_voices().await {
+        Ok(v) => HttpResponse::Ok().json(v),
+        Err(e) => {
+            log::error!("list_voices failed: {:?}", e);
+            HttpResponse::BadGateway().json(json!({ "error": format!("{e}") }))
+        }
+    }
+}
+
+/// POST /tts/voices/upload — register a cloned voice from an uploaded audio
+/// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`).
+#[post("/tts/voices/upload")]
+pub async fn create_voice_upload_handler(
+    _claims: Claims,
+    mut payload: Multipart,
+    app_state: web::Data<AppState>,
+) -> impl Responder {
+    let Some(client) = app_state.llamacpp.as_ref() else {
+        return HttpResponse::ServiceUnavailable()
+            .json(json!({ "error": "TTS backend not configured" }));
+    };
+
+    let mut voice_name: Option<String> = None;
+    let mut file_bytes = BytesMut::new();
+    let mut filename = "voice.wav".to_string();
+    let mut mime = "application/octet-stream".to_string();
+
+    while let Some(Ok(mut part)) = payload.next().await {
+        // Capture disposition fields up front so the immutable borrow ends
+        // before we mutably stream the part body (mirrors handlers/image.rs).
+        let (fname_opt, name_opt) = {
+            let cd = part.content_disposition();
+            (
+                cd.and_then(|c| c.get_filename()).map(|s| s.to_string()),
+                cd.and_then(|c| c.get_name()).map(|s| s.to_string()),
+            )
+        };
+
+        if let Some(fname) = fname_opt {
+            filename = fname;
+            if let Some(ct) = part.content_type() {
+                mime = ct.to_string();
+            }
+            while let Some(Ok(data)) = part.next().await {
+                if file_bytes.len() + data.len() > MAX_VOICE_UPLOAD_BYTES {
+                    return HttpResponse::PayloadTooLarge()
+                        .json(json!({ "error": "voice clip exceeds 25 MB" }));
+                }
+                file_bytes.put(data);
+            }
+        } else if name_opt.as_deref() == Some("voice_name") {
+            let mut buf = BytesMut::new();
+            while let Some(Ok(data)) = part.next().await {
+                buf.put(data);
+            }
+            voice_name = Some(String::from_utf8_lossy(&buf).trim().to_string());
+        } else {
+            while let Some(Ok(_)) = part.next().await {}
+        }
+    }
+
+    let Some(name) = voice_name.as_deref().and_then(sanitize_voice_name) else {
+        return HttpResponse::BadRequest()
+            .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
+    };
+    if file_bytes.is_empty() {
+        return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" }));
+    }
+    if !mime.starts_with("audio") {
+        mime = guess_audio_mime(Path::new(&filename));
+    }
+
+    match client
+        .create_voice(&name, file_bytes.to_vec(), &filename, &mime)
+        .await
+    {
+        Ok(v) => HttpResponse::Ok().json(v),
+        Err(e) => {
+            log::error!("create_voice (upload) failed: {:?}", e);
+            HttpResponse::BadGateway().json(json!({ "error": format!("{e}") }))
+        }
+    }
+}
+
+#[derive(Debug, Deserialize)]
+pub struct CreateVoiceFromLibraryRequest {
+    pub voice_name: String,
+    /// Library-relative path to an audio or video file.
+    pub path: String,
+    #[serde(default)]
+    pub library: Option<String>,
+}
+
+/// POST /tts/voices/from-library — register a cloned voice from a file already
+/// in a library. Audio files are forwarded as-is; video files have up to 30s
+/// of their audio track extracted (mono, 24 kHz) via ffmpeg.
+#[post("/tts/voices/from-library")]
+pub async fn create_voice_from_library_handler(
+    _claims: Claims,
+    req: web::Json<CreateVoiceFromLibraryRequest>,
+    app_state: web::Data<AppState>,
+) -> impl Responder {
+    let Some(client) = app_state.llamacpp.as_ref() else {
+        return HttpResponse::ServiceUnavailable()
+            .json(json!({ "error": "TTS backend not configured" }));
+    };
+    let Some(voice_name) = sanitize_voice_name(&req.voice_name) else {
+        return HttpResponse::BadRequest()
+            .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
+    };
+
+    let library = match libraries::resolve_library_param(&app_state, req.library.as_deref()) {
+        Ok(Some(l)) => l,
+        Ok(None) => app_state.primary_library(),
+        Err(msg) => return HttpResponse::BadRequest().json(json!({ "error": msg })),
+    };
+
+    // is_valid_full_path confines the path to the library root (no traversal).
+    let abs = match is_valid_full_path(&library.root_path, &req.path, false) {
+        Some(p) if p.exists() => p,
+        _ => {
+            return HttpResponse::NotFound().json(json!({ "error": "file not found in library" }));
+        }
+    };
+
+    // Only real audio/video sources are valid voice references — refuse to
+    // slurp arbitrary library files into memory / ffmpeg.
+    if !is_audio_file(&abs) && !is_video_file(&abs) {
+        return HttpResponse::BadRequest()
+            .json(json!({ "error": "file is not an audio or video file" }));
+    }
+
+    let (bytes, filename, mime) = match prepare_reference_audio(&abs).await {
+        Ok(t) => t,
+        Err(e) => {
+            log::error!("voice reference prep failed for {:?}: {:?}", abs, e);
+            return HttpResponse::BadRequest().json(json!({ "error": format!("{e}") }));
+        }
+    };
+
+    match client
+        .create_voice(&voice_name, bytes, &filename, &mime)
+        .await
+    {
+        Ok(v) => HttpResponse::Ok().json(v),
+        Err(e) => {
+            log::error!("create_voice (from-library) failed: {:?}", e);
+            HttpResponse::BadGateway().json(json!({ "error": format!("{e}") }))
+        }
+    }
+}
+
+/// Read a library file as reference audio. Audio is returned verbatim; video
+/// has up to 30s of audio extracted to mono 24 kHz WAV via ffmpeg.
+async fn prepare_reference_audio(abs: &Path) -> anyhow::Result<(Vec<u8>, String, String)> {
+    if is_video_file(abs) {
+        let tmp = tempfile::Builder::new()
+            .suffix(".wav")
+            .tempfile()
+            .context("creating temp wav")?;
+        let out = tmp.path().to_path_buf();
+        let abs_s = abs.to_string_lossy().to_string();
+        let out_s = out.to_string_lossy().to_string();
+
+        let output = tokio::process::Command::new("ffmpeg")
+            .args([
+                "-y", "-i", &abs_s, "-vn", "-ac", "1", "-ar", "24000", "-t", "30", "-f", "wav",
+                &out_s,
+            ])
+            .output()
+            .await
+            .context("spawning ffmpeg")?;
+
+        if !output.status.success() {
+            anyhow::bail!(
+                "ffmpeg audio extraction failed: {}",
+                String::from_utf8_lossy(&output.stderr)
+            );
+        }
+        let bytes = std::fs::read(&out).context("reading extracted audio")?;
+        Ok((bytes, "reference.wav".to_string(), "audio/wav".to_string()))
+    } else {
+        let bytes = std::fs::read(abs).context("reading audio file")?;
+        let filename = abs
+            .file_name()
+            .and_then(|f| f.to_str())
+            .unwrap_or("reference")
+            .to_string();
+        Ok((bytes, filename, guess_audio_mime(abs)))
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn sanitize_voice_name_keeps_safe_chars() {
+        assert_eq!(sanitize_voice_name("m").as_deref(), Some("m"));
+        assert_eq!(
+            sanitize_voice_name("  Cameron ").as_deref(),
+            Some("Cameron")
+        );
+        assert_eq!(
+            sanitize_voice_name("voice_01-a").as_deref(),
+            Some("voice_01-a")
+        );
+    }
+
+    #[test]
+    fn sanitize_voice_name_strips_unsafe_chars() {
+        // Path separators / dots / spaces become '-' and are trimmed at edges.
+        assert_eq!(sanitize_voice_name("a b.c").as_deref(), Some("a-b-c"));
+        assert_eq!(
+            sanitize_voice_name("../etc/passwd").as_deref(),
+            Some("etc-passwd")
+        );
+    }
+
+    #[test]
+    fn sanitize_voice_name_rejects_empty_or_all_unsafe() {
+        assert_eq!(sanitize_voice_name(""), None);
+        assert_eq!(sanitize_voice_name("   "), None);
+        assert_eq!(sanitize_voice_name("../../"), None);
+        assert_eq!(sanitize_voice_name("...."), None);
+    }
+
+    #[test]
+    fn sanitize_voice_name_bounds_length() {
+        let long = "a".repeat(200);
+        assert_eq!(sanitize_voice_name(&long).unwrap().len(), 64);
+    }
+
+    #[test]
+    fn guess_audio_mime_maps_known_extensions() {
+        assert_eq!(guess_audio_mime(Path::new("clip.wav")), "audio/wav");
+        assert_eq!(guess_audio_mime(Path::new("clip.MP3")), "audio/mpeg");
+        assert_eq!(guess_audio_mime(Path::new("clip.m4a")), "audio/mp4");
+        assert_eq!(guess_audio_mime(Path::new("clip.flac")), "audio/flac");
+        assert_eq!(
+            guess_audio_mime(Path::new("clip.xyz")),
+            "application/octet-stream"
+        );
+    }
+}
diff --git a/src/file_types.rs b/src/file_types.rs
index 33f71dd..b834cba 100644
--- a/src/file_types.rs
+++ b/src/file_types.rs
@@ -22,6 +22,10 @@ pub fn needs_ffmpeg_thumbnail(path: &Path) -> bool {
 /// Supported video file extensions
 pub const VIDEO_EXTENSIONS: &[&str] = &["mp4", "mov", "avi", "mkv"];
 
+/// Audio file extensions accepted as voice-clone references (TTS). Meant to
+/// mirror the formats Chatterbox can decode (wav/mp3/flac/m4a/aac/ogg/oga/opus).
+pub const AUDIO_EXTENSIONS: &[&str] = &["wav", "mp3", "flac", "m4a", "aac", "ogg", "oga", "opus"];
+
 /// Filenames that are filesystem metadata, not real media — exact
 /// basename match. Extend if a new platform sidecar appears (Windows
 /// Thumbs.db / desktop.ini live here too if those libraries land).
@@ -75,6 +79,19 @@ pub fn is_video_file(path: &Path) -> bool {
     }
 }
 
+/// Check if a path has an audio extension (voice-clone references)
+pub fn is_audio_file(path: &Path) -> bool {
+    if is_filesystem_metadata(path) {
+        return false;
+    }
+    if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
+        let ext_lower = ext.to_lowercase();
+        AUDIO_EXTENSIONS.contains(&ext_lower.as_str())
+    } else {
+        false
+    }
+}
+
 /// Check if a path has a supported media extension (image or video)
 pub fn is_media_file(path: &Path) -> bool {
     is_image_file(path) || is_video_file(path)
diff --git a/src/main.rs b/src/main.rs
index 4099a5d..8b06228 100644
--- a/src/main.rs
+++ b/src/main.rs
@@ -362,6 +362,10 @@ fn main() -> std::io::Result<()> {
                 .service(ai::cancel_turn_handler)
                 .service(ai::rate_insight_handler)
                 .service(ai::export_training_data_handler)
+                .service(ai::tts_speech_handler)
+                .service(ai::list_voices_handler)
+                .service(ai::create_voice_upload_handler)
+                .service(ai::create_voice_from_library_handler)
                 .service(libraries::list_libraries)
                 .service(libraries::patch_library)
                 .add_feature(add_tag_services::<_, SqliteTagDao>)
diff --git a/src/state.rs b/src/state.rs
index f9adda7..ef071a8 100644
--- a/src/state.rs
+++ b/src/state.rs
@@ -391,6 +391,9 @@ fn build_llamacpp_from_env() -> Option<Arc<LlamaCppClient>> {
     if let Ok(model) = env::var("LLAMA_SWAP_VISION_MODEL") {
         client.set_vision_model(model);
     }
+    if let Ok(model) = env::var("LLAMA_SWAP_TTS_MODEL") {
+        client.set_tts_model(model);
+    }
     Some(Arc::new(client))
 }
 

From 51be5df2148f53014ef5f0161c5f305dc17e0a92 Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Tue, 2 Jun 2026 22:15:05 -0400
Subject: [PATCH 02/10] Clean insight text for TTS and pass through Chatterbox
 tuning knobs

/tts/speech now normalizes input before synthesis: unwraps markdown
links/images to visible text, drops heading/list/blockquote/emphasis
markers and URLs, strips emoji (which non-turbo Chatterbox mispronounces
or skips), and collapses whitespace. Centralized in clean_for_tts so the
app, WebUI, and curl all get clean audio. Bracketed tags are deliberately
preserved for a future Turbo (paralinguistic) switch.

Adds optional exaggeration / cfg_weight / temperature to the request,
clamped to Chatterbox's documented ranges and forwarded on the speech
body. Unit tests cover markdown/emoji/URL stripping and tag preservation.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/ai/llamacpp.rs |  16 +++++++
 src/ai/tts.rs      | 104 ++++++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 118 insertions(+), 2 deletions(-)

diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs
index afd7f1b..2946688 100644
--- a/src/ai/llamacpp.rs
+++ b/src/ai/llamacpp.rs
@@ -138,11 +138,18 @@ impl LlamaCppClient {
 
     /// Synthesize speech for `input` in an optional named `voice`, returning
     /// the raw audio bytes (format per `response_format`, e.g. `"mp3"`/`"wav"`).
+    ///
+    /// Chatterbox generation knobs are forwarded when set (caller is expected
+    /// to have range-clamped them): `exaggeration` (0.25–2.0, emotion),
+    /// `cfg_weight` (0.0–1.0, pace), `temperature` (0.05–5.0, randomness).
     pub async fn text_to_speech(
         &self,
         input: &str,
         voice: Option<&str>,
         response_format: &str,
+        exaggeration: Option<f32>,
+        cfg_weight: Option<f32>,
+        temperature: Option<f32>,
     ) -> Result<Vec<u8>> {
         let url = format!("{}/audio/speech", self.base_url);
         let mut body = json!({
@@ -153,6 +160,15 @@ impl LlamaCppClient {
         if let Some(v) = voice {
             body["voice"] = Value::String(v.to_string());
         }
+        if let Some(x) = exaggeration {
+            body["exaggeration"] = json!(x);
+        }
+        if let Some(x) = cfg_weight {
+            body["cfg_weight"] = json!(x);
+        }
+        if let Some(x) = temperature {
+            body["temperature"] = json!(x);
+        }
 
         let resp = self
             .client
diff --git a/src/ai/tts.rs b/src/ai/tts.rs
index b2bd675..8078132 100644
--- a/src/ai/tts.rs
+++ b/src/ai/tts.rs
@@ -11,9 +11,11 @@ use anyhow::Context;
 use base64::Engine;
 use bytes::{BufMut, BytesMut};
 use futures::StreamExt;
+use regex::Regex;
 use serde::{Deserialize, Serialize};
 use serde_json::json;
 use std::path::Path;
+use std::sync::LazyLock;
 
 use crate::data::Claims;
 use crate::file_types::{is_audio_file, is_video_file};
@@ -59,6 +61,55 @@ fn default_voice() -> Option<String> {
         .filter(|s| !s.is_empty())
 }
 
+// Markdown / formatting strippers, compiled once. Insight text is markdown,
+// which TTS would otherwise read literally ("star star bold star star").
+static MD_IMAGE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\([^)]*\)").unwrap());
+static MD_LINK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\([^)]*\)").unwrap());
+static MD_HEADING: LazyLock<Regex> =
+    LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}#{1,6}\s*").unwrap());
+static MD_BLOCKQUOTE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}>\s?").unwrap());
+static MD_LIST: LazyLock<Regex> =
+    LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}([-*+]|\d+\.)\s+").unwrap());
+static MD_EMPHASIS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[*_`~]+").unwrap());
+static URL_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"https?://\S+").unwrap());
+static MULTISPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[ \t]{2,}").unwrap());
+static MULTINEWLINE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n{3,}").unwrap());
+
+/// True for emoji / pictographic symbols, which most TTS models either skip or
+/// mispronounce. Covers the main emoji blocks plus dingbats, misc-technical,
+/// variation selectors, and the ZWJ used to glue emoji sequences. We do NOT
+/// strip `[bracketed]` tags — non-turbo Chatterbox ignores them, and a future
+/// Turbo switch uses them as paralinguistic cues.
+fn is_emoji_like(c: char) -> bool {
+    let u = c as u32;
+    matches!(u,
+        0x1F000..=0x1FAFF   // emoji, pictographs, supplemental symbols, flags
+        | 0x2300..=0x23FF   // misc technical (⌚ ⏰ ⏳ …)
+        | 0x2600..=0x27BF   // misc symbols + dingbats
+        | 0x2B00..=0x2BFF   // misc symbols & arrows (⭐ ⬆ …)
+        | 0xFE00..=0xFE0F   // variation selectors
+        | 0x200D            // zero-width joiner
+    )
+}
+
+/// Normalize insight text for speech: unwrap markdown links/images to their
+/// visible text, drop heading/list/blockquote/emphasis markers and URLs, strip
+/// emoji, and collapse whitespace. Centralized here so every caller (app,
+/// WebUI, curl) gets clean audio.
+fn clean_for_tts(input: &str) -> String {
+    let s = MD_IMAGE.replace_all(input, "$1");
+    let s = MD_LINK.replace_all(&s, "$1");
+    let s = MD_HEADING.replace_all(&s, "");
+    let s = MD_BLOCKQUOTE.replace_all(&s, "");
+    let s = MD_LIST.replace_all(&s, "");
+    let s = MD_EMPHASIS.replace_all(&s, "");
+    let s = URL_RE.replace_all(&s, " ");
+    let s: String = s.chars().filter(|c| !is_emoji_like(*c)).collect();
+    let s = MULTISPACE.replace_all(&s, " ");
+    let s = MULTINEWLINE.replace_all(&s, "\n\n");
+    s.trim().to_string()
+}
+
 fn guess_audio_mime(path: &Path) -> String {
     match path
         .extension()
@@ -84,6 +135,15 @@ pub struct TtsSpeechRequest {
     /// Audio container, e.g. `"mp3"` (default) or `"wav"`.
     #[serde(default)]
     pub format: Option<String>,
+    /// Chatterbox knobs (clamped server-side). exaggeration 0.25–2.0 (emotion),
+    /// cfg_weight 0.0–1.0 (pace; ~0.3 for fast speakers, 0 to neutralize a
+    /// reference accent), temperature 0.05–5.0 (randomness).
+    #[serde(default)]
+    pub exaggeration: Option<f32>,
+    #[serde(default)]
+    pub cfg_weight: Option<f32>,
+    #[serde(default)]
+    pub temperature: Option<f32>,
 }
 
 #[derive(Debug, Serialize)]
@@ -100,7 +160,7 @@ pub async fn tts_speech_handler(
     req: web::Json<TtsSpeechRequest>,
     app_state: web::Data<AppState>,
 ) -> impl Responder {
-    let text = req.text.trim();
+    let text = clean_for_tts(&req.text);
     if text.is_empty() {
         return HttpResponse::BadRequest().json(json!({ "error": "text is required" }));
     }
@@ -121,7 +181,15 @@ pub async fn tts_speech_handler(
         .filter(|s| !s.is_empty())
         .or(dv.as_deref());
 
-    match client.text_to_speech(text, voice, format).await {
+    // Clamp generation knobs to Chatterbox's documented ranges before forwarding.
+    let exaggeration = req.exaggeration.map(|x| x.clamp(0.25, 2.0));
+    let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0));
+    let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0));
+
+    match client
+        .text_to_speech(&text, voice, format, exaggeration, cfg_weight, temperature)
+        .await
+    {
         Ok(bytes) => {
             let audio_base64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
             HttpResponse::Ok().json(TtsSpeechResponse {
@@ -390,4 +458,36 @@ mod tests {
             "application/octet-stream"
         );
     }
+
+    #[test]
+    fn clean_for_tts_strips_markdown() {
+        assert_eq!(
+            clean_for_tts("**Bold** and _italic_ and `code`"),
+            "Bold and italic and code"
+        ); // emphasis / inline-code markers removed, words kept
+        assert_eq!(clean_for_tts("# Title\n\nbody"), "Title\n\nbody"); // heading marker only
+        assert_eq!(
+            clean_for_tts("See [docs](http://x.com) now"),
+            "See docs now"
+        ); // link unwrapped to its visible text, target dropped
+        assert_eq!(clean_for_tts("- one\n- two"), "one\ntwo"); // bullets dropped, line breaks kept
+    }
+
+    #[test]
+    fn clean_for_tts_strips_emoji_and_urls() {
+        assert_eq!(clean_for_tts("Hello 😀 world 🎉"), "Hello world"); // no doubled/trailing spaces
+        assert_eq!(
+            clean_for_tts("visit https://example.com today"),
+            "visit today"
+        );
+        // ZWJ-glued emoji sequence is fully removed (each emoji and each U+200D joiner).
+        assert_eq!(clean_for_tts("family 👨‍👩‍👧 photo"), "family photo");
+    }
+
+    #[test]
+    fn clean_for_tts_preserves_bracket_tags() {
+        // Bracketed cue tags must pass through untouched: non-turbo Chatterbox
+        // ignores them, and a future Turbo uses them as paralinguistic cues.
+        assert_eq!(clean_for_tts("hello [laugh] there"), "hello [laugh] there");
+    }
 }

From 35c5ecb427f7118440a0095e8730b2aa82d0e8e8 Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Tue, 2 Jun 2026 22:34:34 -0400
Subject: [PATCH 03/10] Document TTS endpoints and env in README + .env.example

Adds the /tts/speech and /tts/voices* endpoints plus LLAMA_SWAP_TTS_MODEL /
LLAMA_SWAP_TTS_VOICE (TTS only needs LLAMA_SWAP_URL, not LLM_BACKEND=llamacpp).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .env.example |  8 ++++++++
 README.md    | 19 +++++++++++++++++++
 2 files changed, 27 insertions(+)

diff --git a/.env.example b/.env.example
index f7a1004..835bef5 100644
--- a/.env.example
+++ b/.env.example
@@ -80,6 +80,14 @@ AGENTIC_CHAT_MAX_ITERATIONS=6
 # LLAMA_SWAP_ALLOWED_MODELS=chat,vision,embed
 # LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180
 
+# ── Text-to-speech (optional, requires LLAMA_SWAP_URL) ───────────────────
+# TTS routes through the same llama-swap proxy (a Chatterbox model id), so it
+# only needs LLAMA_SWAP_URL — it does NOT require LLM_BACKEND=llamacpp.
+# Powers POST /tts/speech and the /tts/voices* endpoints (read-aloud insights
+# + voice cloning in the mobile app).
+# LLAMA_SWAP_TTS_MODEL=chatterbox        # TTS model id in config.yaml
+# LLAMA_SWAP_TTS_VOICE=m                 # default voice when a request omits one
+
 # ── AI Insights — sibling services (optional) ───────────────────────────
 # Apollo (places, face inference, CLIP encoders). Single-Apollo deploys
 # typically set only APOLLO_API_BASE_URL and let the face + CLIP
diff --git a/README.md b/README.md
index b6d764b..12c220f 100644
--- a/README.md
+++ b/README.md
@@ -147,6 +147,25 @@ so you can rewrite the saved summary from within chat.
 - `AGENTIC_CHAT_MAX_ITERATIONS` - Cap on tool-calling iterations per chat turn [default: `6`]
   - Per-request `max_iterations` (when sent by the client) is clamped to this cap
 
+#### Text-to-Speech (Optional)
+Reads insights aloud and manages cloned voices via a Chatterbox model served
+behind the same llama-swap proxy. Only requires `LLAMA_SWAP_URL` (the TTS client
+is built whenever that's set — independent of `LLM_BACKEND`). Endpoints:
+- `POST /tts/speech` — body `{ text, voice?, format?, exaggeration?, cfg_weight?,
+  temperature? }`; returns `{ audio_base64, format }`. Input is cleaned
+  server-side (markdown + emoji stripped) and the generation knobs are clamped
+  to Chatterbox's ranges.
+- `GET /tts/voices` — list the voice library.
+- `POST /tts/voices/upload` — multipart `voice_name` + `voice_file`; clone a
+  voice from an uploaded clip (≤25 MB).
+- `POST /tts/voices/from-library` — body `{ voice_name, path, library? }`; clone
+  from a library file (audio forwarded as-is; video has its audio extracted via
+  ffmpeg).
+
+Env:
+- `LLAMA_SWAP_TTS_MODEL` - TTS model id in llama-swap's `config.yaml` [default: `chatterbox`]
+- `LLAMA_SWAP_TTS_VOICE` - default voice used when a `/tts/speech` request omits `voice` (optional)
+
 #### Fallback Behavior
 - Primary server is tried first with 5-second connection timeout
 - On failure, automatically falls back to secondary server (if configured)

From 62d517dcdaba390ceb3db6a53aa86cd37569369a Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Tue, 2 Jun 2026 22:50:08 -0400
Subject: [PATCH 04/10] Normalize voice-clone reference audio to WAV via ffmpeg
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Chatterbox validates the reference clip by file extension and rejects formats
like .aac/.opus. Always transcode the reference (upload bytes and library
files alike) to mono 24 kHz WAV with ffmpeg before forwarding, so any source
format is accepted and the from-library audio/video paths are unified.

The reference length cap is now configurable via LLAMA_SWAP_TTS_REF_SECONDS
(default 30) — Chatterbox is zero-shot, so a clean ~10-20s clip is the sweet
spot. Drops the now-unused mime guesser.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .env.example  |   1 +
 README.md     |   4 ++
 src/ai/tts.rs | 152 ++++++++++++++++++++++++--------------------------
 3 files changed, 79 insertions(+), 78 deletions(-)

diff --git a/.env.example b/.env.example
index 835bef5..2b6cff0 100644
--- a/.env.example
+++ b/.env.example
@@ -87,6 +87,7 @@ AGENTIC_CHAT_MAX_ITERATIONS=6
 # + voice cloning in the mobile app).
 # LLAMA_SWAP_TTS_MODEL=chatterbox        # TTS model id in config.yaml
 # LLAMA_SWAP_TTS_VOICE=m                 # default voice when a request omits one
+# LLAMA_SWAP_TTS_REF_SECONDS=30          # max voice-clone reference clip length (s)
 
 # ── AI Insights — sibling services (optional) ───────────────────────────
 # Apollo (places, face inference, CLIP encoders). Single-Apollo deploys
diff --git a/README.md b/README.md
index 12c220f..0b678df 100644
--- a/README.md
+++ b/README.md
@@ -165,6 +165,10 @@ is built whenever that's set — independent of `LLM_BACKEND`). Endpoints:
 Env:
 - `LLAMA_SWAP_TTS_MODEL` - TTS model id in llama-swap's `config.yaml` [default: `chatterbox`]
 - `LLAMA_SWAP_TTS_VOICE` - default voice used when a `/tts/speech` request omits `voice` (optional)
+- `LLAMA_SWAP_TTS_REF_SECONDS` - max voice-clone reference clip length in seconds
+  [default: `30`]. Reference audio is ffmpeg-normalized to mono 24 kHz WAV (so any
+  source format works); Chatterbox is zero-shot, so a clean ~10–20s sample is the
+  sweet spot — more rarely helps.
 
 #### Fallback Behavior
 - Primary server is tried first with 5-second connection timeout
diff --git a/src/ai/tts.rs b/src/ai/tts.rs
index 8078132..59b4a80 100644
--- a/src/ai/tts.rs
+++ b/src/ai/tts.rs
@@ -110,21 +110,56 @@ fn clean_for_tts(input: &str) -> String {
     s.trim().to_string()
 }
 
-fn guess_audio_mime(path: &Path) -> String {
-    match path
-        .extension()
-        .and_then(|e| e.to_str())
-        .map(|e| e.to_lowercase())
-        .as_deref()
-    {
-        Some("wav") => "audio/wav",
-        Some("mp3") => "audio/mpeg",
-        Some("m4a") | Some("mp4") | Some("aac") => "audio/mp4",
-        Some("flac") => "audio/flac",
-        Some("ogg") | Some("oga") => "audio/ogg",
-        _ => "application/octet-stream",
+/// Decode an audio/video file to mono 24 kHz WAV via ffmpeg, returning the WAV
+/// bytes. Chatterbox validates the reference clip by file *extension* and
+/// rejects several formats (e.g. `.aac`, `.opus`), so we always normalize to
+/// WAV regardless of the source container. Capped at LLAMA_SWAP_TTS_REF_SECONDS
+/// (default 30s) — references only need a few seconds of clean speech.
+async fn run_ffmpeg_to_wav(input_path: &str) -> anyhow::Result<Vec<u8>> {
+    let out = tempfile::Builder::new()
+        .suffix(".wav")
+        .tempfile()
+        .context("creating temp wav")?;
+    let out_s = out.path().to_string_lossy().to_string();
+
+    // Cap the reference clip to its first N seconds: Chatterbox is zero-shot, so a
+    // clean ~10–20s sample is the sweet spot and more rarely helps. N comes from
+    // LLAMA_SWAP_TTS_REF_SECONDS; unset, unparsable, or 0 falls back to 30.
+    let secs = std::env::var("LLAMA_SWAP_TTS_REF_SECONDS")
+        .ok()
+        .and_then(|s| s.trim().parse::<u32>().ok())
+        .filter(|n| *n > 0)
+        .unwrap_or(30)
+        .to_string();
+
+    let output = tokio::process::Command::new("ffmpeg")
+        .args([
+            "-y", "-i", input_path, "-vn", "-ac", "1", "-ar", "24000", "-t", &secs, "-f", "wav",
+            &out_s,
+        ])
+        .output()
+        .await
+        .context("spawning ffmpeg")?;
+
+    if !output.status.success() {
+        anyhow::bail!("ffmpeg failed: {}", String::from_utf8_lossy(&output.stderr));
     }
-    .to_string()
+    std::fs::read(&out_s).context("reading transcoded audio")
+}
+
+/// Normalize in-memory upload bytes to WAV: spill them to a temp file (source
+/// extension kept as an ffmpeg probe hint, `.bin` if absent) and transcode it.
+async fn transcode_bytes_to_wav(input: &[u8], src_ext: Option<&str>) -> anyhow::Result<Vec<u8>> {
+    let suffix = src_ext
+        .filter(|e| !e.is_empty())
+        .map(|e| format!(".{e}"))
+        .unwrap_or_else(|| ".bin".to_string());
+    let in_tmp = tempfile::Builder::new()
+        .suffix(&suffix)
+        .tempfile()
+        .context("creating temp input")?;
+    std::fs::write(in_tmp.path(), input).context("writing temp input")?;
+    run_ffmpeg_to_wav(&in_tmp.path().to_string_lossy()).await
 }
 
 #[derive(Debug, Deserialize)]
@@ -239,7 +274,6 @@ pub async fn create_voice_upload_handler(
     let mut voice_name: Option<String> = None;
     let mut file_bytes = BytesMut::new();
     let mut filename = "voice.wav".to_string();
-    let mut mime = "application/octet-stream".to_string();
 
     while let Some(Ok(mut part)) = payload.next().await {
         // Capture disposition fields up front so the immutable borrow ends
@@ -254,9 +288,6 @@ pub async fn create_voice_upload_handler(
 
         if let Some(fname) = fname_opt {
             filename = fname;
-            if let Some(ct) = part.content_type() {
-                mime = ct.to_string();
-            }
             while let Some(Ok(data)) = part.next().await {
                 if file_bytes.len() + data.len() > MAX_VOICE_UPLOAD_BYTES {
                     return HttpResponse::PayloadTooLarge()
@@ -282,12 +313,21 @@ pub async fn create_voice_upload_handler(
     if file_bytes.is_empty() {
         return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" }));
     }
-    if !mime.starts_with("audio") {
-        mime = guess_audio_mime(Path::new(&filename));
-    }
+
+    // Normalize to WAV so any device format (e.g. .aac / .opus, which Chatterbox
+    // rejects by extension) is accepted.
+    let src_ext = Path::new(&filename).extension().and_then(|e| e.to_str());
+    let wav = match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext).await {
+        Ok(w) => w,
+        Err(e) => {
+            log::error!("voice upload transcode failed: {:?}", e);
+            return HttpResponse::BadRequest()
+                .json(json!({ "error": "couldn't decode that audio file" }));
+        }
+    };
 
     match client
-        .create_voice(&name, file_bytes.to_vec(), &filename, &mime)
+        .create_voice(&name, wav, "reference.wav", "audio/wav")
         .await
     {
         Ok(v) => HttpResponse::Ok().json(v),
@@ -308,8 +348,8 @@ pub struct CreateVoiceFromLibraryRequest {
 }
 
 /// POST /tts/voices/from-library — register a cloned voice from a file already
-/// in a library. Audio files are forwarded as-is; video files have up to 30s
-/// of their audio track extracted (mono, 24 kHz) via ffmpeg.
+/// in a library. Audio and video alike are ffmpeg-normalized to a mono 24 kHz
+/// WAV reference clip (length capped by LLAMA_SWAP_TTS_REF_SECONDS).
 #[post("/tts/voices/from-library")]
 pub async fn create_voice_from_library_handler(
     _claims: Claims,
@@ -346,16 +386,17 @@ pub async fn create_voice_from_library_handler(
             .json(json!({ "error": "file is not an audio or video file" }));
     }
 
-    let (bytes, filename, mime) = match prepare_reference_audio(&abs).await {
-        Ok(t) => t,
+    let wav = match prepare_reference_audio(&abs).await {
+        Ok(b) => b,
         Err(e) => {
             log::error!("voice reference prep failed for {:?}: {:?}", abs, e);
-            return HttpResponse::BadRequest().json(json!({ "error": format!("{e}") }));
+            return HttpResponse::BadRequest()
+                .json(json!({ "error": "couldn't decode that file's audio" }));
         }
     };
 
     match client
-        .create_voice(&voice_name, bytes, &filename, &mime)
+        .create_voice(&voice_name, wav, "reference.wav", "audio/wav")
         .await
     {
         Ok(v) => HttpResponse::Ok().json(v),
@@ -366,44 +407,11 @@ pub async fn create_voice_from_library_handler(
     }
 }
 
-/// Read a library file as reference audio. Audio is returned verbatim; video
-/// has up to 30s of audio extracted to mono 24 kHz WAV via ffmpeg.
-async fn prepare_reference_audio(abs: &Path) -> anyhow::Result<(Vec<u8>, String, String)> {
-    if is_video_file(abs) {
-        let tmp = tempfile::Builder::new()
-            .suffix(".wav")
-            .tempfile()
-            .context("creating temp wav")?;
-        let out = tmp.path().to_path_buf();
-        let abs_s = abs.to_string_lossy().to_string();
-        let out_s = out.to_string_lossy().to_string();
-
-        let output = tokio::process::Command::new("ffmpeg")
-            .args([
-                "-y", "-i", &abs_s, "-vn", "-ac", "1", "-ar", "24000", "-t", "30", "-f", "wav",
-                &out_s,
-            ])
-            .output()
-            .await
-            .context("spawning ffmpeg")?;
-
-        if !output.status.success() {
-            anyhow::bail!(
-                "ffmpeg audio extraction failed: {}",
-                String::from_utf8_lossy(&output.stderr)
-            );
-        }
-        let bytes = std::fs::read(&out).context("reading extracted audio")?;
-        Ok((bytes, "reference.wav".to_string(), "audio/wav".to_string()))
-    } else {
-        let bytes = std::fs::read(abs).context("reading audio file")?;
-        let filename = abs
-            .file_name()
-            .and_then(|f| f.to_str())
-            .unwrap_or("reference")
-            .to_string();
-        Ok((bytes, filename, guess_audio_mime(abs)))
-    }
+/// Read a library file (audio or video) as a Chatterbox-ready reference: ffmpeg
+/// decodes/extracts its audio to mono 24 kHz WAV. Reading straight from the
+/// library path avoids slurping a (possibly large) video into memory.
+async fn prepare_reference_audio(abs: &Path) -> anyhow::Result<Vec<u8>> {
+    run_ffmpeg_to_wav(&abs.to_string_lossy()).await // NOTE(review): lossy for non-UTF-8 paths
 }
 
 #[cfg(test)]
@@ -447,18 +455,6 @@ mod tests {
         assert_eq!(sanitize_voice_name(&long).unwrap().len(), 64);
     }
 
-    #[test]
-    fn guess_audio_mime_maps_known_extensions() {
-        assert_eq!(guess_audio_mime(Path::new("clip.wav")), "audio/wav");
-        assert_eq!(guess_audio_mime(Path::new("clip.MP3")), "audio/mpeg");
-        assert_eq!(guess_audio_mime(Path::new("clip.m4a")), "audio/mp4");
-        assert_eq!(guess_audio_mime(Path::new("clip.flac")), "audio/flac");
-        assert_eq!(
-            guess_audio_mime(Path::new("clip.xyz")),
-            "application/octet-stream"
-        );
-    }
-
     #[test]
     fn clean_for_tts_strips_markdown() {
         assert_eq!(

From ccacfe1113f74ac5e96231366ffbef26f8180e33 Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Tue, 2 Jun 2026 23:10:43 -0400
Subject: [PATCH 05/10] Instrument TTS handlers with OTel spans (codebase
 standard)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Each /tts handler now opens an http.tts.* span via extract_context_from_request
+ global_tracer().start_with_context, sets Status::Ok / Status::error on every
outcome, and records useful attributes (model, format, voice_name, byte counts)
— matching the insight handlers. Prometheus request metrics were already
covered by the app-wide actix-web-prom middleware.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/ai/tts.rs | 69 +++++++++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 64 insertions(+), 5 deletions(-)

diff --git a/src/ai/tts.rs b/src/ai/tts.rs
index 59b4a80..9c98bee 100644
--- a/src/ai/tts.rs
+++ b/src/ai/tts.rs
@@ -6,11 +6,13 @@
 // (audio read directly; video has its audio track extracted via ffmpeg).
 
 use actix_multipart::Multipart;
-use actix_web::{HttpResponse, Responder, get, post, web};
+use actix_web::{HttpRequest, HttpResponse, Responder, get, post, web};
 use anyhow::Context;
 use base64::Engine;
 use bytes::{BufMut, BytesMut};
 use futures::StreamExt;
+use opentelemetry::KeyValue;
+use opentelemetry::trace::{Span, Status, Tracer};
 use regex::Regex;
 use serde::{Deserialize, Serialize};
 use serde_json::json;
@@ -21,6 +23,7 @@ use crate::data::Claims;
 use crate::file_types::{is_audio_file, is_video_file};
 use crate::files::is_valid_full_path;
 use crate::libraries;
+use crate::otel::{extract_context_from_request, global_tracer};
 use crate::state::AppState;
 
 /// Hard cap on an uploaded voice-reference clip. Chatterbox itself caps the
@@ -191,15 +194,21 @@ pub struct TtsSpeechResponse {
 /// return base64-encoded audio for `data:` URI playback on the client.
 #[post("/tts/speech")]
 pub async fn tts_speech_handler(
+    http_request: HttpRequest,
     _claims: Claims,
     req: web::Json<TtsSpeechRequest>,
     app_state: web::Data<AppState>,
 ) -> impl Responder {
+    let parent_context = extract_context_from_request(&http_request);
+    let mut span = global_tracer().start_with_context("http.tts.speech", &parent_context);
+
     let text = clean_for_tts(&req.text);
     if text.is_empty() {
+        span.set_status(Status::error("text is required"));
         return HttpResponse::BadRequest().json(json!({ "error": "text is required" }));
     }
     let Some(client) = app_state.llamacpp.as_ref() else {
+        span.set_status(Status::error("tts backend not configured"));
         return HttpResponse::ServiceUnavailable()
             .json(json!({ "error": "TTS backend not configured (set LLAMA_SWAP_URL)" }));
     };
@@ -216,6 +225,11 @@ pub async fn tts_speech_handler(
         .filter(|s| !s.is_empty())
         .or(dv.as_deref());
 
+    span.set_attribute(KeyValue::new("tts.model", client.tts_model.clone()));
+    span.set_attribute(KeyValue::new("tts.format", format.to_string()));
+    span.set_attribute(KeyValue::new("tts.has_voice", voice.is_some()));
+    span.set_attribute(KeyValue::new("tts.text_len", text.len() as i64));
+
     // Clamp generation knobs to Chatterbox's documented ranges before forwarding.
     let exaggeration = req.exaggeration.map(|x| x.clamp(0.25, 2.0));
     let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0));
@@ -226,6 +240,8 @@ pub async fn tts_speech_handler(
         .await
     {
         Ok(bytes) => {
+            span.set_attribute(KeyValue::new("tts.audio_bytes", bytes.len() as i64));
+            span.set_status(Status::Ok);
             let audio_base64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
             HttpResponse::Ok().json(TtsSpeechResponse {
                 audio_base64,
@@ -233,6 +249,7 @@ pub async fn tts_speech_handler(
             })
         }
         Err(e) => {
+            span.set_status(Status::error("tts synthesis failed"));
             log::error!("TTS synth failed: {:?}", e);
             HttpResponse::BadGateway().json(json!({ "error": format!("TTS failed: {e}") }))
         }
@@ -242,16 +259,25 @@ pub async fn tts_speech_handler(
 /// GET /tts/voices — list the Chatterbox voice library (raw passthrough).
 #[get("/tts/voices")]
 pub async fn list_voices_handler(
+    http_request: HttpRequest,
     _claims: Claims,
     app_state: web::Data<AppState>,
 ) -> impl Responder {
+    let parent_context = extract_context_from_request(&http_request);
+    let mut span = global_tracer().start_with_context("http.tts.voices.list", &parent_context);
+
     let Some(client) = app_state.llamacpp.as_ref() else {
+        span.set_status(Status::error("tts backend not configured"));
         return HttpResponse::ServiceUnavailable()
             .json(json!({ "error": "TTS backend not configured" }));
     };
     match client.list_voices().await {
-        Ok(v) => HttpResponse::Ok().json(v),
+        Ok(v) => {
+            span.set_status(Status::Ok);
+            HttpResponse::Ok().json(v)
+        }
         Err(e) => {
+            span.set_status(Status::error("list_voices failed"));
             log::error!("list_voices failed: {:?}", e);
             HttpResponse::BadGateway().json(json!({ "error": format!("{e}") }))
         }
@@ -262,11 +288,16 @@ pub async fn list_voices_handler(
 /// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`).
 #[post("/tts/voices/upload")]
 pub async fn create_voice_upload_handler(
+    http_request: HttpRequest,
     _claims: Claims,
     mut payload: Multipart,
     app_state: web::Data<AppState>,
 ) -> impl Responder {
+    let parent_context = extract_context_from_request(&http_request);
+    let mut span = global_tracer().start_with_context("http.tts.voices.upload", &parent_context);
+
     let Some(client) = app_state.llamacpp.as_ref() else {
+        span.set_status(Status::error("tts backend not configured"));
         return HttpResponse::ServiceUnavailable()
             .json(json!({ "error": "TTS backend not configured" }));
     };
@@ -290,6 +321,7 @@ pub async fn create_voice_upload_handler(
             filename = fname;
             while let Some(Ok(data)) = part.next().await {
                 if file_bytes.len() + data.len() > MAX_VOICE_UPLOAD_BYTES {
+                    span.set_status(Status::error("voice clip exceeds limit"));
                     return HttpResponse::PayloadTooLarge()
                         .json(json!({ "error": "voice clip exceeds 25 MB" }));
                 }
@@ -307,12 +339,16 @@ pub async fn create_voice_upload_handler(
     }
 
     let Some(name) = voice_name.as_deref().and_then(sanitize_voice_name) else {
+        span.set_status(Status::error("voice_name is required"));
         return HttpResponse::BadRequest()
             .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
     };
     if file_bytes.is_empty() {
+        span.set_status(Status::error("voice_file is required"));
         return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" }));
     }
+    span.set_attribute(KeyValue::new("tts.voice_name", name.clone()));
+    span.set_attribute(KeyValue::new("tts.upload_bytes", file_bytes.len() as i64));
 
     // Normalize to WAV so any device format (e.g. .aac / .opus, which Chatterbox
     // rejects by extension) is accepted.
@@ -320,6 +356,7 @@ pub async fn create_voice_upload_handler(
     let wav = match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext).await {
         Ok(w) => w,
         Err(e) => {
+            span.set_status(Status::error("audio decode failed"));
             log::error!("voice upload transcode failed: {:?}", e);
             return HttpResponse::BadRequest()
                 .json(json!({ "error": "couldn't decode that audio file" }));
@@ -330,8 +367,12 @@ pub async fn create_voice_upload_handler(
         .create_voice(&name, wav, "reference.wav", "audio/wav")
         .await
     {
-        Ok(v) => HttpResponse::Ok().json(v),
+        Ok(v) => {
+            span.set_status(Status::Ok);
+            HttpResponse::Ok().json(v)
+        }
         Err(e) => {
+            span.set_status(Status::error("create_voice failed"));
             log::error!("create_voice (upload) failed: {:?}", e);
             HttpResponse::BadGateway().json(json!({ "error": format!("{e}") }))
         }
@@ -352,15 +393,22 @@ pub struct CreateVoiceFromLibraryRequest {
 /// WAV reference clip (length capped by LLAMA_SWAP_TTS_REF_SECONDS).
 #[post("/tts/voices/from-library")]
 pub async fn create_voice_from_library_handler(
+    http_request: HttpRequest,
     _claims: Claims,
     req: web::Json<CreateVoiceFromLibraryRequest>,
     app_state: web::Data<AppState>,
 ) -> impl Responder {
+    let parent_context = extract_context_from_request(&http_request);
+    let mut span =
+        global_tracer().start_with_context("http.tts.voices.from_library", &parent_context);
+
     let Some(client) = app_state.llamacpp.as_ref() else {
+        span.set_status(Status::error("tts backend not configured"));
         return HttpResponse::ServiceUnavailable()
             .json(json!({ "error": "TTS backend not configured" }));
     };
     let Some(voice_name) = sanitize_voice_name(&req.voice_name) else {
+        span.set_status(Status::error("voice_name is required"));
         return HttpResponse::BadRequest()
             .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
     };
@@ -368,13 +416,17 @@ pub async fn create_voice_from_library_handler(
     let library = match libraries::resolve_library_param(&app_state, req.library.as_deref()) {
         Ok(Some(l)) => l,
         Ok(None) => app_state.primary_library(),
-        Err(msg) => return HttpResponse::BadRequest().json(json!({ "error": msg })),
+        Err(msg) => {
+            span.set_status(Status::error("invalid library"));
+            return HttpResponse::BadRequest().json(json!({ "error": msg }));
+        }
     };
 
     // is_valid_full_path confines the path to the library root (no traversal).
     let abs = match is_valid_full_path(&library.root_path, &req.path, false) {
         Some(p) if p.exists() => p,
         _ => {
+            span.set_status(Status::error("file not found"));
             return HttpResponse::NotFound().json(json!({ "error": "file not found in library" }));
         }
     };
@@ -382,13 +434,16 @@ pub async fn create_voice_from_library_handler(
     // Only real audio/video sources are valid voice references — refuse to
     // slurp arbitrary library files into memory / ffmpeg.
     if !is_audio_file(&abs) && !is_video_file(&abs) {
+        span.set_status(Status::error("not an audio/video file"));
         return HttpResponse::BadRequest()
             .json(json!({ "error": "file is not an audio or video file" }));
     }
+    span.set_attribute(KeyValue::new("tts.voice_name", voice_name.clone()));
 
     let wav = match prepare_reference_audio(&abs).await {
         Ok(b) => b,
         Err(e) => {
+            span.set_status(Status::error("audio decode failed"));
             log::error!("voice reference prep failed for {:?}: {:?}", abs, e);
             return HttpResponse::BadRequest()
                 .json(json!({ "error": "couldn't decode that file's audio" }));
@@ -399,8 +454,12 @@ pub async fn create_voice_from_library_handler(
         .create_voice(&voice_name, wav, "reference.wav", "audio/wav")
         .await
     {
-        Ok(v) => HttpResponse::Ok().json(v),
+        Ok(v) => {
+            span.set_status(Status::Ok);
+            HttpResponse::Ok().json(v)
+        }
         Err(e) => {
+            span.set_status(Status::error("create_voice failed"));
             log::error!("create_voice (from-library) failed: {:?}", e);
             HttpResponse::BadGateway().json(json!({ "error": format!("{e}") }))
         }

From 9978b28b52bf4ecf6673109bed5a4384f3f59b4b Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Tue, 2 Jun 2026 23:15:39 -0400
Subject: [PATCH 06/10] Document TTS endpoints + env in CLAUDE.md

Sync CLAUDE.md with the Chatterbox TTS feature: the /tts/* endpoints and the
LLAMA_SWAP_TTS_MODEL / _VOICE / _REF_SECONDS env vars (only need LLAMA_SWAP_URL).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 CLAUDE.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/CLAUDE.md b/CLAUDE.md
index 7f1da76..b5e1ee2 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -477,6 +477,12 @@ GET  /insights/models                (local-backend models + capabilities; Ollam
 GET  /insights/openrouter/models     (curated OpenRouter allowlist)
 POST /insights/rate                  (thumbs up/down for training data)
 
+// Text-to-Speech (Chatterbox via llama-swap; needs LLAMA_SWAP_URL)
+POST /tts/speech                     (read-aloud: { text, voice?, ... } -> { audio_base64, format })
+GET  /tts/voices                     (Chatterbox voice library)
+POST /tts/voices/upload              (clone a voice from an uploaded clip; multipart)
+POST /tts/voices/from-library        (clone a voice from a library audio/video file)
+
 // Insight Chat Continuation
 POST /insights/chat                  (single-turn reply, non-streaming)
 POST /insights/chat/stream           (SSE: text / tool_call / tool_result / truncated / done)
@@ -652,6 +658,15 @@ LLAMA_SWAP_ALLOWED_MODELS=chat,coder            # Curated allowlist surfaced by
                                                 # Empty = picker shows only the configured primary model.
 LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180          # Per-request timeout; bump for slow CPU offload
 
+# Text-to-speech (Chatterbox served behind llama-swap). Only needs
+# LLAMA_SWAP_URL — independent of LLM_BACKEND. Powers /tts/speech (read-aloud)
+# and /tts/voices* (voice cloning). Reference audio is ffmpeg-normalized to WAV
+# server-side, so any source format works.
+LLAMA_SWAP_TTS_MODEL=chatterbox                # TTS model id in config.yaml (default: chatterbox)
+LLAMA_SWAP_TTS_VOICE=m                         # Default voice when /tts/speech omits one (optional)
+LLAMA_SWAP_TTS_REF_SECONDS=30                  # Max voice-clone reference clip length, seconds
+                                               # (Chatterbox is zero-shot; ~10-20s clean ref is ideal)
+
 # Insight Chat Continuation
 AGENTIC_CHAT_MAX_ITERATIONS=6                  # Cap on tool-calling iterations per chat turn (default 6)
 ```

From d8dd260c6bfa22b0e6b78092e5e2c62a4a41fc38 Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Wed, 3 Jun 2026 10:25:06 -0400
Subject: [PATCH 07/10] Give TTS synthesis its own (longer) request timeout

Long insights are chunked + synthesized server-side and can run past the shared
180s chat/embedding client timeout, causing spurious timeouts. /tts/speech now
uses a per-request timeout from LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS
(default 600), overriding the client default without affecting chat/embeddings.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .env.example       |  1 +
 CLAUDE.md          |  2 ++
 README.md          |  4 ++++
 src/ai/llamacpp.rs | 10 ++++++++++
 4 files changed, 17 insertions(+)

diff --git a/.env.example b/.env.example
index 2b6cff0..a45fdd5 100644
--- a/.env.example
+++ b/.env.example
@@ -88,6 +88,7 @@ AGENTIC_CHAT_MAX_ITERATIONS=6
 # LLAMA_SWAP_TTS_MODEL=chatterbox        # TTS model id in config.yaml
 # LLAMA_SWAP_TTS_VOICE=m                 # default voice when a request omits one
 # LLAMA_SWAP_TTS_REF_SECONDS=30          # max voice-clone reference clip length (s)
+# LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS=600   # synth timeout (long chunked text)
 
 # ── AI Insights — sibling services (optional) ───────────────────────────
 # Apollo (places, face inference, CLIP encoders). Single-Apollo deploys
diff --git a/CLAUDE.md b/CLAUDE.md
index b5e1ee2..fba33e0 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -666,6 +666,8 @@ LLAMA_SWAP_TTS_MODEL=chatterbox                # TTS model id in config.yaml (de
 LLAMA_SWAP_TTS_VOICE=m                         # Default voice when /tts/speech omits one (optional)
 LLAMA_SWAP_TTS_REF_SECONDS=30                  # Max voice-clone reference clip length, seconds
                                                # (Chatterbox is zero-shot; ~10-20s clean ref is ideal)
+LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS=600     # Per-request synth timeout (long chunked insights take
+                                               # minutes); overrides the shared client timeout for /tts/speech
 
 # Insight Chat Continuation
 AGENTIC_CHAT_MAX_ITERATIONS=6                  # Cap on tool-calling iterations per chat turn (default 6)
diff --git a/README.md b/README.md
index 0b678df..58ddc81 100644
--- a/README.md
+++ b/README.md
@@ -169,6 +169,10 @@ Env:
   [default: `30`]. Reference audio is ffmpeg-normalized to mono 24 kHz WAV (so any
   source format works); Chatterbox is zero-shot, so a clean ~10–20s sample is the
   sweet spot — more rarely helps.
+- `LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS` - per-request synthesis timeout in
+  seconds [default: `600`]. Long insights are chunked + synthesized server-side
+  and can take minutes; for `/tts/speech` this value replaces the shared
+  `LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS`, which still governs chat/embeddings.
 
 #### Fallback Behavior
 - Primary server is tried first with 5-second connection timeout
diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs
index 2946688..d56b645 100644
--- a/src/ai/llamacpp.rs
+++ b/src/ai/llamacpp.rs
@@ -170,9 +170,19 @@ impl LlamaCppClient {
             body["temperature"] = json!(x);
         }
 
+        // TTS gets its own (longer) timeout: synthesizing a long, internally
+        // chunked insight can take minutes, well past the shared chat/embedding
+        // client timeout. Per-request `.timeout()` overrides the client default.
+        let tts_timeout = std::env::var("LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS")
+            .ok()
+            .and_then(|v| v.parse::<u64>().ok())
+            .filter(|n| *n > 0)
+            .unwrap_or(600);
+
         let resp = self
             .client
             .post(&url)
+            .timeout(Duration::from_secs(tts_timeout))
             .json(&body)
             .send()
             .await

From cab867da609a3c8356a9e36c62faf86da75a7da1 Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Wed, 3 Jun 2026 14:02:56 -0400
Subject: [PATCH 08/10] Serialize /tts/speech with a single permit; 429 when
 busy

The Chatterbox wrapper has no internal lock or cancellation, so concurrent
synth requests contend on the single GPU and abandoned (timed-out) jobs
cascade into stacked slowness. Gate synthesis behind a one-permit semaphore
and fast-fail concurrent requests with 429 instead of queueing.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 README.md     |  3 ++-
 src/ai/tts.rs | 17 +++++++++++++++++
 2 files changed, 19 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 58ddc81..39ebe30 100644
--- a/README.md
+++ b/README.md
@@ -154,7 +154,8 @@ is built whenever that's set — independent of `LLM_BACKEND`). Endpoints:
 - `POST /tts/speech` — body `{ text, voice?, format?, exaggeration?, cfg_weight?,
   temperature? }`; returns `{ audio_base64, format }`. Input is cleaned
   server-side (markdown + emoji stripped) and the generation knobs are clamped
-  to Chatterbox's ranges.
+  to Chatterbox's ranges. Synthesis is serialized (one at a time — the upstream
+  has no GPU lock of its own); a concurrent request gets a fast `429`.
 - `GET /tts/voices` — list the voice library.
 - `POST /tts/voices/upload` — multipart `voice_name` + `voice_file`; clone a
   voice from an uploaded clip (≤25 MB).
diff --git a/src/ai/tts.rs b/src/ai/tts.rs
index 9c98bee..2c2009b 100644
--- a/src/ai/tts.rs
+++ b/src/ai/tts.rs
@@ -18,6 +18,7 @@ use serde::{Deserialize, Serialize};
 use serde_json::json;
 use std::path::Path;
 use std::sync::LazyLock;
+use tokio::sync::Semaphore;
 
 use crate::data::Claims;
 use crate::file_types::{is_audio_file, is_video_file};
@@ -31,6 +32,14 @@ use crate::state::AppState;
 /// upload can't balloon ImageApi memory before we ever forward it.
 const MAX_VOICE_UPLOAD_BYTES: usize = 25 * 1024 * 1024; // 25 MB
 
+/// Single-permit gate for speech synthesis. Chatterbox has no internal lock or
+/// queue, so concurrent requests contend on the single GPU and cascade into
+/// timeouts. `tts_speech_handler` fast-fails with 429 when the permit is taken
+/// instead of queueing: the app shows "busy" at once, and typical jobs clear in
+/// well under a minute. (An abandoned upstream job can still hold the GPU until
+/// it finishes — a wrapper limitation; the chunked-queue plan fixes it.)
+static TTS_PERMIT: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(1));
+
 /// Sanitize a user-supplied voice name. The name is forwarded to Chatterbox
 /// where it becomes a filename in the voice-library directory, so we restrict
 /// it to a safe charset (alphanumerics, dash, underscore) — no path
@@ -235,6 +244,14 @@ pub async fn tts_speech_handler(
     let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0));
     let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0));
 
+    // One synthesis at a time (see TTS_PERMIT) — fast-fail when busy.
+    let Ok(_permit) = TTS_PERMIT.try_acquire() else {
+        span.set_status(Status::error("tts busy"));
+        return HttpResponse::TooManyRequests().json(json!({
+            "error": "TTS is busy with another request — try again shortly"
+        }));
+    };
+
     match client
         .text_to_speech(&text, voice, format, exaggeration, cfg_weight, temperature)
         .await

From dec6f21af9ae91e6d36527c61451f7c145e57bbb Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Wed, 3 Jun 2026 14:07:10 -0400
Subject: [PATCH 09/10] Bump version to 1.3.0

TTS feature release: /tts/speech + voice library endpoints (Chatterbox via
llama-swap), input cleaning, tuning knobs, WAV-normalized voice cloning,
OTel spans, dedicated synth timeout, and single-flight serialization.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 Cargo.lock | 2 +-
 Cargo.toml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index d35048c..a35a7d2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2051,7 +2051,7 @@ dependencies = [
 
 [[package]]
 name = "image-api"
-version = "1.2.0"
+version = "1.3.0"
 dependencies = [
  "actix",
  "actix-cors",
diff --git a/Cargo.toml b/Cargo.toml
index 6807778..3b3a08a 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "image-api"
-version = "1.2.0"
+version = "1.3.0"
 authors = ["Cameron Cordes <cameronc.dev@gmail.com>"]
 edition = "2024"
 

From 412da2ce8ed7b29955e5ec750cc3df35c1a121a1 Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Thu, 4 Jun 2026 09:12:43 -0400
Subject: [PATCH 10/10] Collapse blank lines to a single break in TTS text
 cleaning
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Chatterbox inserts a long pause — sometimes ~20s of silence — for each
blank line it sees, and insight text is markdown full of paragraph
breaks. clean_for_tts previously preserved paragraph structure
(\n{3,} -> \n\n), so every paragraph boundary still reached the model
as a double newline. Now any run of 2+ newlines, including
whitespace-only blank lines, collapses to a single newline so the
worst pause a break can cause is a normal line-break pause.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 src/ai/tts.rs | 21 ++++++++++++++++++---
 1 file changed, 18 insertions(+), 3 deletions(-)

diff --git a/src/ai/tts.rs b/src/ai/tts.rs
index 2c2009b..b94be36 100644
--- a/src/ai/tts.rs
+++ b/src/ai/tts.rs
@@ -85,7 +85,10 @@ static MD_LIST: LazyLock<Regex> =
 static MD_EMPHASIS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[*_`~]+").unwrap());
 static URL_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"https?://\S+").unwrap());
 static MULTISPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[ \t]{2,}").unwrap());
-static MULTINEWLINE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n{3,}").unwrap());
+// Any run of 2+ line breaks (LF or CRLF, incl. whitespace-only blank lines)
+// collapses to ONE newline: Chatterbox inserts a long pause (sometimes ~20s of
+// silence) per blank line, so paragraph breaks must reach it as a single break.
+static MULTINEWLINE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\r?\n(?:[ \t\r]*\n)+").unwrap());
 
 /// True for emoji / pictographic symbols, which most TTS models either skip or
 /// mispronounce. Covers the main emoji blocks plus dingbats, misc-technical,
@@ -118,7 +121,7 @@ fn clean_for_tts(input: &str) -> String {
     let s = URL_RE.replace_all(&s, " ");
     let s: String = s.chars().filter(|c| !is_emoji_like(*c)).collect();
     let s = MULTISPACE.replace_all(&s, " ");
-    let s = MULTINEWLINE.replace_all(&s, "\n\n");
+    let s = MULTINEWLINE.replace_all(&s, "\n");
     s.trim().to_string()
 }
 
@@ -537,7 +540,7 @@ mod tests {
             clean_for_tts("**Bold** and _italic_ and `code`"),
             "Bold and italic and code"
         );
-        assert_eq!(clean_for_tts("# Title\n\nbody"), "Title\n\nbody");
+        assert_eq!(clean_for_tts("# Title\n\nbody"), "Title\nbody");
         assert_eq!(
             clean_for_tts("See [docs](http://x.com) now"),
             "See docs now"
@@ -556,6 +559,18 @@ mod tests {
         assert_eq!(clean_for_tts("family 👨‍👩‍👧 photo"), "family photo");
     }
 
+    #[test]
+    fn clean_for_tts_collapses_blank_lines_to_single_break() {
+        // Regression guard: Chatterbox pauses (sometimes ~20s) per blank line,
+        // so a paragraph break or any longer newline run must become one "\n".
+        assert_eq!(clean_for_tts("para one\n\npara two"), "para one\npara two");
+        assert_eq!(clean_for_tts("a\n\n\n\nb"), "a\nb");
+        // Lines holding only spaces/tabs count as blank and collapse too.
+        assert_eq!(clean_for_tts("a\n  \t \nb"), "a\nb");
+        // A lone newline is not a blank line and must pass through unchanged.
+        assert_eq!(clean_for_tts("a\nb"), "a\nb");
+    }
+
     #[test]
     fn clean_for_tts_preserves_bracket_tags() {
         // Non-turbo Chatterbox ignores these; a future Turbo uses them as