From 69268d03fe18f6b222ba75906798b5cfe258bfa0 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Tue, 2 Jun 2026 22:04:42 -0400 Subject: [PATCH 01/10] Add TTS endpoints backed by Chatterbox via llama-swap LlamaCppClient gains text_to_speech (OpenAI /audio/speech), list_voices and create_voice (voice library at the swap-root /upstream/<model>/voices passthrough), plus a tts_model slot configured via LLAMA_SWAP_TTS_MODEL (default "chatterbox"). New Claims-gated routes: - POST /tts/speech -> { audio_base64, format } for data: URI playback - GET /tts/voices -> voice library passthrough - POST /tts/voices/upload -> clone a voice from an uploaded clip (multipart) - POST /tts/voices/from-library -> clone from a library file (ffmpeg-extracts audio from video; audio forwarded as-is) Security: voice_name sanitized to [A-Za-z0-9_-] (it becomes an upstream filename), 25 MB upload cap, library refs restricted to real audio/video, path confined via is_valid_full_path. Adds is_audio_file + unit tests for the sanitizer, mime guesser, and swap-root derivation. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- src/ai/llamacpp.rs | 136 ++++++++++++++++ src/ai/mod.rs | 5 + src/ai/tts.rs | 393 +++++++++++++++++++++++++++++++++++++++++++++ src/file_types.rs | 17 ++ src/main.rs | 4 + src/state.rs | 3 + 6 files changed, 558 insertions(+) create mode 100644 src/ai/tts.rs diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs index e2ba00d..afd7f1b 100644 --- a/src/ai/llamacpp.rs +++ b/src/ai/llamacpp.rs @@ -36,6 +36,7 @@ const DEFAULT_BASE_URL: &str = "http://localhost:9292/v1"; const DEFAULT_PRIMARY_MODEL: &str = "chat"; const DEFAULT_VISION_MODEL: &str = "vision"; const DEFAULT_EMBEDDING_MODEL: &str = "embed"; +const DEFAULT_TTS_MODEL: &str = "chatterbox"; const DEFAULT_REQUEST_TIMEOUT_SECS: u64 = 180; /// OpenAI-compatible client targeting a llama-swap proxy in front of one or @@ -54,6 +55,10 @@ pub struct LlamaCppClient { /// to `primary_model` so describe_image works out of the box; override /// via `LLAMA_SWAP_VISION_MODEL` for a dedicated vision slot. pub vision_model: String, + /// TTS model slot id (e.g. `"chatterbox"`). Routes `text_to_speech` and + /// is the `/upstream//voices` path segment for the voice library. + /// Override via `LLAMA_SWAP_TTS_MODEL`. + pub tts_model: String, num_ctx: Option, temperature: Option, top_p: Option, @@ -78,6 +83,7 @@ impl LlamaCppClient { primary_model: pm.clone(), embedding_model: DEFAULT_EMBEDDING_MODEL.to_string(), vision_model: pm, + tts_model: DEFAULT_TTS_MODEL.to_string(), num_ctx: None, temperature: None, top_p: None, @@ -111,6 +117,116 @@ impl LlamaCppClient { self.min_p = min_p; } + pub fn set_tts_model(&mut self, model: String) { + self.tts_model = model; + } + + // --- TTS (Chatterbox behind llama-swap) --------------------------------- + // + // Speech synthesis uses the OpenAI-compatible `{base_url}/audio/speech` + // endpoint (llama-swap routes by the `model` field). 
The voice *library* + // (list / create cloned voices) is NOT an OpenAI endpoint — it lives on the + // upstream server directly, reached via llama-swap's passthrough at + // `{swap_root}/upstream//voices`. + + /// Root of the llama-swap proxy: `base_url` with a trailing `/v1` removed. + /// The `/upstream/...` passthrough lives here, not under `/v1`. + fn swap_root(&self) -> &str { + let b = self.base_url.trim_end_matches('/'); + b.strip_suffix("/v1").unwrap_or(b) + } + + /// Synthesize speech for `input` in an optional named `voice`, returning + /// the raw audio bytes (format per `response_format`, e.g. `"mp3"`/`"wav"`). + pub async fn text_to_speech( + &self, + input: &str, + voice: Option<&str>, + response_format: &str, + ) -> Result> { + let url = format!("{}/audio/speech", self.base_url); + let mut body = json!({ + "model": self.tts_model, + "input": input, + "response_format": response_format, + }); + if let Some(v) = voice { + body["voice"] = Value::String(v.to_string()); + } + + let resp = self + .client + .post(&url) + .json(&body) + .send() + .await + .with_context(|| format!("POST {} failed", url))?; + + if !resp.status().is_success() { + let status = resp.status(); + let text = resp.text().await.unwrap_or_default(); + bail!("llama-swap TTS request failed: {} — {}", status, text); + } + + Ok(resp + .bytes() + .await + .context("reading TTS audio bytes")? + .to_vec()) + } + + /// List voices in the Chatterbox voice library (raw JSON passthrough). 
+ pub async fn list_voices(&self) -> Result { + let url = format!("{}/upstream/{}/voices", self.swap_root(), self.tts_model); + let resp = self + .client + .get(&url) + .send() + .await + .with_context(|| format!("GET {} failed", url))?; + + if !resp.status().is_success() { + let status = resp.status(); + let text = resp.text().await.unwrap_or_default(); + bail!("llama-swap list_voices failed: {} — {}", status, text); + } + resp.json().await.context("parsing voices response") + } + + /// Register a cloned voice from raw audio bytes (multipart `voice_name` + + /// `voice_file`). Returns the upstream JSON response. + pub async fn create_voice( + &self, + voice_name: &str, + audio_bytes: Vec, + filename: &str, + mime: &str, + ) -> Result { + let url = format!("{}/upstream/{}/voices", self.swap_root(), self.tts_model); + let part = reqwest::multipart::Part::bytes(audio_bytes) + .file_name(filename.to_string()) + .mime_str(mime) + .context("invalid audio mime type")?; + let form = reqwest::multipart::Form::new() + .text("voice_name", voice_name.to_string()) + .part("voice_file", part); + + let resp = self + .client + .post(&url) + .multipart(form) + .send() + .await + .with_context(|| format!("POST {} (multipart) failed", url))?; + + if !resp.status().is_success() { + let status = resp.status(); + let text = resp.text().await.unwrap_or_default(); + bail!("llama-swap create_voice failed: {} — {}", status, text); + } + resp.json().await.context("parsing create_voice response") + } + /// Translate canonical messages to the OpenAI-compatible wire shape. 
/// Behaviorally identical to `OpenRouterClient::messages_to_openai` — /// stringify tool-call arguments, rewrite images into content-parts, attach @@ -1140,4 +1256,24 @@ mod tests { let wire = LlamaCppClient::messages_to_openai(&[msg]); assert_eq!(wire[0]["content"], ""); } + + #[test] + fn swap_root_strips_v1_suffix() { + let c = LlamaCppClient::new(Some("http://localhost:9292/v1".to_string()), None); + assert_eq!(c.swap_root(), "http://localhost:9292"); + + // Tolerates a trailing slash on the base URL. + let c2 = LlamaCppClient::new(Some("http://localhost:9292/v1/".to_string()), None); + assert_eq!(c2.swap_root(), "http://localhost:9292"); + + // No /v1 suffix → returned unchanged. + let c3 = LlamaCppClient::new(Some("http://host:1234".to_string()), None); + assert_eq!(c3.swap_root(), "http://host:1234"); + } + + #[test] + fn tts_model_defaults_to_chatterbox() { + let c = LlamaCppClient::new(None, None); + assert_eq!(c.tts_model, "chatterbox"); + } } diff --git a/src/ai/mod.rs b/src/ai/mod.rs index e9bec09..e61eace 100644 --- a/src/ai/mod.rs +++ b/src/ai/mod.rs @@ -11,6 +11,7 @@ pub mod llm_client; pub mod ollama; pub mod openrouter; pub mod sms_client; +pub mod tts; pub mod turn_registry; // strip_summary_boilerplate is used by binaries (test_daily_summary), not the library @@ -34,6 +35,10 @@ pub use llm_client::{ }; pub use ollama::{EMBEDDING_MODEL, OllamaClient}; pub use sms_client::{SmsApiClient, SmsMessage}; +pub use tts::{ + create_voice_from_library_handler, create_voice_upload_handler, list_voices_handler, + tts_speech_handler, +}; /// Display name used for the user in message transcripts and first-person /// prompt text. Reads the `USER_NAME` env var; defaults to `"Me"`. 
Models diff --git a/src/ai/tts.rs b/src/ai/tts.rs new file mode 100644 index 0000000..b2bd675 --- /dev/null +++ b/src/ai/tts.rs @@ -0,0 +1,393 @@ +// TTS endpoints: proxy text-to-speech + voice-library management to the +// Chatterbox server that sits behind llama-swap (via LlamaCppClient). Speech +// synthesis returns audio as base64-in-JSON so the mobile app can play it as a +// `data:` URI without a binary-fetch path. Voice cloning registers a named +// voice from either an uploaded clip (device) or an existing library file +// (audio read directly; video has its audio track extracted via ffmpeg). + +use actix_multipart::Multipart; +use actix_web::{HttpResponse, Responder, get, post, web}; +use anyhow::Context; +use base64::Engine; +use bytes::{BufMut, BytesMut}; +use futures::StreamExt; +use serde::{Deserialize, Serialize}; +use serde_json::json; +use std::path::Path; + +use crate::data::Claims; +use crate::file_types::{is_audio_file, is_video_file}; +use crate::files::is_valid_full_path; +use crate::libraries; +use crate::state::AppState; + +/// Hard cap on an uploaded voice-reference clip. Chatterbox itself caps the +/// payload (~60s clip); this is a defensive ceiling so a hostile/oversized +/// upload can't balloon ImageApi memory before we ever forward it. +const MAX_VOICE_UPLOAD_BYTES: usize = 25 * 1024 * 1024; // 25 MB + +/// Sanitize a user-supplied voice name. The name is forwarded to Chatterbox +/// where it becomes a filename in the voice-library directory, so we restrict +/// it to a safe charset (alphanumerics, dash, underscore) — no path +/// separators, dots, or whitespace — and bound its length. Returns `None` +/// when nothing usable remains. 
+fn sanitize_voice_name(raw: &str) -> Option { + let cleaned: String = raw + .trim() + .chars() + .map(|c| { + if c.is_ascii_alphanumeric() || c == '-' || c == '_' { + c + } else { + '-' + } + }) + .collect(); + let cleaned = cleaned.trim_matches('-').to_string(); + if cleaned.is_empty() { + return None; + } + Some(cleaned.chars().take(64).collect()) +} + +/// Optional default voice for synthesis when the request doesn't name one. +/// Set `LLAMA_SWAP_TTS_VOICE=m` to read insights in a cloned voice by default. +fn default_voice() -> Option { + std::env::var("LLAMA_SWAP_TTS_VOICE") + .ok() + .map(|s| s.trim().to_string()) + .filter(|s| !s.is_empty()) +} + +fn guess_audio_mime(path: &Path) -> String { + match path + .extension() + .and_then(|e| e.to_str()) + .map(|e| e.to_lowercase()) + .as_deref() + { + Some("wav") => "audio/wav", + Some("mp3") => "audio/mpeg", + Some("m4a") | Some("mp4") | Some("aac") => "audio/mp4", + Some("flac") => "audio/flac", + Some("ogg") | Some("oga") => "audio/ogg", + _ => "application/octet-stream", + } + .to_string() +} + +#[derive(Debug, Deserialize)] +pub struct TtsSpeechRequest { + pub text: String, + #[serde(default)] + pub voice: Option, + /// Audio container, e.g. `"mp3"` (default) or `"wav"`. + #[serde(default)] + pub format: Option, +} + +#[derive(Debug, Serialize)] +pub struct TtsSpeechResponse { + pub audio_base64: String, + pub format: String, +} + +/// POST /tts/speech — synthesize `text` (optionally in a named `voice`) and +/// return base64-encoded audio for `data:` URI playback on the client. 
+#[post("/tts/speech")] +pub async fn tts_speech_handler( + _claims: Claims, + req: web::Json, + app_state: web::Data, +) -> impl Responder { + let text = req.text.trim(); + if text.is_empty() { + return HttpResponse::BadRequest().json(json!({ "error": "text is required" })); + } + let Some(client) = app_state.llamacpp.as_ref() else { + return HttpResponse::ServiceUnavailable() + .json(json!({ "error": "TTS backend not configured (set LLAMA_SWAP_URL)" })); + }; + + let format = req + .format + .as_deref() + .filter(|s| !s.is_empty()) + .unwrap_or("mp3"); + let dv = default_voice(); + let voice = req + .voice + .as_deref() + .filter(|s| !s.is_empty()) + .or(dv.as_deref()); + + match client.text_to_speech(text, voice, format).await { + Ok(bytes) => { + let audio_base64 = base64::engine::general_purpose::STANDARD.encode(&bytes); + HttpResponse::Ok().json(TtsSpeechResponse { + audio_base64, + format: format.to_string(), + }) + } + Err(e) => { + log::error!("TTS synth failed: {:?}", e); + HttpResponse::BadGateway().json(json!({ "error": format!("TTS failed: {e}") })) + } + } +} + +/// GET /tts/voices — list the Chatterbox voice library (raw passthrough). +#[get("/tts/voices")] +pub async fn list_voices_handler( + _claims: Claims, + app_state: web::Data, +) -> impl Responder { + let Some(client) = app_state.llamacpp.as_ref() else { + return HttpResponse::ServiceUnavailable() + .json(json!({ "error": "TTS backend not configured" })); + }; + match client.list_voices().await { + Ok(v) => HttpResponse::Ok().json(v), + Err(e) => { + log::error!("list_voices failed: {:?}", e); + HttpResponse::BadGateway().json(json!({ "error": format!("{e}") })) + } + } +} + +/// POST /tts/voices/upload — register a cloned voice from an uploaded audio +/// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`). 
+#[post("/tts/voices/upload")] +pub async fn create_voice_upload_handler( + _claims: Claims, + mut payload: Multipart, + app_state: web::Data, +) -> impl Responder { + let Some(client) = app_state.llamacpp.as_ref() else { + return HttpResponse::ServiceUnavailable() + .json(json!({ "error": "TTS backend not configured" })); + }; + + let mut voice_name: Option = None; + let mut file_bytes = BytesMut::new(); + let mut filename = "voice.wav".to_string(); + let mut mime = "application/octet-stream".to_string(); + + while let Some(Ok(mut part)) = payload.next().await { + // Capture disposition fields up front so the immutable borrow ends + // before we mutably stream the part body (mirrors handlers/image.rs). + let (fname_opt, name_opt) = { + let cd = part.content_disposition(); + ( + cd.and_then(|c| c.get_filename()).map(|s| s.to_string()), + cd.and_then(|c| c.get_name()).map(|s| s.to_string()), + ) + }; + + if let Some(fname) = fname_opt { + filename = fname; + if let Some(ct) = part.content_type() { + mime = ct.to_string(); + } + while let Some(Ok(data)) = part.next().await { + if file_bytes.len() + data.len() > MAX_VOICE_UPLOAD_BYTES { + return HttpResponse::PayloadTooLarge() + .json(json!({ "error": "voice clip exceeds 25 MB" })); + } + file_bytes.put(data); + } + } else if name_opt.as_deref() == Some("voice_name") { + let mut buf = BytesMut::new(); + while let Some(Ok(data)) = part.next().await { + buf.put(data); + } + voice_name = Some(String::from_utf8_lossy(&buf).trim().to_string()); + } else { + while let Some(Ok(_)) = part.next().await {} + } + } + + let Some(name) = voice_name.as_deref().and_then(sanitize_voice_name) else { + return HttpResponse::BadRequest() + .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" })); + }; + if file_bytes.is_empty() { + return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" })); + } + if !mime.starts_with("audio") { + mime = guess_audio_mime(Path::new(&filename)); + } + + 
match client + .create_voice(&name, file_bytes.to_vec(), &filename, &mime) + .await + { + Ok(v) => HttpResponse::Ok().json(v), + Err(e) => { + log::error!("create_voice (upload) failed: {:?}", e); + HttpResponse::BadGateway().json(json!({ "error": format!("{e}") })) + } + } +} + +#[derive(Debug, Deserialize)] +pub struct CreateVoiceFromLibraryRequest { + pub voice_name: String, + /// Library-relative path to an audio or video file. + pub path: String, + #[serde(default)] + pub library: Option, +} + +/// POST /tts/voices/from-library — register a cloned voice from a file already +/// in a library. Audio files are forwarded as-is; video files have up to 30s +/// of their audio track extracted (mono, 24 kHz) via ffmpeg. +#[post("/tts/voices/from-library")] +pub async fn create_voice_from_library_handler( + _claims: Claims, + req: web::Json, + app_state: web::Data, +) -> impl Responder { + let Some(client) = app_state.llamacpp.as_ref() else { + return HttpResponse::ServiceUnavailable() + .json(json!({ "error": "TTS backend not configured" })); + }; + let Some(voice_name) = sanitize_voice_name(&req.voice_name) else { + return HttpResponse::BadRequest() + .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" })); + }; + + let library = match libraries::resolve_library_param(&app_state, req.library.as_deref()) { + Ok(Some(l)) => l, + Ok(None) => app_state.primary_library(), + Err(msg) => return HttpResponse::BadRequest().json(json!({ "error": msg })), + }; + + // is_valid_full_path confines the path to the library root (no traversal). + let abs = match is_valid_full_path(&library.root_path, &req.path, false) { + Some(p) if p.exists() => p, + _ => { + return HttpResponse::NotFound().json(json!({ "error": "file not found in library" })); + } + }; + + // Only real audio/video sources are valid voice references — refuse to + // slurp arbitrary library files into memory / ffmpeg. 
+ if !is_audio_file(&abs) && !is_video_file(&abs) { + return HttpResponse::BadRequest() + .json(json!({ "error": "file is not an audio or video file" })); + } + + let (bytes, filename, mime) = match prepare_reference_audio(&abs).await { + Ok(t) => t, + Err(e) => { + log::error!("voice reference prep failed for {:?}: {:?}", abs, e); + return HttpResponse::BadRequest().json(json!({ "error": format!("{e}") })); + } + }; + + match client + .create_voice(&voice_name, bytes, &filename, &mime) + .await + { + Ok(v) => HttpResponse::Ok().json(v), + Err(e) => { + log::error!("create_voice (from-library) failed: {:?}", e); + HttpResponse::BadGateway().json(json!({ "error": format!("{e}") })) + } + } +} + +/// Read a library file as reference audio. Audio is returned verbatim; video +/// has up to 30s of audio extracted to mono 24 kHz WAV via ffmpeg. +async fn prepare_reference_audio(abs: &Path) -> anyhow::Result<(Vec, String, String)> { + if is_video_file(abs) { + let tmp = tempfile::Builder::new() + .suffix(".wav") + .tempfile() + .context("creating temp wav")?; + let out = tmp.path().to_path_buf(); + let abs_s = abs.to_string_lossy().to_string(); + let out_s = out.to_string_lossy().to_string(); + + let output = tokio::process::Command::new("ffmpeg") + .args([ + "-y", "-i", &abs_s, "-vn", "-ac", "1", "-ar", "24000", "-t", "30", "-f", "wav", + &out_s, + ]) + .output() + .await + .context("spawning ffmpeg")?; + + if !output.status.success() { + anyhow::bail!( + "ffmpeg audio extraction failed: {}", + String::from_utf8_lossy(&output.stderr) + ); + } + let bytes = std::fs::read(&out).context("reading extracted audio")?; + Ok((bytes, "reference.wav".to_string(), "audio/wav".to_string())) + } else { + let bytes = std::fs::read(abs).context("reading audio file")?; + let filename = abs + .file_name() + .and_then(|f| f.to_str()) + .unwrap_or("reference") + .to_string(); + Ok((bytes, filename, guess_audio_mime(abs))) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn 
sanitize_voice_name_keeps_safe_chars() { + assert_eq!(sanitize_voice_name("m").as_deref(), Some("m")); + assert_eq!( + sanitize_voice_name(" Cameron ").as_deref(), + Some("Cameron") + ); + assert_eq!( + sanitize_voice_name("voice_01-a").as_deref(), + Some("voice_01-a") + ); + } + + #[test] + fn sanitize_voice_name_strips_unsafe_chars() { + // Path separators / dots / spaces become '-' and are trimmed at edges. + assert_eq!(sanitize_voice_name("a b.c").as_deref(), Some("a-b-c")); + assert_eq!( + sanitize_voice_name("../etc/passwd").as_deref(), + Some("etc-passwd") + ); + } + + #[test] + fn sanitize_voice_name_rejects_empty_or_all_unsafe() { + assert_eq!(sanitize_voice_name(""), None); + assert_eq!(sanitize_voice_name(" "), None); + assert_eq!(sanitize_voice_name("../../"), None); + assert_eq!(sanitize_voice_name("...."), None); + } + + #[test] + fn sanitize_voice_name_bounds_length() { + let long = "a".repeat(200); + assert_eq!(sanitize_voice_name(&long).unwrap().len(), 64); + } + + #[test] + fn guess_audio_mime_maps_known_extensions() { + assert_eq!(guess_audio_mime(Path::new("clip.wav")), "audio/wav"); + assert_eq!(guess_audio_mime(Path::new("clip.MP3")), "audio/mpeg"); + assert_eq!(guess_audio_mime(Path::new("clip.m4a")), "audio/mp4"); + assert_eq!(guess_audio_mime(Path::new("clip.flac")), "audio/flac"); + assert_eq!( + guess_audio_mime(Path::new("clip.xyz")), + "application/octet-stream" + ); + } +} diff --git a/src/file_types.rs b/src/file_types.rs index 33f71dd..b834cba 100644 --- a/src/file_types.rs +++ b/src/file_types.rs @@ -22,6 +22,10 @@ pub fn needs_ffmpeg_thumbnail(path: &Path) -> bool { /// Supported video file extensions pub const VIDEO_EXTENSIONS: &[&str] = &["mp4", "mov", "avi", "mkv"]; +/// Audio file extensions accepted as voice-clone references (TTS). Mirrors +/// the formats Chatterbox can decode (wav/mp3/flac/m4a/aac/ogg). 
+pub const AUDIO_EXTENSIONS: &[&str] = &["wav", "mp3", "flac", "m4a", "aac", "ogg", "oga", "opus"]; + /// Filenames that are filesystem metadata, not real media — exact /// basename match. Extend if a new platform sidecar appears (Windows /// Thumbs.db / desktop.ini live here too if those libraries land). @@ -75,6 +79,19 @@ pub fn is_video_file(path: &Path) -> bool { } } +/// Check if a path has an audio extension (voice-clone references) +pub fn is_audio_file(path: &Path) -> bool { + if is_filesystem_metadata(path) { + return false; + } + if let Some(ext) = path.extension().and_then(|e| e.to_str()) { + let ext_lower = ext.to_lowercase(); + AUDIO_EXTENSIONS.contains(&ext_lower.as_str()) + } else { + false + } +} + /// Check if a path has a supported media extension (image or video) pub fn is_media_file(path: &Path) -> bool { is_image_file(path) || is_video_file(path) diff --git a/src/main.rs b/src/main.rs index 4099a5d..8b06228 100644 --- a/src/main.rs +++ b/src/main.rs @@ -362,6 +362,10 @@ fn main() -> std::io::Result<()> { .service(ai::cancel_turn_handler) .service(ai::rate_insight_handler) .service(ai::export_training_data_handler) + .service(ai::tts_speech_handler) + .service(ai::list_voices_handler) + .service(ai::create_voice_upload_handler) + .service(ai::create_voice_from_library_handler) .service(libraries::list_libraries) .service(libraries::patch_library) .add_feature(add_tag_services::<_, SqliteTagDao>) diff --git a/src/state.rs b/src/state.rs index f9adda7..ef071a8 100644 --- a/src/state.rs +++ b/src/state.rs @@ -391,6 +391,9 @@ fn build_llamacpp_from_env() -> Option> { if let Ok(model) = env::var("LLAMA_SWAP_VISION_MODEL") { client.set_vision_model(model); } + if let Ok(model) = env::var("LLAMA_SWAP_TTS_MODEL") { + client.set_tts_model(model); + } Some(Arc::new(client)) } From 51be5df2148f53014ef5f0161c5f305dc17e0a92 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Tue, 2 Jun 2026 22:15:05 -0400 Subject: [PATCH 02/10] Clean insight text for TTS and 
pass through Chatterbox tuning knobs /tts/speech now normalizes input before synthesis: unwraps markdown links/images to visible text, drops heading/list/blockquote/emphasis markers and URLs, strips emoji (which non-turbo Chatterbox mispronounces or skips), and collapses whitespace. Centralized in clean_for_tts so the app, WebUI, and curl all get clean audio. Bracketed tags are deliberately preserved for a future Turbo (paralinguistic) switch. Adds optional exaggeration / cfg_weight / temperature to the request, clamped to Chatterbox's documented ranges and forwarded on the speech body. Unit tests cover markdown/emoji/URL stripping and tag preservation. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/ai/llamacpp.rs | 16 +++++++ src/ai/tts.rs | 104 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 118 insertions(+), 2 deletions(-) diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs index afd7f1b..2946688 100644 --- a/src/ai/llamacpp.rs +++ b/src/ai/llamacpp.rs @@ -138,11 +138,18 @@ impl LlamaCppClient { /// Synthesize speech for `input` in an optional named `voice`, returning /// the raw audio bytes (format per `response_format`, e.g. `"mp3"`/`"wav"`). + /// + /// Chatterbox generation knobs are forwarded when set (caller is expected + /// to have range-clamped them): `exaggeration` (0.25–2.0, emotion), + /// `cfg_weight` (0.0–1.0, pace), `temperature` (0.05–5.0, randomness). 
pub async fn text_to_speech( &self, input: &str, voice: Option<&str>, response_format: &str, + exaggeration: Option, + cfg_weight: Option, + temperature: Option, ) -> Result> { let url = format!("{}/audio/speech", self.base_url); let mut body = json!({ @@ -153,6 +160,15 @@ impl LlamaCppClient { if let Some(v) = voice { body["voice"] = Value::String(v.to_string()); } + if let Some(x) = exaggeration { + body["exaggeration"] = json!(x); + } + if let Some(x) = cfg_weight { + body["cfg_weight"] = json!(x); + } + if let Some(x) = temperature { + body["temperature"] = json!(x); + } let resp = self .client diff --git a/src/ai/tts.rs b/src/ai/tts.rs index b2bd675..8078132 100644 --- a/src/ai/tts.rs +++ b/src/ai/tts.rs @@ -11,9 +11,11 @@ use anyhow::Context; use base64::Engine; use bytes::{BufMut, BytesMut}; use futures::StreamExt; +use regex::Regex; use serde::{Deserialize, Serialize}; use serde_json::json; use std::path::Path; +use std::sync::LazyLock; use crate::data::Claims; use crate::file_types::{is_audio_file, is_video_file}; @@ -59,6 +61,55 @@ fn default_voice() -> Option { .filter(|s| !s.is_empty()) } +// Markdown / formatting strippers, compiled once. Insight text is markdown, +// which TTS would otherwise read literally ("star star bold star star"). 
+static MD_IMAGE: LazyLock = LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\([^)]*\)").unwrap()); +static MD_LINK: LazyLock = LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\([^)]*\)").unwrap()); +static MD_HEADING: LazyLock = + LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}#{1,6}\s*").unwrap()); +static MD_BLOCKQUOTE: LazyLock = LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}>\s?").unwrap()); +static MD_LIST: LazyLock = + LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}([-*+]|\d+\.)\s+").unwrap()); +static MD_EMPHASIS: LazyLock = LazyLock::new(|| Regex::new(r"[*_`~]+").unwrap()); +static URL_RE: LazyLock = LazyLock::new(|| Regex::new(r"https?://\S+").unwrap()); +static MULTISPACE: LazyLock = LazyLock::new(|| Regex::new(r"[ \t]{2,}").unwrap()); +static MULTINEWLINE: LazyLock = LazyLock::new(|| Regex::new(r"\n{3,}").unwrap()); + +/// True for emoji / pictographic symbols, which most TTS models either skip or +/// mispronounce. Covers the main emoji blocks plus dingbats, misc-technical, +/// variation selectors, and the ZWJ used to glue emoji sequences. We do NOT +/// strip `[bracketed]` tags — non-turbo Chatterbox ignores them, and a future +/// Turbo switch uses them as paralinguistic cues. +fn is_emoji_like(c: char) -> bool { + let u = c as u32; + matches!(u, + 0x1F000..=0x1FAFF // emoji, pictographs, supplemental symbols, flags + | 0x2300..=0x23FF // misc technical (⌚ ⏰ ⏳ …) + | 0x2600..=0x27BF // misc symbols + dingbats + | 0x2B00..=0x2BFF // misc symbols & arrows (★ ⬆ …) + | 0xFE00..=0xFE0F // variation selectors + | 0x200D // zero-width joiner + ) +} + +/// Normalize insight text for speech: unwrap markdown links/images to their +/// visible text, drop heading/list/blockquote/emphasis markers and URLs, strip +/// emoji, and collapse whitespace. Centralized here so every caller (app, +/// WebUI, curl) gets clean audio. 
+fn clean_for_tts(input: &str) -> String { + let s = MD_IMAGE.replace_all(input, "$1"); + let s = MD_LINK.replace_all(&s, "$1"); + let s = MD_HEADING.replace_all(&s, ""); + let s = MD_BLOCKQUOTE.replace_all(&s, ""); + let s = MD_LIST.replace_all(&s, ""); + let s = MD_EMPHASIS.replace_all(&s, ""); + let s = URL_RE.replace_all(&s, " "); + let s: String = s.chars().filter(|c| !is_emoji_like(*c)).collect(); + let s = MULTISPACE.replace_all(&s, " "); + let s = MULTINEWLINE.replace_all(&s, "\n\n"); + s.trim().to_string() +} + fn guess_audio_mime(path: &Path) -> String { match path .extension() @@ -84,6 +135,15 @@ pub struct TtsSpeechRequest { /// Audio container, e.g. `"mp3"` (default) or `"wav"`. #[serde(default)] pub format: Option, + /// Chatterbox knobs (clamped server-side). exaggeration 0.25–2.0 (emotion), + /// cfg_weight 0.0–1.0 (pace; ~0.3 for fast speakers, 0 to neutralize a + /// reference accent), temperature 0.05–5.0 (randomness). + #[serde(default)] + pub exaggeration: Option, + #[serde(default)] + pub cfg_weight: Option, + #[serde(default)] + pub temperature: Option, } #[derive(Debug, Serialize)] @@ -100,7 +160,7 @@ pub async fn tts_speech_handler( req: web::Json, app_state: web::Data, ) -> impl Responder { - let text = req.text.trim(); + let text = clean_for_tts(&req.text); if text.is_empty() { return HttpResponse::BadRequest().json(json!({ "error": "text is required" })); } @@ -121,7 +181,15 @@ pub async fn tts_speech_handler( .filter(|s| !s.is_empty()) .or(dv.as_deref()); - match client.text_to_speech(text, voice, format).await { + // Clamp generation knobs to Chatterbox's documented ranges before forwarding. 
+ let exaggeration = req.exaggeration.map(|x| x.clamp(0.25, 2.0)); + let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0)); + let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0)); + + match client + .text_to_speech(&text, voice, format, exaggeration, cfg_weight, temperature) + .await + { Ok(bytes) => { let audio_base64 = base64::engine::general_purpose::STANDARD.encode(&bytes); HttpResponse::Ok().json(TtsSpeechResponse { @@ -390,4 +458,36 @@ mod tests { "application/octet-stream" ); } + + #[test] + fn clean_for_tts_strips_markdown() { + assert_eq!( + clean_for_tts("**Bold** and _italic_ and `code`"), + "Bold and italic and code" + ); + assert_eq!(clean_for_tts("# Title\n\nbody"), "Title\n\nbody"); + assert_eq!( + clean_for_tts("See [docs](http://x.com) now"), + "See docs now" + ); + assert_eq!(clean_for_tts("- one\n- two"), "one\ntwo"); + } + + #[test] + fn clean_for_tts_strips_emoji_and_urls() { + assert_eq!(clean_for_tts("Hello 😀 world 🎉"), "Hello world"); + assert_eq!( + clean_for_tts("visit https://example.com today"), + "visit today" + ); + // ZWJ-glued emoji sequence is fully removed. + assert_eq!(clean_for_tts("family 👨‍👩‍👧 photo"), "family photo"); + } + + #[test] + fn clean_for_tts_preserves_bracket_tags() { + // Non-turbo Chatterbox ignores these; a future Turbo uses them as + // paralinguistic cues — so we must not strip them. + assert_eq!(clean_for_tts("hello [laugh] there"), "hello [laugh] there"); + } } From 35c5ecb427f7118440a0095e8730b2aa82d0e8e8 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Tue, 2 Jun 2026 22:34:34 -0400 Subject: [PATCH 03/10] Document TTS endpoints and env in README + .env.example Adds the /tts/speech and /tts/voices* endpoints plus LLAMA_SWAP_TTS_MODEL / LLAMA_SWAP_TTS_VOICE (TTS only needs LLAMA_SWAP_URL, not LLM_BACKEND=llamacpp). 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .env.example | 8 ++++++++ README.md | 19 +++++++++++++++++++ 2 files changed, 27 insertions(+) diff --git a/.env.example b/.env.example index f7a1004..835bef5 100644 --- a/.env.example +++ b/.env.example @@ -80,6 +80,14 @@ AGENTIC_CHAT_MAX_ITERATIONS=6 # LLAMA_SWAP_ALLOWED_MODELS=chat,vision,embed # LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180 +# ── Text-to-speech (optional, requires LLAMA_SWAP_URL) ─────────────────── +# TTS routes through the same llama-swap proxy (a Chatterbox model id), so it +# only needs LLAMA_SWAP_URL — it does NOT require LLM_BACKEND=llamacpp. +# Powers POST /tts/speech and the /tts/voices* endpoints (read-aloud insights +# + voice cloning in the mobile app). +# LLAMA_SWAP_TTS_MODEL=chatterbox # TTS model id in config.yaml +# LLAMA_SWAP_TTS_VOICE=m # default voice when a request omits one + # ── AI Insights — sibling services (optional) ─────────────────────────── # Apollo (places, face inference, CLIP encoders). Single-Apollo deploys # typically set only APOLLO_API_BASE_URL and let the face + CLIP diff --git a/README.md b/README.md index b6d764b..12c220f 100644 --- a/README.md +++ b/README.md @@ -147,6 +147,25 @@ so you can rewrite the saved summary from within chat. - `AGENTIC_CHAT_MAX_ITERATIONS` - Cap on tool-calling iterations per chat turn [default: `6`] - Per-request `max_iterations` (when sent by the client) is clamped to this cap +#### Text-to-Speech (Optional) +Reads insights aloud and manages cloned voices via a Chatterbox model served +behind the same llama-swap proxy. Only requires `LLAMA_SWAP_URL` (the TTS client +is built whenever that's set — independent of `LLM_BACKEND`). Endpoints: +- `POST /tts/speech` — body `{ text, voice?, format?, exaggeration?, cfg_weight?, + temperature? }`; returns `{ audio_base64, format }`. Input is cleaned + server-side (markdown + emoji stripped) and the generation knobs are clamped + to Chatterbox's ranges. 
+- `GET /tts/voices` — list the voice library. +- `POST /tts/voices/upload` — multipart `voice_name` + `voice_file`; clone a + voice from an uploaded clip (≤25 MB). +- `POST /tts/voices/from-library` — body `{ voice_name, path, library? }`; clone + from a library file (audio forwarded as-is; video has its audio extracted via + ffmpeg). + +Env: +- `LLAMA_SWAP_TTS_MODEL` - TTS model id in llama-swap's `config.yaml` [default: `chatterbox`] +- `LLAMA_SWAP_TTS_VOICE` - default voice used when a `/tts/speech` request omits `voice` (optional) + #### Fallback Behavior - Primary server is tried first with 5-second connection timeout - On failure, automatically falls back to secondary server (if configured) From 62d517dcdaba390ceb3db6a53aa86cd37569369a Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Tue, 2 Jun 2026 22:50:08 -0400 Subject: [PATCH 04/10] Normalize voice-clone reference audio to WAV via ffmpeg MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Chatterbox validates the reference clip by file extension and rejects formats like .aac/.opus. Always transcode the reference (upload bytes and library files alike) to mono 24 kHz WAV with ffmpeg before forwarding, so any source format is accepted and the from-library audio/video paths are unified. The reference length cap is now configurable via LLAMA_SWAP_TTS_REF_SECONDS (default 30) — Chatterbox is zero-shot, so a clean ~10-20s clip is the sweet spot. Drops the now-unused mime guesser. Co-Authored-By: Claude Opus 4.8 (1M context) --- .env.example | 1 + README.md | 4 ++ src/ai/tts.rs | 152 ++++++++++++++++++++++++-------------------------- 3 files changed, 79 insertions(+), 78 deletions(-) diff --git a/.env.example b/.env.example index 835bef5..2b6cff0 100644 --- a/.env.example +++ b/.env.example @@ -87,6 +87,7 @@ AGENTIC_CHAT_MAX_ITERATIONS=6 # + voice cloning in the mobile app). 
# LLAMA_SWAP_TTS_MODEL=chatterbox # TTS model id in config.yaml # LLAMA_SWAP_TTS_VOICE=m # default voice when a request omits one +# LLAMA_SWAP_TTS_REF_SECONDS=30 # max voice-clone reference clip length (s) # ── AI Insights — sibling services (optional) ─────────────────────────── # Apollo (places, face inference, CLIP encoders). Single-Apollo deploys diff --git a/README.md b/README.md index 12c220f..0b678df 100644 --- a/README.md +++ b/README.md @@ -165,6 +165,10 @@ is built whenever that's set — independent of `LLM_BACKEND`). Endpoints: Env: - `LLAMA_SWAP_TTS_MODEL` - TTS model id in llama-swap's `config.yaml` [default: `chatterbox`] - `LLAMA_SWAP_TTS_VOICE` - default voice used when a `/tts/speech` request omits `voice` (optional) +- `LLAMA_SWAP_TTS_REF_SECONDS` - max voice-clone reference clip length in seconds + [default: `30`]. Reference audio is ffmpeg-normalized to mono 24 kHz WAV (so any + source format works); Chatterbox is zero-shot, so a clean ~10–20s sample is the + sweet spot — more rarely helps. #### Fallback Behavior - Primary server is tried first with 5-second connection timeout diff --git a/src/ai/tts.rs b/src/ai/tts.rs index 8078132..59b4a80 100644 --- a/src/ai/tts.rs +++ b/src/ai/tts.rs @@ -110,21 +110,56 @@ fn clean_for_tts(input: &str) -> String { s.trim().to_string() } -fn guess_audio_mime(path: &Path) -> String { - match path - .extension() - .and_then(|e| e.to_str()) - .map(|e| e.to_lowercase()) - .as_deref() - { - Some("wav") => "audio/wav", - Some("mp3") => "audio/mpeg", - Some("m4a") | Some("mp4") | Some("aac") => "audio/mp4", - Some("flac") => "audio/flac", - Some("ogg") | Some("oga") => "audio/ogg", - _ => "application/octet-stream", +/// Decode an audio/video file to mono 24 kHz WAV via ffmpeg, returning the WAV +/// bytes. Chatterbox validates the reference clip by file *extension* and +/// rejects several formats (e.g. `.aac`, `.opus`), so we always normalize to +/// WAV regardless of the source container. 
Capped (default 30s) — references only need +/// a few seconds of clean speech. +async fn run_ffmpeg_to_wav(input_path: &str) -> anyhow::Result> { + let out = tempfile::Builder::new() + .suffix(".wav") + .tempfile() + .context("creating temp wav")?; + let out_s = out.path().to_string_lossy().to_string(); + + // Cap the reference clip length. Chatterbox is zero-shot — a clean ~10–20s + // sample is the sweet spot and more rarely helps — so we use the first N + // seconds. Tune via LLAMA_SWAP_TTS_REF_SECONDS (default 30). + let secs = std::env::var("LLAMA_SWAP_TTS_REF_SECONDS") + .ok() + .and_then(|s| s.trim().parse::().ok()) + .filter(|n| *n > 0) + .unwrap_or(30) + .to_string(); + + let output = tokio::process::Command::new("ffmpeg") + .args([ + "-y", "-i", input_path, "-vn", "-ac", "1", "-ar", "24000", "-t", &secs, "-f", "wav", + &out_s, + ]) + .output() + .await + .context("spawning ffmpeg")?; + + if !output.status.success() { + anyhow::bail!("ffmpeg failed: {}", String::from_utf8_lossy(&output.stderr)); } - .to_string() + std::fs::read(&out_s).context("reading transcoded audio") +} + +/// Normalize in-memory upload bytes to WAV: write to a temp file (keeping the +/// source extension as an ffmpeg probe hint) then transcode.
+async fn transcode_bytes_to_wav(input: &[u8], src_ext: Option<&str>) -> anyhow::Result> { + let suffix = src_ext + .filter(|e| !e.is_empty()) + .map(|e| format!(".{e}")) + .unwrap_or_else(|| ".bin".to_string()); + let in_tmp = tempfile::Builder::new() + .suffix(&suffix) + .tempfile() + .context("creating temp input")?; + std::fs::write(in_tmp.path(), input).context("writing temp input")?; + run_ffmpeg_to_wav(&in_tmp.path().to_string_lossy()).await } #[derive(Debug, Deserialize)] @@ -239,7 +274,6 @@ pub async fn create_voice_upload_handler( let mut voice_name: Option = None; let mut file_bytes = BytesMut::new(); let mut filename = "voice.wav".to_string(); - let mut mime = "application/octet-stream".to_string(); while let Some(Ok(mut part)) = payload.next().await { // Capture disposition fields up front so the immutable borrow ends @@ -254,9 +288,6 @@ pub async fn create_voice_upload_handler( if let Some(fname) = fname_opt { filename = fname; - if let Some(ct) = part.content_type() { - mime = ct.to_string(); - } while let Some(Ok(data)) = part.next().await { if file_bytes.len() + data.len() > MAX_VOICE_UPLOAD_BYTES { return HttpResponse::PayloadTooLarge() @@ -282,12 +313,21 @@ pub async fn create_voice_upload_handler( if file_bytes.is_empty() { return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" })); } - if !mime.starts_with("audio") { - mime = guess_audio_mime(Path::new(&filename)); - } + + // Normalize to WAV so any device format (e.g. .aac / .opus, which Chatterbox + // rejects by extension) is accepted. 
+ let src_ext = Path::new(&filename).extension().and_then(|e| e.to_str()); + let wav = match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext).await { + Ok(w) => w, + Err(e) => { + log::error!("voice upload transcode failed: {:?}", e); + return HttpResponse::BadRequest() + .json(json!({ "error": "couldn't decode that audio file" })); + } + }; match client - .create_voice(&name, file_bytes.to_vec(), &filename, &mime) + .create_voice(&name, wav, "reference.wav", "audio/wav") .await { Ok(v) => HttpResponse::Ok().json(v), @@ -308,8 +348,8 @@ pub struct CreateVoiceFromLibraryRequest { } /// POST /tts/voices/from-library — register a cloned voice from a file already -/// in a library. Audio files are forwarded as-is; video files have up to 30s -/// of their audio track extracted (mono, 24 kHz) via ffmpeg. +/// in a library. Audio and video alike are ffmpeg-normalized to a mono 24 kHz +/// WAV reference clip (length capped by LLAMA_SWAP_TTS_REF_SECONDS). #[post("/tts/voices/from-library")] pub async fn create_voice_from_library_handler( _claims: Claims, @@ -346,16 +386,17 @@ pub async fn create_voice_from_library_handler( .json(json!({ "error": "file is not an audio or video file" })); } - let (bytes, filename, mime) = match prepare_reference_audio(&abs).await { - Ok(t) => t, + let wav = match prepare_reference_audio(&abs).await { + Ok(b) => b, Err(e) => { log::error!("voice reference prep failed for {:?}: {:?}", abs, e); - return HttpResponse::BadRequest().json(json!({ "error": format!("{e}") })); + return HttpResponse::BadRequest() + .json(json!({ "error": "couldn't decode that file's audio" })); } }; match client - .create_voice(&voice_name, bytes, &filename, &mime) + .create_voice(&voice_name, wav, "reference.wav", "audio/wav") .await { Ok(v) => HttpResponse::Ok().json(v), @@ -366,44 +407,11 @@ pub async fn create_voice_from_library_handler( } } -/// Read a library file as reference audio. 
Audio is returned verbatim; video -/// has up to 30s of audio extracted to mono 24 kHz WAV via ffmpeg. -async fn prepare_reference_audio(abs: &Path) -> anyhow::Result<(Vec, String, String)> { - if is_video_file(abs) { - let tmp = tempfile::Builder::new() - .suffix(".wav") - .tempfile() - .context("creating temp wav")?; - let out = tmp.path().to_path_buf(); - let abs_s = abs.to_string_lossy().to_string(); - let out_s = out.to_string_lossy().to_string(); - - let output = tokio::process::Command::new("ffmpeg") - .args([ - "-y", "-i", &abs_s, "-vn", "-ac", "1", "-ar", "24000", "-t", "30", "-f", "wav", - &out_s, - ]) - .output() - .await - .context("spawning ffmpeg")?; - - if !output.status.success() { - anyhow::bail!( - "ffmpeg audio extraction failed: {}", - String::from_utf8_lossy(&output.stderr) - ); - } - let bytes = std::fs::read(&out).context("reading extracted audio")?; - Ok((bytes, "reference.wav".to_string(), "audio/wav".to_string())) - } else { - let bytes = std::fs::read(abs).context("reading audio file")?; - let filename = abs - .file_name() - .and_then(|f| f.to_str()) - .unwrap_or("reference") - .to_string(); - Ok((bytes, filename, guess_audio_mime(abs))) - } +/// Read a library file (audio or video) as a Chatterbox-ready reference: ffmpeg +/// decodes/extracts its audio to mono 24 kHz WAV. Reading straight from the +/// library path avoids slurping a (possibly large) video into memory. 
+async fn prepare_reference_audio(abs: &Path) -> anyhow::Result> { + run_ffmpeg_to_wav(&abs.to_string_lossy()).await } #[cfg(test)] @@ -447,18 +455,6 @@ mod tests { assert_eq!(sanitize_voice_name(&long).unwrap().len(), 64); } - #[test] - fn guess_audio_mime_maps_known_extensions() { - assert_eq!(guess_audio_mime(Path::new("clip.wav")), "audio/wav"); - assert_eq!(guess_audio_mime(Path::new("clip.MP3")), "audio/mpeg"); - assert_eq!(guess_audio_mime(Path::new("clip.m4a")), "audio/mp4"); - assert_eq!(guess_audio_mime(Path::new("clip.flac")), "audio/flac"); - assert_eq!( - guess_audio_mime(Path::new("clip.xyz")), - "application/octet-stream" - ); - } - #[test] fn clean_for_tts_strips_markdown() { assert_eq!( From ccacfe1113f74ac5e96231366ffbef26f8180e33 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Tue, 2 Jun 2026 23:10:43 -0400 Subject: [PATCH 05/10] Instrument TTS handlers with OTel spans (codebase standard) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Each /tts handler now opens an http.tts.* span via extract_context_from_request + global_tracer().start_with_context, sets Status::Ok / Status::error on every outcome, and records useful attributes (model, format, voice_name, byte counts) — matching the insight handlers. Prometheus request metrics were already covered by the app-wide actix-web-prom middleware. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/ai/tts.rs | 69 +++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 64 insertions(+), 5 deletions(-) diff --git a/src/ai/tts.rs b/src/ai/tts.rs index 59b4a80..9c98bee 100644 --- a/src/ai/tts.rs +++ b/src/ai/tts.rs @@ -6,11 +6,13 @@ // (audio read directly; video has its audio track extracted via ffmpeg). 
use actix_multipart::Multipart; -use actix_web::{HttpResponse, Responder, get, post, web}; +use actix_web::{HttpRequest, HttpResponse, Responder, get, post, web}; use anyhow::Context; use base64::Engine; use bytes::{BufMut, BytesMut}; use futures::StreamExt; +use opentelemetry::KeyValue; +use opentelemetry::trace::{Span, Status, Tracer}; use regex::Regex; use serde::{Deserialize, Serialize}; use serde_json::json; @@ -21,6 +23,7 @@ use crate::data::Claims; use crate::file_types::{is_audio_file, is_video_file}; use crate::files::is_valid_full_path; use crate::libraries; +use crate::otel::{extract_context_from_request, global_tracer}; use crate::state::AppState; /// Hard cap on an uploaded voice-reference clip. Chatterbox itself caps the @@ -191,15 +194,21 @@ pub struct TtsSpeechResponse { /// return base64-encoded audio for `data:` URI playback on the client. #[post("/tts/speech")] pub async fn tts_speech_handler( + http_request: HttpRequest, _claims: Claims, req: web::Json, app_state: web::Data, ) -> impl Responder { + let parent_context = extract_context_from_request(&http_request); + let mut span = global_tracer().start_with_context("http.tts.speech", &parent_context); + let text = clean_for_tts(&req.text); if text.is_empty() { + span.set_status(Status::error("text is required")); return HttpResponse::BadRequest().json(json!({ "error": "text is required" })); } let Some(client) = app_state.llamacpp.as_ref() else { + span.set_status(Status::error("tts backend not configured")); return HttpResponse::ServiceUnavailable() .json(json!({ "error": "TTS backend not configured (set LLAMA_SWAP_URL)" })); }; @@ -216,6 +225,11 @@ pub async fn tts_speech_handler( .filter(|s| !s.is_empty()) .or(dv.as_deref()); + span.set_attribute(KeyValue::new("tts.model", client.tts_model.clone())); + span.set_attribute(KeyValue::new("tts.format", format.to_string())); + span.set_attribute(KeyValue::new("tts.has_voice", voice.is_some())); + span.set_attribute(KeyValue::new("tts.text_len", 
text.len() as i64)); + // Clamp generation knobs to Chatterbox's documented ranges before forwarding. let exaggeration = req.exaggeration.map(|x| x.clamp(0.25, 2.0)); let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0)); @@ -226,6 +240,8 @@ pub async fn tts_speech_handler( .await { Ok(bytes) => { + span.set_attribute(KeyValue::new("tts.audio_bytes", bytes.len() as i64)); + span.set_status(Status::Ok); let audio_base64 = base64::engine::general_purpose::STANDARD.encode(&bytes); HttpResponse::Ok().json(TtsSpeechResponse { audio_base64, @@ -233,6 +249,7 @@ pub async fn tts_speech_handler( }) } Err(e) => { + span.set_status(Status::error("tts synthesis failed")); log::error!("TTS synth failed: {:?}", e); HttpResponse::BadGateway().json(json!({ "error": format!("TTS failed: {e}") })) } @@ -242,16 +259,25 @@ pub async fn tts_speech_handler( /// GET /tts/voices — list the Chatterbox voice library (raw passthrough). #[get("/tts/voices")] pub async fn list_voices_handler( + http_request: HttpRequest, _claims: Claims, app_state: web::Data, ) -> impl Responder { + let parent_context = extract_context_from_request(&http_request); + let mut span = global_tracer().start_with_context("http.tts.voices.list", &parent_context); + let Some(client) = app_state.llamacpp.as_ref() else { + span.set_status(Status::error("tts backend not configured")); return HttpResponse::ServiceUnavailable() .json(json!({ "error": "TTS backend not configured" })); }; match client.list_voices().await { - Ok(v) => HttpResponse::Ok().json(v), + Ok(v) => { + span.set_status(Status::Ok); + HttpResponse::Ok().json(v) + } Err(e) => { + span.set_status(Status::error("list_voices failed")); log::error!("list_voices failed: {:?}", e); HttpResponse::BadGateway().json(json!({ "error": format!("{e}") })) } @@ -262,11 +288,16 @@ pub async fn list_voices_handler( /// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`). 
#[post("/tts/voices/upload")] pub async fn create_voice_upload_handler( + http_request: HttpRequest, _claims: Claims, mut payload: Multipart, app_state: web::Data, ) -> impl Responder { + let parent_context = extract_context_from_request(&http_request); + let mut span = global_tracer().start_with_context("http.tts.voices.upload", &parent_context); + let Some(client) = app_state.llamacpp.as_ref() else { + span.set_status(Status::error("tts backend not configured")); return HttpResponse::ServiceUnavailable() .json(json!({ "error": "TTS backend not configured" })); }; @@ -290,6 +321,7 @@ pub async fn create_voice_upload_handler( filename = fname; while let Some(Ok(data)) = part.next().await { if file_bytes.len() + data.len() > MAX_VOICE_UPLOAD_BYTES { + span.set_status(Status::error("voice clip exceeds limit")); return HttpResponse::PayloadTooLarge() .json(json!({ "error": "voice clip exceeds 25 MB" })); } @@ -307,12 +339,16 @@ pub async fn create_voice_upload_handler( } let Some(name) = voice_name.as_deref().and_then(sanitize_voice_name) else { + span.set_status(Status::error("voice_name is required")); return HttpResponse::BadRequest() .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" })); }; if file_bytes.is_empty() { + span.set_status(Status::error("voice_file is required")); return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" })); } + span.set_attribute(KeyValue::new("tts.voice_name", name.clone())); + span.set_attribute(KeyValue::new("tts.upload_bytes", file_bytes.len() as i64)); // Normalize to WAV so any device format (e.g. .aac / .opus, which Chatterbox // rejects by extension) is accepted. 
@@ -320,6 +356,7 @@ pub async fn create_voice_upload_handler( let wav = match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext).await { Ok(w) => w, Err(e) => { + span.set_status(Status::error("audio decode failed")); log::error!("voice upload transcode failed: {:?}", e); return HttpResponse::BadRequest() .json(json!({ "error": "couldn't decode that audio file" })); @@ -330,8 +367,12 @@ pub async fn create_voice_upload_handler( .create_voice(&name, wav, "reference.wav", "audio/wav") .await { - Ok(v) => HttpResponse::Ok().json(v), + Ok(v) => { + span.set_status(Status::Ok); + HttpResponse::Ok().json(v) + } Err(e) => { + span.set_status(Status::error("create_voice failed")); log::error!("create_voice (upload) failed: {:?}", e); HttpResponse::BadGateway().json(json!({ "error": format!("{e}") })) } @@ -352,15 +393,22 @@ pub struct CreateVoiceFromLibraryRequest { /// WAV reference clip (length capped by LLAMA_SWAP_TTS_REF_SECONDS). #[post("/tts/voices/from-library")] pub async fn create_voice_from_library_handler( + http_request: HttpRequest, _claims: Claims, req: web::Json, app_state: web::Data, ) -> impl Responder { + let parent_context = extract_context_from_request(&http_request); + let mut span = + global_tracer().start_with_context("http.tts.voices.from_library", &parent_context); + let Some(client) = app_state.llamacpp.as_ref() else { + span.set_status(Status::error("tts backend not configured")); return HttpResponse::ServiceUnavailable() .json(json!({ "error": "TTS backend not configured" })); }; let Some(voice_name) = sanitize_voice_name(&req.voice_name) else { + span.set_status(Status::error("voice_name is required")); return HttpResponse::BadRequest() .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" })); }; @@ -368,13 +416,17 @@ pub async fn create_voice_from_library_handler( let library = match libraries::resolve_library_param(&app_state, req.library.as_deref()) { Ok(Some(l)) => l, Ok(None) => app_state.primary_library(), - 
Err(msg) => return HttpResponse::BadRequest().json(json!({ "error": msg })), + Err(msg) => { + span.set_status(Status::error("invalid library")); + return HttpResponse::BadRequest().json(json!({ "error": msg })); + } }; // is_valid_full_path confines the path to the library root (no traversal). let abs = match is_valid_full_path(&library.root_path, &req.path, false) { Some(p) if p.exists() => p, _ => { + span.set_status(Status::error("file not found")); return HttpResponse::NotFound().json(json!({ "error": "file not found in library" })); } }; @@ -382,13 +434,16 @@ pub async fn create_voice_from_library_handler( // Only real audio/video sources are valid voice references — refuse to // slurp arbitrary library files into memory / ffmpeg. if !is_audio_file(&abs) && !is_video_file(&abs) { + span.set_status(Status::error("not an audio/video file")); return HttpResponse::BadRequest() .json(json!({ "error": "file is not an audio or video file" })); } + span.set_attribute(KeyValue::new("tts.voice_name", voice_name.clone())); let wav = match prepare_reference_audio(&abs).await { Ok(b) => b, Err(e) => { + span.set_status(Status::error("audio decode failed")); log::error!("voice reference prep failed for {:?}: {:?}", abs, e); return HttpResponse::BadRequest() .json(json!({ "error": "couldn't decode that file's audio" })); @@ -399,8 +454,12 @@ pub async fn create_voice_from_library_handler( .create_voice(&voice_name, wav, "reference.wav", "audio/wav") .await { - Ok(v) => HttpResponse::Ok().json(v), + Ok(v) => { + span.set_status(Status::Ok); + HttpResponse::Ok().json(v) + } Err(e) => { + span.set_status(Status::error("create_voice failed")); log::error!("create_voice (from-library) failed: {:?}", e); HttpResponse::BadGateway().json(json!({ "error": format!("{e}") })) } From 9978b28b52bf4ecf6673109bed5a4384f3f59b4b Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Tue, 2 Jun 2026 23:15:39 -0400 Subject: [PATCH 06/10] Document TTS endpoints + env in CLAUDE.md Sync CLAUDE.md 
with the Chatterbox TTS feature: the /tts/* endpoints and the LLAMA_SWAP_TTS_MODEL / _VOICE / _REF_SECONDS env vars (only need LLAMA_SWAP_URL). Co-Authored-By: Claude Opus 4.8 (1M context) --- CLAUDE.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index 7f1da76..b5e1ee2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -477,6 +477,12 @@ GET /insights/models (local-backend models + capabilities; Ollam GET /insights/openrouter/models (curated OpenRouter allowlist) POST /insights/rate (thumbs up/down for training data) +// Text-to-Speech (Chatterbox via llama-swap; needs LLAMA_SWAP_URL) +POST /tts/speech (read-aloud: { text, voice?, ... } -> { audio_base64, format }) +GET /tts/voices (Chatterbox voice library) +POST /tts/voices/upload (clone a voice from an uploaded clip; multipart) +POST /tts/voices/from-library (clone a voice from a library audio/video file) + // Insight Chat Continuation POST /insights/chat (single-turn reply, non-streaming) POST /insights/chat/stream (SSE: text / tool_call / tool_result / truncated / done) @@ -652,6 +658,15 @@ LLAMA_SWAP_ALLOWED_MODELS=chat,coder # Curated allowlist surfaced by # Empty = picker shows only the configured primary model. LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180 # Per-request timeout; bump for slow CPU offload +# Text-to-speech (Chatterbox served behind llama-swap). Only needs +# LLAMA_SWAP_URL — independent of LLM_BACKEND. Powers /tts/speech (read-aloud) +# and /tts/voices* (voice cloning). Reference audio is ffmpeg-normalized to WAV +# server-side, so any source format works. 
+LLAMA_SWAP_TTS_MODEL=chatterbox # TTS model id in config.yaml (default: chatterbox) +LLAMA_SWAP_TTS_VOICE=m # Default voice when /tts/speech omits one (optional) +LLAMA_SWAP_TTS_REF_SECONDS=30 # Max voice-clone reference clip length, seconds + # (Chatterbox is zero-shot; ~10-20s clean ref is ideal) + # Insight Chat Continuation AGENTIC_CHAT_MAX_ITERATIONS=6 # Cap on tool-calling iterations per chat turn (default 6) ``` From d8dd260c6bfa22b0e6b78092e5e2c62a4a41fc38 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Wed, 3 Jun 2026 10:25:06 -0400 Subject: [PATCH 07/10] Give TTS synthesis its own (longer) request timeout Long insights are chunked + synthesized server-side and can run past the shared 180s chat/embedding client timeout, causing spurious timeouts. /tts/speech now uses a per-request timeout from LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS (default 600), overriding the client default without affecting chat/embeddings. Co-Authored-By: Claude Opus 4.8 (1M context) --- .env.example | 1 + CLAUDE.md | 2 ++ README.md | 4 ++++ src/ai/llamacpp.rs | 10 ++++++++++ 4 files changed, 17 insertions(+) diff --git a/.env.example b/.env.example index 2b6cff0..a45fdd5 100644 --- a/.env.example +++ b/.env.example @@ -88,6 +88,7 @@ AGENTIC_CHAT_MAX_ITERATIONS=6 # LLAMA_SWAP_TTS_MODEL=chatterbox # TTS model id in config.yaml # LLAMA_SWAP_TTS_VOICE=m # default voice when a request omits one # LLAMA_SWAP_TTS_REF_SECONDS=30 # max voice-clone reference clip length (s) +# LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS=600 # synth timeout (long chunked text) # ── AI Insights — sibling services (optional) ─────────────────────────── # Apollo (places, face inference, CLIP encoders). 
Single-Apollo deploys diff --git a/CLAUDE.md b/CLAUDE.md index b5e1ee2..fba33e0 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -666,6 +666,8 @@ LLAMA_SWAP_TTS_MODEL=chatterbox # TTS model id in config.yaml (de LLAMA_SWAP_TTS_VOICE=m # Default voice when /tts/speech omits one (optional) LLAMA_SWAP_TTS_REF_SECONDS=30 # Max voice-clone reference clip length, seconds # (Chatterbox is zero-shot; ~10-20s clean ref is ideal) +LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS=600 # Per-request synth timeout (long chunked insights take + # minutes); overrides the shared client timeout for /tts/speech # Insight Chat Continuation AGENTIC_CHAT_MAX_ITERATIONS=6 # Cap on tool-calling iterations per chat turn (default 6) diff --git a/README.md b/README.md index 0b678df..58ddc81 100644 --- a/README.md +++ b/README.md @@ -169,6 +169,10 @@ Env: [default: `30`]. Reference audio is ffmpeg-normalized to mono 24 kHz WAV (so any source format works); Chatterbox is zero-shot, so a clean ~10–20s sample is the sweet spot — more rarely helps. +- `LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS` - per-request synthesis timeout in + seconds [default: `600`]. Long insights are chunked + synthesized server-side + and can take minutes; this is separate from (and overrides, for `/tts/speech`) + the shared `LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS`. #### Fallback Behavior - Primary server is tried first with 5-second connection timeout diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs index 2946688..d56b645 100644 --- a/src/ai/llamacpp.rs +++ b/src/ai/llamacpp.rs @@ -170,9 +170,19 @@ impl LlamaCppClient { body["temperature"] = json!(x); } + // TTS gets its own (longer) timeout: synthesizing a long, internally + // chunked insight can take minutes, well past the shared chat/embedding + // client timeout. Per-request `.timeout()` overrides the client default. 
+ let tts_timeout = std::env::var("LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS") + .ok() + .and_then(|v| v.parse::().ok()) + .filter(|n| *n > 0) + .unwrap_or(600); + let resp = self .client .post(&url) + .timeout(Duration::from_secs(tts_timeout)) .json(&body) .send() .await From cab867da609a3c8356a9e36c62faf86da75a7da1 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Wed, 3 Jun 2026 14:02:56 -0400 Subject: [PATCH 08/10] Serialize /tts/speech with a single permit; 429 when busy The Chatterbox wrapper has no internal lock or cancellation, so concurrent synth requests contend on the single GPU and abandoned (timed-out) jobs cascade into stacked slowness. Gate synthesis behind a one-permit semaphore and fast-fail concurrent requests with 429 instead of queueing. Co-Authored-By: Claude Opus 4.8 (1M context) --- README.md | 3 ++- src/ai/tts.rs | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 58ddc81..39ebe30 100644 --- a/README.md +++ b/README.md @@ -154,7 +154,8 @@ is built whenever that's set — independent of `LLM_BACKEND`). Endpoints: - `POST /tts/speech` — body `{ text, voice?, format?, exaggeration?, cfg_weight?, temperature? }`; returns `{ audio_base64, format }`. Input is cleaned server-side (markdown + emoji stripped) and the generation knobs are clamped - to Chatterbox's ranges. + to Chatterbox's ranges. Synthesis is serialized (one at a time — the upstream + has no GPU lock of its own); a concurrent request gets a fast `429`. - `GET /tts/voices` — list the voice library. - `POST /tts/voices/upload` — multipart `voice_name` + `voice_file`; clone a voice from an uploaded clip (≤25 MB). 
diff --git a/src/ai/tts.rs b/src/ai/tts.rs index 9c98bee..2c2009b 100644 --- a/src/ai/tts.rs +++ b/src/ai/tts.rs @@ -18,6 +18,7 @@ use serde::{Deserialize, Serialize}; use serde_json::json; use std::path::Path; use std::sync::LazyLock; +use tokio::sync::Semaphore; use crate::data::Claims; use crate::file_types::{is_audio_file, is_video_file}; @@ -31,6 +32,14 @@ use crate::state::AppState; /// upload can't balloon ImageApi memory before we ever forward it. const MAX_VOICE_UPLOAD_BYTES: usize = 25 * 1024 * 1024; // 25 MB +/// Serialize speech synthesis: the Chatterbox server has no internal lock or +/// queue, so concurrent requests contend on the single GPU and cascade into +/// timeouts. One permit; when busy we fast-fail with 429 rather than queue — +/// the app surfaces "busy" immediately, and typical jobs clear in well under a +/// minute. (An abandoned upstream job can still occupy the GPU until it +/// finishes — that's a wrapper limitation; the chunked-queue plan fixes it.) +static TTS_PERMIT: LazyLock = LazyLock::new(|| Semaphore::new(1)); + /// Sanitize a user-supplied voice name. The name is forwarded to Chatterbox /// where it becomes a filename in the voice-library directory, so we restrict /// it to a safe charset (alphanumerics, dash, underscore) — no path @@ -235,6 +244,14 @@ pub async fn tts_speech_handler( let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0)); let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0)); + // One synthesis at a time (see TTS_PERMIT) — fast-fail when busy. 
+ let Ok(_permit) = TTS_PERMIT.try_acquire() else { + span.set_status(Status::error("tts busy")); + return HttpResponse::TooManyRequests().json(json!({ + "error": "TTS is busy with another request — try again shortly" + })); + }; + match client .text_to_speech(&text, voice, format, exaggeration, cfg_weight, temperature) .await From dec6f21af9ae91e6d36527c61451f7c145e57bbb Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Wed, 3 Jun 2026 14:07:10 -0400 Subject: [PATCH 09/10] Bump version to 1.3.0 TTS feature release: /tts/speech + voice library endpoints (Chatterbox via llama-swap), input cleaning, tuning knobs, WAV-normalized voice cloning, OTel spans, dedicated synth timeout, and single-flight serialization. Co-Authored-By: Claude Opus 4.8 (1M context) --- Cargo.lock | 2 +- Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d35048c..a35a7d2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2051,7 +2051,7 @@ dependencies = [ [[package]] name = "image-api" -version = "1.2.0" +version = "1.3.0" dependencies = [ "actix", "actix-cors", diff --git a/Cargo.toml b/Cargo.toml index 6807778..3b3a08a 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "image-api" -version = "1.2.0" +version = "1.3.0" authors = ["Cameron Cordes "] edition = "2024" From 412da2ce8ed7b29955e5ec750cc3df35c1a121a1 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Thu, 4 Jun 2026 09:12:43 -0400 Subject: [PATCH 10/10] Collapse blank lines to a single break in TTS text cleaning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Chatterbox inserts a long pause — sometimes ~20s of silence — for each blank line it sees, and insight text is markdown full of paragraph breaks. clean_for_tts previously preserved paragraph structure (\n{3,} -> \n\n), so every paragraph boundary still reached the model as a double newline. 
Now any run of 2+ newlines, including whitespace-only blank lines, collapses to a single newline so the worst pause a break can cause is a normal line-break pause. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/ai/tts.rs | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/src/ai/tts.rs b/src/ai/tts.rs index 2c2009b..b94be36 100644 --- a/src/ai/tts.rs +++ b/src/ai/tts.rs @@ -85,7 +85,10 @@ static MD_LIST: LazyLock = static MD_EMPHASIS: LazyLock = LazyLock::new(|| Regex::new(r"[*_`~]+").unwrap()); static URL_RE: LazyLock = LazyLock::new(|| Regex::new(r"https?://\S+").unwrap()); static MULTISPACE: LazyLock = LazyLock::new(|| Regex::new(r"[ \t]{2,}").unwrap()); -static MULTINEWLINE: LazyLock = LazyLock::new(|| Regex::new(r"\n{3,}").unwrap()); +// Any run of 2+ newlines (incl. whitespace-only blank lines) collapses to ONE +// newline: Chatterbox inserts a long pause (sometimes ~20s of silence) per +// blank line, so paragraph breaks must reach it as a single line break at most. +static MULTINEWLINE: LazyLock = LazyLock::new(|| Regex::new(r"\n(?:[ \t]*\n)+").unwrap()); /// True for emoji / pictographic symbols, which most TTS models either skip or /// mispronounce. 
Covers the main emoji blocks plus dingbats, misc-technical, @@ -118,7 +121,7 @@ fn clean_for_tts(input: &str) -> String { let s = URL_RE.replace_all(&s, " "); let s: String = s.chars().filter(|c| !is_emoji_like(*c)).collect(); let s = MULTISPACE.replace_all(&s, " "); - let s = MULTINEWLINE.replace_all(&s, "\n\n"); + let s = MULTINEWLINE.replace_all(&s, "\n"); s.trim().to_string() } @@ -537,7 +540,7 @@ mod tests { clean_for_tts("**Bold** and _italic_ and `code`"), "Bold and italic and code" ); - assert_eq!(clean_for_tts("# Title\n\nbody"), "Title\n\nbody"); + assert_eq!(clean_for_tts("# Title\n\nbody"), "Title\nbody"); assert_eq!( clean_for_tts("See [docs](http://x.com) now"), "See docs now" @@ -556,6 +559,18 @@ mod tests { assert_eq!(clean_for_tts("family 👨‍👩‍👧 photo"), "family photo"); } + #[test] + fn clean_for_tts_collapses_blank_lines_to_single_break() { + // Chatterbox pauses (sometimes ~20s) per blank line, so paragraph + // breaks must collapse to a single newline. + assert_eq!(clean_for_tts("para one\n\npara two"), "para one\npara two"); + assert_eq!(clean_for_tts("a\n\n\n\nb"), "a\nb"); + // Whitespace-only "blank" lines collapse too. + assert_eq!(clean_for_tts("a\n \t \nb"), "a\nb"); + // A single newline is left alone. + assert_eq!(clean_for_tts("a\nb"), "a\nb"); + } + #[test] fn clean_for_tts_preserves_bracket_tags() { // Non-turbo Chatterbox ignores these; a future Turbo uses them as