// TTS endpoints: proxy text-to-speech + voice-library management to the // Chatterbox server that sits behind llama-swap (via LlamaCppClient). Speech // synthesis returns audio as base64-in-JSON so the mobile app can play it as a // `data:` URI without a binary-fetch path. Voice cloning registers a named // voice from either an uploaded clip (device) or an existing library file // (audio read directly; video has its audio track extracted via ffmpeg). use actix_multipart::Multipart; use actix_web::{HttpRequest, HttpResponse, Responder, get, post, web}; use anyhow::Context; use base64::Engine; use bytes::{BufMut, BytesMut}; use futures::StreamExt; use opentelemetry::KeyValue; use opentelemetry::trace::{Span, Status, Tracer}; use regex::Regex; use serde::{Deserialize, Serialize}; use serde_json::json; use std::path::Path; use std::sync::LazyLock; use tokio::sync::Semaphore; use crate::data::Claims; use crate::file_types::{is_audio_file, is_video_file}; use crate::files::is_valid_full_path; use crate::libraries; use crate::otel::{extract_context_from_request, global_tracer}; use crate::state::AppState; /// Hard cap on an uploaded voice-reference clip. Chatterbox itself caps the /// payload (~60s clip); this is a defensive ceiling so a hostile/oversized /// upload can't balloon ImageApi memory before we ever forward it. const MAX_VOICE_UPLOAD_BYTES: usize = 25 * 1024 * 1024; // 25 MB /// Serialize speech synthesis: the Chatterbox server has no internal lock or /// queue, so concurrent requests contend on the single GPU and cascade into /// timeouts. One permit; when busy we fast-fail with 429 rather than queue — /// the app surfaces "busy" immediately, and typical jobs clear in well under a /// minute. (An abandoned upstream job can still occupy the GPU until it /// finishes — that's a wrapper limitation; the chunked-queue plan fixes it.) static TTS_PERMIT: LazyLock = LazyLock::new(|| Semaphore::new(1)); /// Sanitize a user-supplied voice name. The name is forwarded to Chatterbox /// where it becomes a filename in the voice-library directory, so we restrict /// it to a safe charset (alphanumerics, dash, underscore) — no path /// separators, dots, or whitespace — and bound its length. Returns `None` /// when nothing usable remains. fn sanitize_voice_name(raw: &str) -> Option { let cleaned: String = raw .trim() .chars() .map(|c| { if c.is_ascii_alphanumeric() || c == '-' || c == '_' { c } else { '-' } }) .collect(); let cleaned = cleaned.trim_matches('-').to_string(); if cleaned.is_empty() { return None; } Some(cleaned.chars().take(64).collect()) } /// Optional default voice for synthesis when the request doesn't name one. /// Set `LLAMA_SWAP_TTS_VOICE=m` to read insights in a cloned voice by default. fn default_voice() -> Option { std::env::var("LLAMA_SWAP_TTS_VOICE") .ok() .map(|s| s.trim().to_string()) .filter(|s| !s.is_empty()) } // Markdown / formatting strippers, compiled once. Insight text is markdown, // which TTS would otherwise read literally ("star star bold star star"). static MD_IMAGE: LazyLock = LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\([^)]*\)").unwrap()); static MD_LINK: LazyLock = LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\([^)]*\)").unwrap()); static MD_HEADING: LazyLock = LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}#{1,6}\s*").unwrap()); static MD_BLOCKQUOTE: LazyLock = LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}>\s?").unwrap()); static MD_LIST: LazyLock = LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}([-*+]|\d+\.)\s+").unwrap()); static MD_EMPHASIS: LazyLock = LazyLock::new(|| Regex::new(r"[*_`~]+").unwrap()); static URL_RE: LazyLock = LazyLock::new(|| Regex::new(r"https?://\S+").unwrap()); static MULTISPACE: LazyLock = LazyLock::new(|| Regex::new(r"[ \t]{2,}").unwrap()); // Any run of 2+ newlines (incl. whitespace-only blank lines) collapses to ONE // newline: Chatterbox inserts a long pause (sometimes ~20s of silence) per // blank line, so paragraph breaks must reach it as a single line break at most. static MULTINEWLINE: LazyLock = LazyLock::new(|| Regex::new(r"\n(?:[ \t]*\n)+").unwrap()); /// True for emoji / pictographic symbols, which most TTS models either skip or /// mispronounce. Covers the main emoji blocks plus dingbats, misc-technical, /// variation selectors, and the ZWJ used to glue emoji sequences. We do NOT /// strip `[bracketed]` tags — non-turbo Chatterbox ignores them, and a future /// Turbo switch uses them as paralinguistic cues. fn is_emoji_like(c: char) -> bool { let u = c as u32; matches!(u, 0x1F000..=0x1FAFF // emoji, pictographs, supplemental symbols, flags | 0x2300..=0x23FF // misc technical (⌚ ⏰ ⏳ …) | 0x2600..=0x27BF // misc symbols + dingbats | 0x2B00..=0x2BFF // misc symbols & arrows (★ ⬆ …) | 0xFE00..=0xFE0F // variation selectors | 0x200D // zero-width joiner ) } /// Normalize insight text for speech: unwrap markdown links/images to their /// visible text, drop heading/list/blockquote/emphasis markers and URLs, strip /// emoji, and collapse whitespace. Centralized here so every caller (app, /// WebUI, curl) gets clean audio. fn clean_for_tts(input: &str) -> String { let s = MD_IMAGE.replace_all(input, "$1"); let s = MD_LINK.replace_all(&s, "$1"); let s = MD_HEADING.replace_all(&s, ""); let s = MD_BLOCKQUOTE.replace_all(&s, ""); let s = MD_LIST.replace_all(&s, ""); let s = MD_EMPHASIS.replace_all(&s, ""); let s = URL_RE.replace_all(&s, " "); let s: String = s.chars().filter(|c| !is_emoji_like(*c)).collect(); let s = MULTISPACE.replace_all(&s, " "); let s = MULTINEWLINE.replace_all(&s, "\n"); s.trim().to_string() } /// Decode an audio/video file to mono 24 kHz WAV via ffmpeg, returning the WAV /// bytes. Chatterbox validates the reference clip by file *extension* and /// rejects several formats (e.g. `.aac`, `.opus`), so we always normalize to /// WAV regardless of the source container. Capped at 30s — references only need /// a few seconds of clean speech. async fn run_ffmpeg_to_wav(input_path: &str) -> anyhow::Result> { let out = tempfile::Builder::new() .suffix(".wav") .tempfile() .context("creating temp wav")?; let out_s = out.path().to_string_lossy().to_string(); // Cap the reference clip length. Chatterbox is zero-shot — a clean ~10–20s // sample is the sweet spot and more rarely helps — so we use the first N // seconds. Tune via LLAMA_SWAP_TTS_REF_SECONDS (default 30). let secs = std::env::var("LLAMA_SWAP_TTS_REF_SECONDS") .ok() .and_then(|s| s.trim().parse::().ok()) .filter(|n| *n > 0) .unwrap_or(30) .to_string(); let output = tokio::process::Command::new("ffmpeg") .args([ "-y", "-i", input_path, "-vn", "-ac", "1", "-ar", "24000", "-t", &secs, "-f", "wav", &out_s, ]) .output() .await .context("spawning ffmpeg")?; if !output.status.success() { anyhow::bail!("ffmpeg failed: {}", String::from_utf8_lossy(&output.stderr)); } std::fs::read(&out_s).context("reading transcoded audio") } /// Normalize in-memory upload bytes to WAV: write to a temp file (keeping the /// source extension as an ffmpeg probe hint) then transcode. async fn transcode_bytes_to_wav(input: &[u8], src_ext: Option<&str>) -> anyhow::Result> { let suffix = src_ext .filter(|e| !e.is_empty()) .map(|e| format!(".{e}")) .unwrap_or_else(|| ".bin".to_string()); let in_tmp = tempfile::Builder::new() .suffix(&suffix) .tempfile() .context("creating temp input")?; std::fs::write(in_tmp.path(), input).context("writing temp input")?; run_ffmpeg_to_wav(&in_tmp.path().to_string_lossy()).await } #[derive(Debug, Deserialize)] pub struct TtsSpeechRequest { pub text: String, #[serde(default)] pub voice: Option, /// Audio container, e.g. `"mp3"` (default) or `"wav"`. #[serde(default)] pub format: Option, /// Chatterbox knobs (clamped server-side). exaggeration 0.25–2.0 (emotion), /// cfg_weight 0.0–1.0 (pace; ~0.3 for fast speakers, 0 to neutralize a /// reference accent), temperature 0.05–5.0 (randomness). #[serde(default)] pub exaggeration: Option, #[serde(default)] pub cfg_weight: Option, #[serde(default)] pub temperature: Option, } #[derive(Debug, Serialize)] pub struct TtsSpeechResponse { pub audio_base64: String, pub format: String, } /// POST /tts/speech — synthesize `text` (optionally in a named `voice`) and /// return base64-encoded audio for `data:` URI playback on the client. #[post("/tts/speech")] pub async fn tts_speech_handler( http_request: HttpRequest, _claims: Claims, req: web::Json, app_state: web::Data, ) -> impl Responder { let parent_context = extract_context_from_request(&http_request); let mut span = global_tracer().start_with_context("http.tts.speech", &parent_context); let text = clean_for_tts(&req.text); if text.is_empty() { span.set_status(Status::error("text is required")); return HttpResponse::BadRequest().json(json!({ "error": "text is required" })); } let Some(client) = app_state.llamacpp.as_ref() else { span.set_status(Status::error("tts backend not configured")); return HttpResponse::ServiceUnavailable() .json(json!({ "error": "TTS backend not configured (set LLAMA_SWAP_URL)" })); }; let format = req .format .as_deref() .filter(|s| !s.is_empty()) .unwrap_or("mp3"); let dv = default_voice(); let voice = req .voice .as_deref() .filter(|s| !s.is_empty()) .or(dv.as_deref()); span.set_attribute(KeyValue::new("tts.model", client.tts_model.clone())); span.set_attribute(KeyValue::new("tts.format", format.to_string())); span.set_attribute(KeyValue::new("tts.has_voice", voice.is_some())); span.set_attribute(KeyValue::new("tts.text_len", text.len() as i64)); // Clamp generation knobs to Chatterbox's documented ranges before forwarding. let exaggeration = req.exaggeration.map(|x| x.clamp(0.25, 2.0)); let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0)); let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0)); // One synthesis at a time (see TTS_PERMIT) — fast-fail when busy. let Ok(_permit) = TTS_PERMIT.try_acquire() else { span.set_status(Status::error("tts busy")); return HttpResponse::TooManyRequests().json(json!({ "error": "TTS is busy with another request — try again shortly" })); }; match client .text_to_speech(&text, voice, format, exaggeration, cfg_weight, temperature) .await { Ok(bytes) => { span.set_attribute(KeyValue::new("tts.audio_bytes", bytes.len() as i64)); span.set_status(Status::Ok); let audio_base64 = base64::engine::general_purpose::STANDARD.encode(&bytes); HttpResponse::Ok().json(TtsSpeechResponse { audio_base64, format: format.to_string(), }) } Err(e) => { span.set_status(Status::error("tts synthesis failed")); log::error!("TTS synth failed: {:?}", e); HttpResponse::BadGateway().json(json!({ "error": format!("TTS failed: {e}") })) } } } /// GET /tts/voices — list the Chatterbox voice library (raw passthrough). #[get("/tts/voices")] pub async fn list_voices_handler( http_request: HttpRequest, _claims: Claims, app_state: web::Data, ) -> impl Responder { let parent_context = extract_context_from_request(&http_request); let mut span = global_tracer().start_with_context("http.tts.voices.list", &parent_context); let Some(client) = app_state.llamacpp.as_ref() else { span.set_status(Status::error("tts backend not configured")); return HttpResponse::ServiceUnavailable() .json(json!({ "error": "TTS backend not configured" })); }; match client.list_voices().await { Ok(v) => { span.set_status(Status::Ok); HttpResponse::Ok().json(v) } Err(e) => { span.set_status(Status::error("list_voices failed")); log::error!("list_voices failed: {:?}", e); HttpResponse::BadGateway().json(json!({ "error": format!("{e}") })) } } } /// POST /tts/voices/upload — register a cloned voice from an uploaded audio /// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`). #[post("/tts/voices/upload")] pub async fn create_voice_upload_handler( http_request: HttpRequest, _claims: Claims, mut payload: Multipart, app_state: web::Data, ) -> impl Responder { let parent_context = extract_context_from_request(&http_request); let mut span = global_tracer().start_with_context("http.tts.voices.upload", &parent_context); let Some(client) = app_state.llamacpp.as_ref() else { span.set_status(Status::error("tts backend not configured")); return HttpResponse::ServiceUnavailable() .json(json!({ "error": "TTS backend not configured" })); }; let mut voice_name: Option = None; let mut file_bytes = BytesMut::new(); let mut filename = "voice.wav".to_string(); while let Some(Ok(mut part)) = payload.next().await { // Capture disposition fields up front so the immutable borrow ends // before we mutably stream the part body (mirrors handlers/image.rs). let (fname_opt, name_opt) = { let cd = part.content_disposition(); ( cd.and_then(|c| c.get_filename()).map(|s| s.to_string()), cd.and_then(|c| c.get_name()).map(|s| s.to_string()), ) }; if let Some(fname) = fname_opt { filename = fname; while let Some(Ok(data)) = part.next().await { if file_bytes.len() + data.len() > MAX_VOICE_UPLOAD_BYTES { span.set_status(Status::error("voice clip exceeds limit")); return HttpResponse::PayloadTooLarge() .json(json!({ "error": "voice clip exceeds 25 MB" })); } file_bytes.put(data); } } else if name_opt.as_deref() == Some("voice_name") { let mut buf = BytesMut::new(); while let Some(Ok(data)) = part.next().await { buf.put(data); } voice_name = Some(String::from_utf8_lossy(&buf).trim().to_string()); } else { while let Some(Ok(_)) = part.next().await {} } } let Some(name) = voice_name.as_deref().and_then(sanitize_voice_name) else { span.set_status(Status::error("voice_name is required")); return HttpResponse::BadRequest() .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" })); }; if file_bytes.is_empty() { span.set_status(Status::error("voice_file is required")); return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" })); } span.set_attribute(KeyValue::new("tts.voice_name", name.clone())); span.set_attribute(KeyValue::new("tts.upload_bytes", file_bytes.len() as i64)); // Normalize to WAV so any device format (e.g. .aac / .opus, which Chatterbox // rejects by extension) is accepted. let src_ext = Path::new(&filename).extension().and_then(|e| e.to_str()); let wav = match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext).await { Ok(w) => w, Err(e) => { span.set_status(Status::error("audio decode failed")); log::error!("voice upload transcode failed: {:?}", e); return HttpResponse::BadRequest() .json(json!({ "error": "couldn't decode that audio file" })); } }; match client .create_voice(&name, wav, "reference.wav", "audio/wav") .await { Ok(v) => { span.set_status(Status::Ok); HttpResponse::Ok().json(v) } Err(e) => { span.set_status(Status::error("create_voice failed")); log::error!("create_voice (upload) failed: {:?}", e); HttpResponse::BadGateway().json(json!({ "error": format!("{e}") })) } } } #[derive(Debug, Deserialize)] pub struct CreateVoiceFromLibraryRequest { pub voice_name: String, /// Library-relative path to an audio or video file. pub path: String, #[serde(default)] pub library: Option, } /// POST /tts/voices/from-library — register a cloned voice from a file already /// in a library. Audio and video alike are ffmpeg-normalized to a mono 24 kHz /// WAV reference clip (length capped by LLAMA_SWAP_TTS_REF_SECONDS). #[post("/tts/voices/from-library")] pub async fn create_voice_from_library_handler( http_request: HttpRequest, _claims: Claims, req: web::Json, app_state: web::Data, ) -> impl Responder { let parent_context = extract_context_from_request(&http_request); let mut span = global_tracer().start_with_context("http.tts.voices.from_library", &parent_context); let Some(client) = app_state.llamacpp.as_ref() else { span.set_status(Status::error("tts backend not configured")); return HttpResponse::ServiceUnavailable() .json(json!({ "error": "TTS backend not configured" })); }; let Some(voice_name) = sanitize_voice_name(&req.voice_name) else { span.set_status(Status::error("voice_name is required")); return HttpResponse::BadRequest() .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" })); }; let library = match libraries::resolve_library_param(&app_state, req.library.as_deref()) { Ok(Some(l)) => l, Ok(None) => app_state.primary_library(), Err(msg) => { span.set_status(Status::error("invalid library")); return HttpResponse::BadRequest().json(json!({ "error": msg })); } }; // is_valid_full_path confines the path to the library root (no traversal). let abs = match is_valid_full_path(&library.root_path, &req.path, false) { Some(p) if p.exists() => p, _ => { span.set_status(Status::error("file not found")); return HttpResponse::NotFound().json(json!({ "error": "file not found in library" })); } }; // Only real audio/video sources are valid voice references — refuse to // slurp arbitrary library files into memory / ffmpeg. if !is_audio_file(&abs) && !is_video_file(&abs) { span.set_status(Status::error("not an audio/video file")); return HttpResponse::BadRequest() .json(json!({ "error": "file is not an audio or video file" })); } span.set_attribute(KeyValue::new("tts.voice_name", voice_name.clone())); let wav = match prepare_reference_audio(&abs).await { Ok(b) => b, Err(e) => { span.set_status(Status::error("audio decode failed")); log::error!("voice reference prep failed for {:?}: {:?}", abs, e); return HttpResponse::BadRequest() .json(json!({ "error": "couldn't decode that file's audio" })); } }; match client .create_voice(&voice_name, wav, "reference.wav", "audio/wav") .await { Ok(v) => { span.set_status(Status::Ok); HttpResponse::Ok().json(v) } Err(e) => { span.set_status(Status::error("create_voice failed")); log::error!("create_voice (from-library) failed: {:?}", e); HttpResponse::BadGateway().json(json!({ "error": format!("{e}") })) } } } /// Read a library file (audio or video) as a Chatterbox-ready reference: ffmpeg /// decodes/extracts its audio to mono 24 kHz WAV. Reading straight from the /// library path avoids slurping a (possibly large) video into memory. async fn prepare_reference_audio(abs: &Path) -> anyhow::Result> { run_ffmpeg_to_wav(&abs.to_string_lossy()).await } #[cfg(test)] mod tests { use super::*; #[test] fn sanitize_voice_name_keeps_safe_chars() { assert_eq!(sanitize_voice_name("m").as_deref(), Some("m")); assert_eq!( sanitize_voice_name(" Cameron ").as_deref(), Some("Cameron") ); assert_eq!( sanitize_voice_name("voice_01-a").as_deref(), Some("voice_01-a") ); } #[test] fn sanitize_voice_name_strips_unsafe_chars() { // Path separators / dots / spaces become '-' and are trimmed at edges. assert_eq!(sanitize_voice_name("a b.c").as_deref(), Some("a-b-c")); assert_eq!( sanitize_voice_name("../etc/passwd").as_deref(), Some("etc-passwd") ); } #[test] fn sanitize_voice_name_rejects_empty_or_all_unsafe() { assert_eq!(sanitize_voice_name(""), None); assert_eq!(sanitize_voice_name(" "), None); assert_eq!(sanitize_voice_name("../../"), None); assert_eq!(sanitize_voice_name("...."), None); } #[test] fn sanitize_voice_name_bounds_length() { let long = "a".repeat(200); assert_eq!(sanitize_voice_name(&long).unwrap().len(), 64); } #[test] fn clean_for_tts_strips_markdown() { assert_eq!( clean_for_tts("**Bold** and _italic_ and `code`"), "Bold and italic and code" ); assert_eq!(clean_for_tts("# Title\n\nbody"), "Title\nbody"); assert_eq!( clean_for_tts("See [docs](http://x.com) now"), "See docs now" ); assert_eq!(clean_for_tts("- one\n- two"), "one\ntwo"); } #[test] fn clean_for_tts_strips_emoji_and_urls() { assert_eq!(clean_for_tts("Hello 😀 world 🎉"), "Hello world"); assert_eq!( clean_for_tts("visit https://example.com today"), "visit today" ); // ZWJ-glued emoji sequence is fully removed. assert_eq!(clean_for_tts("family 👨‍👩‍👧 photo"), "family photo"); } #[test] fn clean_for_tts_collapses_blank_lines_to_single_break() { // Chatterbox pauses (sometimes ~20s) per blank line, so paragraph // breaks must collapse to a single newline. assert_eq!(clean_for_tts("para one\n\npara two"), "para one\npara two"); assert_eq!(clean_for_tts("a\n\n\n\nb"), "a\nb"); // Whitespace-only "blank" lines collapse too. assert_eq!(clean_for_tts("a\n \t \nb"), "a\nb"); // A single newline is left alone. assert_eq!(clean_for_tts("a\nb"), "a\nb"); } #[test] fn clean_for_tts_preserves_bracket_tags() { // Non-turbo Chatterbox ignores these; a future Turbo uses them as // paralinguistic cues — so we must not strip them. assert_eq!(clean_for_tts("hello [laugh] there"), "hello [laugh] there"); } }