Files
ImageApi/src/ai/tts.rs
T
Cameron Cordes 412da2ce8e Collapse blank lines to a single break in TTS text cleaning
Chatterbox inserts a long pause — sometimes ~20s of silence — for each
blank line it sees, and insight text is markdown full of paragraph
breaks. clean_for_tts previously preserved paragraph structure
(\n{3,} -> \n\n), so every paragraph boundary still reached the model
as a double newline. Now any run of 2+ newlines, including
whitespace-only blank lines, collapses to a single newline so the
worst pause a break can cause is a normal line-break pause.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-04 09:12:43 -04:00

581 lines
23 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// TTS endpoints: proxy text-to-speech + voice-library management to the
// Chatterbox server that sits behind llama-swap (via LlamaCppClient). Speech
// synthesis returns audio as base64-in-JSON so the mobile app can play it as a
// `data:` URI without a binary-fetch path. Voice cloning registers a named
// voice from either an uploaded clip (device) or an existing library file
// (audio read directly; video has its audio track extracted via ffmpeg).
use actix_multipart::Multipart;
use actix_web::{HttpRequest, HttpResponse, Responder, get, post, web};
use anyhow::Context;
use base64::Engine;
use bytes::{BufMut, BytesMut};
use futures::StreamExt;
use opentelemetry::KeyValue;
use opentelemetry::trace::{Span, Status, Tracer};
use regex::Regex;
use serde::{Deserialize, Serialize};
use serde_json::json;
use std::path::Path;
use std::sync::LazyLock;
use tokio::sync::Semaphore;
use crate::data::Claims;
use crate::file_types::{is_audio_file, is_video_file};
use crate::files::is_valid_full_path;
use crate::libraries;
use crate::otel::{extract_context_from_request, global_tracer};
use crate::state::AppState;
/// Upper bound, in bytes, on the total size of an uploaded voice-reference
/// clip (25 MiB). Chatterbox applies its own payload cap (~60s clip); this
/// limit exists so an oversized or hostile upload is rejected while it is
/// still being buffered here, before anything is forwarded upstream.
const MAX_VOICE_UPLOAD_BYTES: usize = 25 * (1 << 20);
/// Serialize speech synthesis: the Chatterbox server has no internal lock or
/// queue, so concurrent requests contend on the single GPU and cascade into
/// timeouts. One permit; when busy we fast-fail with 429 rather than queue —
/// the app surfaces "busy" immediately, and typical jobs clear in well under a
/// minute. (An abandoned upstream job can still occupy the GPU until it
/// finishes — that's a wrapper limitation; the chunked-queue plan fixes it.)
///
/// The semaphore is process-wide and built lazily on first access with exactly
/// one permit. Callers are expected to use the non-waiting `try_acquire` and
/// hold the returned permit for the full duration of the upstream call.
static TTS_PERMIT: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(1));
/// Sanitize a user-supplied voice name. The name is forwarded to Chatterbox
/// where it becomes a filename in the voice-library directory, so we restrict
/// it to a safe charset (ASCII alphanumerics, dash, underscore) — no path
/// separators, dots, or whitespace — and bound its length.
///
/// Steps:
/// 1. Trim surrounding whitespace.
/// 2. Replace every character outside the safe charset with `-`.
/// 3. Strip leading/trailing `-`.
/// 4. Keep at most 64 characters, then strip trailing `-` again so that a
///    truncation landing on a replaced character never leaves a dangling dash.
///
/// Returns `None` when nothing usable remains.
fn sanitize_voice_name(raw: &str) -> Option<String> {
    const MAX_LEN: usize = 64;

    let mapped: String = raw
        .trim()
        .chars()
        .map(|c| {
            if c.is_ascii_alphanumeric() || c == '-' || c == '_' {
                c
            } else {
                '-'
            }
        })
        .collect();

    // Every surviving character is ASCII, so char count == byte length.
    let bounded: String = mapped.trim_matches('-').chars().take(MAX_LEN).collect();

    // The cut above may end on a '-' that used to be interior; trim it so the
    // "no dashes at the edges" guarantee also holds for truncated names.
    let name = bounded.trim_end_matches('-');

    (!name.is_empty()).then(|| name.to_string())
}
/// Fallback voice used for synthesis when a request does not name one.
///
/// Reads the `LLAMA_SWAP_TTS_VOICE` environment variable (e.g.
/// `LLAMA_SWAP_TTS_VOICE=m` to read insights in a cloned voice by default).
/// The value is whitespace-trimmed; an unset, unreadable, or blank variable
/// yields `None`.
fn default_voice() -> Option<String> {
    let configured = std::env::var("LLAMA_SWAP_TTS_VOICE").ok()?;
    let name = configured.trim();
    if name.is_empty() {
        None
    } else {
        Some(name.to_string())
    }
}
// Markdown / formatting strippers, compiled once. Insight text is markdown,
// which TTS would otherwise read literally ("star star bold star star").
// Each regex is built lazily on first use; the `unwrap()`s can only fail if a
// pattern literal below is itself invalid.
// `![alt](target)` — capture group 1 is the (possibly empty) alt text.
static MD_IMAGE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\([^)]*\)").unwrap());
// `[text](target)` — capture group 1 is the non-empty link text.
static MD_LINK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\([^)]*\)").unwrap());
// Line-leading heading marker: up to 3 whitespace chars, 1–6 `#`, then any
// trailing whitespace. Note `\s` also matches newlines.
static MD_HEADING: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}#{1,6}\s*").unwrap());
// Line-leading blockquote marker `>` plus at most one following whitespace.
static MD_BLOCKQUOTE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}>\s?").unwrap());
// Line-leading list marker: a `-`, `*`, `+` bullet or an `N.` ordinal,
// followed by at least one whitespace character.
static MD_LIST: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}([-*+]|\d+\.)\s+").unwrap());
// Any run of `*`, `_`, backtick or `~`, anywhere in the text — this also
// removes underscores inside words (e.g. snake_case identifiers).
static MD_EMPHASIS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[*_`~]+").unwrap());
// Bare http/https URL, running up to the next whitespace character.
static URL_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"https?://\S+").unwrap());
// Two or more consecutive spaces/tabs (newlines are not included).
static MULTISPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[ \t]{2,}").unwrap());
// Any run of 2+ newlines (incl. whitespace-only blank lines) collapses to ONE
// newline: Chatterbox inserts a long pause (sometimes ~20s of silence) per
// blank line, so paragraph breaks must reach it as a single line break at most.
// Only `\n` is recognized as a line terminator here; `\r` is not matched.
static MULTINEWLINE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n(?:[ \t]*\n)+").unwrap());
/// True for emoji / pictographic symbols, which most TTS models either skip or
/// mispronounce. Covers the main emoji blocks plus dingbats, misc-technical,
/// and the invisible "glue" code points that only occur inside emoji
/// sequences: variation selectors, the zero-width joiner, the combining
/// enclosing keycap (so `1️⃣` reduces to a plain `1`), and the tag characters
/// used by subdivision-flag sequences. Without the last two, stripping the
/// visible emoji would leave stray combining/invisible characters behind.
///
/// We do NOT strip `[bracketed]` tags — non-turbo Chatterbox ignores them, and
/// a future Turbo switch uses them as paralinguistic cues.
fn is_emoji_like(c: char) -> bool {
    let u = c as u32;
    matches!(u,
        0x1F000..=0x1FAFF   // emoji, pictographs, supplemental symbols, flags
        | 0x2300..=0x23FF   // misc technical (⌚ ⏰ ⏳ …)
        | 0x2600..=0x27BF   // misc symbols + dingbats (★ ☀ ✂ …)
        | 0x2B00..=0x2BFF   // misc symbols & arrows (⭐ ⬆ …)
        | 0xFE00..=0xFE0F   // variation selectors
        | 0x200D            // zero-width joiner
        | 0x20E3            // combining enclosing keycap
        | 0xE0020..=0xE007F // emoji tag characters + cancel tag
    )
}
/// Normalize insight text for speech: unwrap markdown links/images to their
/// visible text, drop heading/list/blockquote/emphasis markers and URLs, strip
/// emoji, and collapse whitespace. Centralized here so every caller (app,
/// WebUI, curl) gets clean audio.
///
/// Line endings are normalized to `\n` first. The blank-line collapsing regex
/// only understands `\n`, so without this step CRLF input (`"a\r\n\r\nb"`)
/// would keep its blank lines and still trigger Chatterbox's long pauses.
///
/// The passes run in a fixed order: links/images are unwrapped before
/// emphasis markers and bare URLs are removed, and emoji are filtered before
/// whitespace is collapsed so the gaps they leave behind are squeezed too.
fn clean_for_tts(input: &str) -> String {
    // CRLF first, then any remaining lone CR, so "\r\n" never becomes "\n\n".
    let normalized = input.replace("\r\n", "\n").replace('\r', "\n");

    let s = MD_IMAGE.replace_all(&normalized, "$1");
    let s = MD_LINK.replace_all(&s, "$1");
    let s = MD_HEADING.replace_all(&s, "");
    let s = MD_BLOCKQUOTE.replace_all(&s, "");
    let s = MD_LIST.replace_all(&s, "");
    let s = MD_EMPHASIS.replace_all(&s, "");
    // URLs become a space (not nothing) so neighbouring words stay separated.
    let s = URL_RE.replace_all(&s, " ");
    let s: String = s.chars().filter(|c| !is_emoji_like(*c)).collect();
    let s = MULTISPACE.replace_all(&s, " ");
    let s = MULTINEWLINE.replace_all(&s, "\n");
    s.trim().to_string()
}
/// Decode an audio/video file to mono 24 kHz WAV via ffmpeg, returning the WAV
/// bytes. Chatterbox validates the reference clip by file *extension* and
/// rejects several formats (e.g. `.aac`, `.opus`), so we always normalize to
/// WAV regardless of the source container. Capped at 30s — references only need
/// a few seconds of clean speech.
async fn run_ffmpeg_to_wav(input_path: &str) -> anyhow::Result<Vec<u8>> {
let out = tempfile::Builder::new()
.suffix(".wav")
.tempfile()
.context("creating temp wav")?;
let out_s = out.path().to_string_lossy().to_string();
// Cap the reference clip length. Chatterbox is zero-shot — a clean ~1020s
// sample is the sweet spot and more rarely helps — so we use the first N
// seconds. Tune via LLAMA_SWAP_TTS_REF_SECONDS (default 30).
let secs = std::env::var("LLAMA_SWAP_TTS_REF_SECONDS")
.ok()
.and_then(|s| s.trim().parse::<u32>().ok())
.filter(|n| *n > 0)
.unwrap_or(30)
.to_string();
let output = tokio::process::Command::new("ffmpeg")
.args([
"-y", "-i", input_path, "-vn", "-ac", "1", "-ar", "24000", "-t", &secs, "-f", "wav",
&out_s,
])
.output()
.await
.context("spawning ffmpeg")?;
if !output.status.success() {
anyhow::bail!("ffmpeg failed: {}", String::from_utf8_lossy(&output.stderr));
}
std::fs::read(&out_s).context("reading transcoded audio")
}
/// Transcode an in-memory upload to WAV. The bytes are staged in a temp file
/// whose suffix is the source extension (an ffmpeg probe hint), or `.bin`
/// when no usable extension is known, and that file is then handed to
/// `run_ffmpeg_to_wav`.
async fn transcode_bytes_to_wav(input: &[u8], src_ext: Option<&str>) -> anyhow::Result<Vec<u8>> {
    let suffix = match src_ext {
        Some(ext) if !ext.is_empty() => format!(".{ext}"),
        _ => ".bin".to_string(),
    };

    // The staged file lives until this function returns, i.e. for the whole
    // ffmpeg run.
    let staged = tempfile::Builder::new()
        .suffix(&suffix)
        .tempfile()
        .context("creating temp input")?;
    std::fs::write(staged.path(), input).context("writing temp input")?;

    let staged_path = staged.path().to_string_lossy();
    run_ffmpeg_to_wav(&staged_path).await
}
/// JSON body of `POST /tts/speech`. Every field except `text` is optional and
/// deserializes to `None` when omitted.
#[derive(Debug, Deserialize)]
pub struct TtsSpeechRequest {
/// Text to speak. The handler runs it through `clean_for_tts` and rejects
/// the request if nothing remains afterwards.
pub text: String,
/// Name of the voice to use. When missing or empty the handler falls back
/// to `default_voice()`.
#[serde(default)]
pub voice: Option<String>,
/// Audio container, e.g. `"mp3"` (default) or `"wav"`.
#[serde(default)]
pub format: Option<String>,
/// Chatterbox knobs (clamped server-side). exaggeration 0.25–2.0 (emotion),
/// cfg_weight 0.0–1.0 (pace; ~0.3 for fast speakers, 0 to neutralize a
/// reference accent), temperature 0.05–5.0 (randomness).
#[serde(default)]
pub exaggeration: Option<f32>,
#[serde(default)]
pub cfg_weight: Option<f32>,
#[serde(default)]
pub temperature: Option<f32>,
}
/// JSON body returned by a successful `POST /tts/speech`.
#[derive(Debug, Serialize)]
pub struct TtsSpeechResponse {
/// Synthesized audio, encoded with the standard base64 alphabet.
pub audio_base64: String,
/// Audio container that was requested from the backend (echoes the
/// effective `format`, i.e. `"mp3"` when the request did not set one).
pub format: String,
}
/// POST /tts/speech — synthesize `text` (optionally in a named `voice`) and
/// return base64-encoded audio for `data:` URI playback on the client.
#[post("/tts/speech")]
pub async fn tts_speech_handler(
http_request: HttpRequest,
_claims: Claims,
req: web::Json<TtsSpeechRequest>,
app_state: web::Data<AppState>,
) -> impl Responder {
let parent_context = extract_context_from_request(&http_request);
let mut span = global_tracer().start_with_context("http.tts.speech", &parent_context);
let text = clean_for_tts(&req.text);
if text.is_empty() {
span.set_status(Status::error("text is required"));
return HttpResponse::BadRequest().json(json!({ "error": "text is required" }));
}
let Some(client) = app_state.llamacpp.as_ref() else {
span.set_status(Status::error("tts backend not configured"));
return HttpResponse::ServiceUnavailable()
.json(json!({ "error": "TTS backend not configured (set LLAMA_SWAP_URL)" }));
};
let format = req
.format
.as_deref()
.filter(|s| !s.is_empty())
.unwrap_or("mp3");
let dv = default_voice();
let voice = req
.voice
.as_deref()
.filter(|s| !s.is_empty())
.or(dv.as_deref());
span.set_attribute(KeyValue::new("tts.model", client.tts_model.clone()));
span.set_attribute(KeyValue::new("tts.format", format.to_string()));
span.set_attribute(KeyValue::new("tts.has_voice", voice.is_some()));
span.set_attribute(KeyValue::new("tts.text_len", text.len() as i64));
// Clamp generation knobs to Chatterbox's documented ranges before forwarding.
let exaggeration = req.exaggeration.map(|x| x.clamp(0.25, 2.0));
let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0));
let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0));
// One synthesis at a time (see TTS_PERMIT) — fast-fail when busy.
let Ok(_permit) = TTS_PERMIT.try_acquire() else {
span.set_status(Status::error("tts busy"));
return HttpResponse::TooManyRequests().json(json!({
"error": "TTS is busy with another request — try again shortly"
}));
};
match client
.text_to_speech(&text, voice, format, exaggeration, cfg_weight, temperature)
.await
{
Ok(bytes) => {
span.set_attribute(KeyValue::new("tts.audio_bytes", bytes.len() as i64));
span.set_status(Status::Ok);
let audio_base64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
HttpResponse::Ok().json(TtsSpeechResponse {
audio_base64,
format: format.to_string(),
})
}
Err(e) => {
span.set_status(Status::error("tts synthesis failed"));
log::error!("TTS synth failed: {:?}", e);
HttpResponse::BadGateway().json(json!({ "error": format!("TTS failed: {e}") }))
}
}
}
/// GET /tts/voices — list the Chatterbox voice library (raw passthrough).
///
/// Responds 503 when no TTS backend is configured and 502 when the backend
/// call fails; otherwise the backend's listing is returned as JSON unchanged.
#[get("/tts/voices")]
pub async fn list_voices_handler(
    http_request: HttpRequest,
    _claims: Claims,
    app_state: web::Data<AppState>,
) -> impl Responder {
    let otel_parent = extract_context_from_request(&http_request);
    let mut span = global_tracer().start_with_context("http.tts.voices.list", &otel_parent);

    let client = match app_state.llamacpp.as_ref() {
        Some(backend) => backend,
        None => {
            span.set_status(Status::error("tts backend not configured"));
            return HttpResponse::ServiceUnavailable()
                .json(json!({ "error": "TTS backend not configured" }));
        }
    };

    let listing = client.list_voices().await;
    match listing {
        Err(e) => {
            span.set_status(Status::error("list_voices failed"));
            log::error!("list_voices failed: {:?}", e);
            HttpResponse::BadGateway().json(json!({ "error": format!("{e}") }))
        }
        Ok(voices) => {
            span.set_status(Status::Ok);
            HttpResponse::Ok().json(voices)
        }
    }
}
/// POST /tts/voices/upload — register a cloned voice from an uploaded audio
/// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`).
#[post("/tts/voices/upload")]
pub async fn create_voice_upload_handler(
http_request: HttpRequest,
_claims: Claims,
mut payload: Multipart,
app_state: web::Data<AppState>,
) -> impl Responder {
let parent_context = extract_context_from_request(&http_request);
let mut span = global_tracer().start_with_context("http.tts.voices.upload", &parent_context);
let Some(client) = app_state.llamacpp.as_ref() else {
span.set_status(Status::error("tts backend not configured"));
return HttpResponse::ServiceUnavailable()
.json(json!({ "error": "TTS backend not configured" }));
};
let mut voice_name: Option<String> = None;
let mut file_bytes = BytesMut::new();
let mut filename = "voice.wav".to_string();
while let Some(Ok(mut part)) = payload.next().await {
// Capture disposition fields up front so the immutable borrow ends
// before we mutably stream the part body (mirrors handlers/image.rs).
let (fname_opt, name_opt) = {
let cd = part.content_disposition();
(
cd.and_then(|c| c.get_filename()).map(|s| s.to_string()),
cd.and_then(|c| c.get_name()).map(|s| s.to_string()),
)
};
if let Some(fname) = fname_opt {
filename = fname;
while let Some(Ok(data)) = part.next().await {
if file_bytes.len() + data.len() > MAX_VOICE_UPLOAD_BYTES {
span.set_status(Status::error("voice clip exceeds limit"));
return HttpResponse::PayloadTooLarge()
.json(json!({ "error": "voice clip exceeds 25 MB" }));
}
file_bytes.put(data);
}
} else if name_opt.as_deref() == Some("voice_name") {
let mut buf = BytesMut::new();
while let Some(Ok(data)) = part.next().await {
buf.put(data);
}
voice_name = Some(String::from_utf8_lossy(&buf).trim().to_string());
} else {
while let Some(Ok(_)) = part.next().await {}
}
}
let Some(name) = voice_name.as_deref().and_then(sanitize_voice_name) else {
span.set_status(Status::error("voice_name is required"));
return HttpResponse::BadRequest()
.json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
};
if file_bytes.is_empty() {
span.set_status(Status::error("voice_file is required"));
return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" }));
}
span.set_attribute(KeyValue::new("tts.voice_name", name.clone()));
span.set_attribute(KeyValue::new("tts.upload_bytes", file_bytes.len() as i64));
// Normalize to WAV so any device format (e.g. .aac / .opus, which Chatterbox
// rejects by extension) is accepted.
let src_ext = Path::new(&filename).extension().and_then(|e| e.to_str());
let wav = match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext).await {
Ok(w) => w,
Err(e) => {
span.set_status(Status::error("audio decode failed"));
log::error!("voice upload transcode failed: {:?}", e);
return HttpResponse::BadRequest()
.json(json!({ "error": "couldn't decode that audio file" }));
}
};
match client
.create_voice(&name, wav, "reference.wav", "audio/wav")
.await
{
Ok(v) => {
span.set_status(Status::Ok);
HttpResponse::Ok().json(v)
}
Err(e) => {
span.set_status(Status::error("create_voice failed"));
log::error!("create_voice (upload) failed: {:?}", e);
HttpResponse::BadGateway().json(json!({ "error": format!("{e}") }))
}
}
}
/// JSON body of `POST /tts/voices/from-library`.
#[derive(Debug, Deserialize)]
pub struct CreateVoiceFromLibraryRequest {
/// Requested voice name; the handler passes it through
/// `sanitize_voice_name` and rejects the request if nothing usable remains.
pub voice_name: String,
/// Library-relative path to an audio or video file.
pub path: String,
/// Which library `path` belongs to. Optional; the handler falls back to
/// the primary library when `resolve_library_param` yields no library
/// (presumably the case when this field is omitted — confirm against
/// `libraries::resolve_library_param`).
#[serde(default)]
pub library: Option<String>,
}
/// POST /tts/voices/from-library — register a cloned voice from a file already
/// in a library. Audio and video alike are ffmpeg-normalized to a mono 24 kHz
/// WAV reference clip (length capped by LLAMA_SWAP_TTS_REF_SECONDS).
#[post("/tts/voices/from-library")]
pub async fn create_voice_from_library_handler(
http_request: HttpRequest,
_claims: Claims,
req: web::Json<CreateVoiceFromLibraryRequest>,
app_state: web::Data<AppState>,
) -> impl Responder {
let parent_context = extract_context_from_request(&http_request);
let mut span =
global_tracer().start_with_context("http.tts.voices.from_library", &parent_context);
let Some(client) = app_state.llamacpp.as_ref() else {
span.set_status(Status::error("tts backend not configured"));
return HttpResponse::ServiceUnavailable()
.json(json!({ "error": "TTS backend not configured" }));
};
let Some(voice_name) = sanitize_voice_name(&req.voice_name) else {
span.set_status(Status::error("voice_name is required"));
return HttpResponse::BadRequest()
.json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
};
let library = match libraries::resolve_library_param(&app_state, req.library.as_deref()) {
Ok(Some(l)) => l,
Ok(None) => app_state.primary_library(),
Err(msg) => {
span.set_status(Status::error("invalid library"));
return HttpResponse::BadRequest().json(json!({ "error": msg }));
}
};
// is_valid_full_path confines the path to the library root (no traversal).
let abs = match is_valid_full_path(&library.root_path, &req.path, false) {
Some(p) if p.exists() => p,
_ => {
span.set_status(Status::error("file not found"));
return HttpResponse::NotFound().json(json!({ "error": "file not found in library" }));
}
};
// Only real audio/video sources are valid voice references — refuse to
// slurp arbitrary library files into memory / ffmpeg.
if !is_audio_file(&abs) && !is_video_file(&abs) {
span.set_status(Status::error("not an audio/video file"));
return HttpResponse::BadRequest()
.json(json!({ "error": "file is not an audio or video file" }));
}
span.set_attribute(KeyValue::new("tts.voice_name", voice_name.clone()));
let wav = match prepare_reference_audio(&abs).await {
Ok(b) => b,
Err(e) => {
span.set_status(Status::error("audio decode failed"));
log::error!("voice reference prep failed for {:?}: {:?}", abs, e);
return HttpResponse::BadRequest()
.json(json!({ "error": "couldn't decode that file's audio" }));
}
};
match client
.create_voice(&voice_name, wav, "reference.wav", "audio/wav")
.await
{
Ok(v) => {
span.set_status(Status::Ok);
HttpResponse::Ok().json(v)
}
Err(e) => {
span.set_status(Status::error("create_voice failed"));
log::error!("create_voice (from-library) failed: {:?}", e);
HttpResponse::BadGateway().json(json!({ "error": format!("{e}") }))
}
}
}
/// Turn a library file (audio or video) into a Chatterbox-ready reference
/// clip by letting ffmpeg decode/extract its audio to mono 24 kHz WAV. ffmpeg
/// is pointed at the library path directly, so a (possibly large) video is
/// never loaded into memory here.
async fn prepare_reference_audio(abs: &Path) -> anyhow::Result<Vec<u8>> {
    let source = abs.to_string_lossy();
    run_ffmpeg_to_wav(&source).await
}
// Unit tests for the pure helpers in this module: voice-name sanitization and
// TTS text cleaning. The HTTP handlers and ffmpeg helpers are not covered here.
#[cfg(test)]
mod tests {
use super::*;
// Names already inside the safe charset pass through (modulo outer trim).
#[test]
fn sanitize_voice_name_keeps_safe_chars() {
assert_eq!(sanitize_voice_name("m").as_deref(), Some("m"));
assert_eq!(
sanitize_voice_name(" Cameron ").as_deref(),
Some("Cameron")
);
assert_eq!(
sanitize_voice_name("voice_01-a").as_deref(),
Some("voice_01-a")
);
}
#[test]
fn sanitize_voice_name_strips_unsafe_chars() {
// Path separators / dots / spaces become '-' and are trimmed at edges.
assert_eq!(sanitize_voice_name("a b.c").as_deref(), Some("a-b-c"));
assert_eq!(
sanitize_voice_name("../etc/passwd").as_deref(),
Some("etc-passwd")
);
}
// Inputs that reduce to nothing but dashes must be rejected outright.
#[test]
fn sanitize_voice_name_rejects_empty_or_all_unsafe() {
assert_eq!(sanitize_voice_name(""), None);
assert_eq!(sanitize_voice_name(" "), None);
assert_eq!(sanitize_voice_name("../../"), None);
assert_eq!(sanitize_voice_name("...."), None);
}
// Over-long names are cut to the 64-character bound.
#[test]
fn sanitize_voice_name_bounds_length() {
let long = "a".repeat(200);
assert_eq!(sanitize_voice_name(&long).unwrap().len(), 64);
}
// Emphasis, headings, links and list bullets are removed; visible text stays.
#[test]
fn clean_for_tts_strips_markdown() {
assert_eq!(
clean_for_tts("**Bold** and _italic_ and `code`"),
"Bold and italic and code"
);
assert_eq!(clean_for_tts("# Title\n\nbody"), "Title\nbody");
assert_eq!(
clean_for_tts("See [docs](http://x.com) now"),
"See docs now"
);
assert_eq!(clean_for_tts("- one\n- two"), "one\ntwo");
}
// Emoji and bare URLs vanish, and the gaps they leave are squeezed.
#[test]
fn clean_for_tts_strips_emoji_and_urls() {
assert_eq!(clean_for_tts("Hello 😀 world 🎉"), "Hello world");
assert_eq!(
clean_for_tts("visit https://example.com today"),
"visit today"
);
// ZWJ-glued emoji sequence is fully removed.
assert_eq!(clean_for_tts("family 👨‍👩‍👧 photo"), "family photo");
}
#[test]
fn clean_for_tts_collapses_blank_lines_to_single_break() {
// Chatterbox pauses (sometimes ~20s) per blank line, so paragraph
// breaks must collapse to a single newline.
assert_eq!(clean_for_tts("para one\n\npara two"), "para one\npara two");
assert_eq!(clean_for_tts("a\n\n\n\nb"), "a\nb");
// Whitespace-only "blank" lines collapse too.
assert_eq!(clean_for_tts("a\n \t \nb"), "a\nb");
// A single newline is left alone.
assert_eq!(clean_for_tts("a\nb"), "a\nb");
}
#[test]
fn clean_for_tts_preserves_bracket_tags() {
// Non-turbo Chatterbox ignores these; a future Turbo uses them as
// paralinguistic cues — so we must not strip them.
assert_eq!(clean_for_tts("hello [laugh] there"), "hello [laugh] there");
}
}