// TTS endpoints: proxy text-to-speech + voice-library management to the // Chatterbox server that sits behind llama-swap (via LlamaCppClient). Speech // synthesis returns audio as base64-in-JSON so the mobile app can play it as a // `data:` URI without a binary-fetch path. Voice cloning registers a named // voice from either an uploaded clip (device) or an existing library file // (audio read directly; video has its audio track extracted via ffmpeg). use actix_multipart::Multipart; use actix_web::{HttpRequest, HttpResponse, Responder, delete, get, post, web}; use anyhow::Context; use base64::Engine; use bytes::{BufMut, BytesMut}; use futures::StreamExt; use opentelemetry::KeyValue; use opentelemetry::trace::{Span, Status, Tracer}; use regex::Regex; use serde::{Deserialize, Serialize}; use serde_json::{Value, json}; use std::collections::HashMap; use std::path::Path; use std::sync::{LazyLock, Mutex as StdMutex}; use std::time::{Duration, Instant}; use tokio::sync::Semaphore; use uuid::Uuid; use crate::data::Claims; use crate::file_types::{is_audio_file, is_video_file}; use crate::files::is_valid_full_path; use crate::libraries; use crate::otel::{extract_context_from_request, global_tracer}; use crate::state::AppState; /// Hard cap on an uploaded voice-reference clip. Chatterbox itself caps the /// payload (~60s clip); this is a defensive ceiling so a hostile/oversized /// upload can't balloon ImageApi memory before we ever forward it. const MAX_VOICE_UPLOAD_BYTES: usize = 25 * 1024 * 1024; // 25 MB /// Serialize speech synthesis: the Chatterbox server has no internal lock or /// queue, so concurrent requests contend on the single GPU and cascade into /// timeouts. One permit; when busy we fast-fail with 429 rather than queue — /// the app surfaces "busy" immediately, and typical jobs clear in well under a /// minute. (An abandoned upstream job can still occupy the GPU until it /// finishes — that's a wrapper limitation; the chunked-queue plan fixes it.) 
static TTS_PERMIT: LazyLock = LazyLock::new(|| Semaphore::new(1)); // --- Voice-list cache -------------------------------------------------------- /// Cached raw voice-library JSON. llama-swap's `/upstream//voices` /// passthrough spins the TTS model up just to answer a listing — which can /// evict the resident LLM — so we serve a cached copy and only hit upstream on /// a cold cache, an explicit `?refresh=1`, or after a voice create/delete /// invalidates it (the TTS model is already loaded right then anyway). static VOICES_CACHE: LazyLock>> = LazyLock::new(|| StdMutex::new(None)); fn cached_voices() -> Option { VOICES_CACHE.lock().unwrap().clone() } fn store_voices_cache(v: &Value) { *VOICES_CACHE.lock().unwrap() = Some(v.clone()); } fn invalidate_voices_cache() { *VOICES_CACHE.lock().unwrap() = None; } // --- Async speech jobs ------------------------------------------------------- // // Synthesizing a long insight can take minutes — too long to hang one HTTP // request from a phone that may background the app or drop the connection. // Durable variant: POST /tts/speech/jobs returns a job id immediately, the // synth runs in a spawned task (queuing on TTS_PERMIT instead of fast-failing // 429), and the client polls GET /tts/speech/jobs/{id} until it collects the // audio. State is in-memory only (deliberately lighter than the chat // TurnRegistry): a restart loses jobs, the client surfaces that and retries. 
/// Lifecycle state of an async speech job (serialized in `snake_case`).
#[derive(Clone, Copy, PartialEq, Eq, Debug, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum TtsJobStatus {
    Queued,
    Running,
    Done,
    Error,
    Cancelled,
}

impl TtsJobStatus {
    /// True once the job can no longer change state.
    fn is_terminal(self) -> bool {
        matches!(self, Self::Done | Self::Error | Self::Cancelled)
    }
}

/// In-memory record for one async speech job.
struct TtsJob {
    status: TtsJobStatus,
    /// Audio container requested for this job (e.g. `"mp3"`).
    format: String,
    /// Base64 audio, present only once the job is `Done`.
    audio_base64: Option<String>,
    /// Human-readable failure message, present only on `Error`.
    error: Option<String>,
    created_at: Instant,
    /// Set when the job reaches a terminal state; drives result expiry.
    finished_at: Option<Instant>,
    /// Handle used to abort the spawned synthesis task on cancel/expiry.
    abort: Option<tokio::task::AbortHandle>,
}

/// Finished jobs linger so a client that lost connectivity can still collect
/// the result on a later poll; anything older than MAX_AGE is dropped outright
/// (aborted first if somehow still running). Swept lazily on each dispatch.
const TTS_JOB_RESULT_TTL: Duration = Duration::from_secs(10 * 60);
const TTS_JOB_MAX_AGE: Duration = Duration::from_secs(30 * 60);

static TTS_JOBS: LazyLock<StdMutex<HashMap<Uuid, TtsJob>>> =
    LazyLock::new(|| StdMutex::new(HashMap::new()));

/// Drop jobs whose result has outlived `TTS_JOB_RESULT_TTL` or whose total age
/// has reached `TTS_JOB_MAX_AGE`, aborting the task of any over-age job that
/// still holds an abort handle.
fn sweep_stale_jobs(jobs: &mut HashMap<Uuid, TtsJob>, now: Instant) {
    jobs.retain(|_, job| {
        let result_expired = job
            .finished_at
            .is_some_and(|t| now.duration_since(t) >= TTS_JOB_RESULT_TTL);
        let too_old = now.duration_since(job.created_at) >= TTS_JOB_MAX_AGE;
        if too_old && let Some(h) = job.abort.take() {
            h.abort();
        }
        !(result_expired || too_old)
    });
}

/// Run `f` against a job, if it still exists. The registry lock is held for
/// the duration of `f`, so keep the closure short and non-blocking.
fn with_job<R>(id: Uuid, f: impl FnOnce(&mut TtsJob) -> R) -> Option<R> {
    TTS_JOBS.lock().unwrap().get_mut(&id).map(f)
}

/// Move a job to a terminal state (first terminal write wins — a cancel that
/// raced a completion keeps the cancel).
fn finish_job(
    id: Uuid,
    status: TtsJobStatus,
    audio_base64: Option<String>,
    error: Option<String>,
) {
    with_job(id, |job| {
        if job.status.is_terminal() {
            return;
        }
        job.status = status;
        job.audio_base64 = audio_base64;
        job.error = error;
        job.finished_at = Some(Instant::now());
        // The task is finishing on its own; nothing left to abort.
        job.abort = None;
    });
}

/// Sanitize a user-supplied voice name.
/// The name is forwarded to Chatterbox
/// where it becomes a filename in the voice-library directory, so we restrict
/// it to a safe charset (alphanumerics, dash, underscore) — no path
/// separators, dots, or whitespace — and bound its length. Returns `None`
/// when nothing usable remains.
///
/// The result never starts or ends with `-`, including after the 64-char
/// truncation, so sanitizing an already-sanitized name returns it unchanged.
fn sanitize_voice_name(raw: &str) -> Option<String> {
    let mapped: String = raw
        .trim()
        .chars()
        .map(|c| {
            if c.is_ascii_alphanumeric() || c == '-' || c == '_' {
                c
            } else {
                '-'
            }
        })
        .collect();
    // Bound the length first, then trim the tail again: a cut that lands on a
    // dash would otherwise leave a trailing '-' behind.
    let bounded: String = mapped.trim_matches('-').chars().take(64).collect();
    let cleaned = bounded.trim_end_matches('-');
    (!cleaned.is_empty()).then(|| cleaned.to_string())
}

/// Reference-clip cap in seconds for voice cloning. Chatterbox is zero-shot —
/// a clean ~10–20s sample is the sweet spot and more rarely helps. Tune via
/// `LLAMA_SWAP_TTS_REF_SECONDS` (default 30). Unset, unparsable, or zero
/// values fall back to the default.
fn tts_ref_seconds() -> u32 {
    std::env::var("LLAMA_SWAP_TTS_REF_SECONDS")
        .ok()
        .and_then(|s| s.trim().parse::<u32>().ok())
        .filter(|n| *n > 0)
        .unwrap_or(30)
}

/// Tag a (sanitized) voice name with the reference window used to create it:
/// `grandma` → `grandma-30s` (from the start), or `grandma-at1m32s-30s` (30s
/// window starting at 1:32). The tag makes the window visible in the voice
/// list so clones of the same source from different sections can be compared.
/// Skips the append when the name already ends in the same tag; keeps the
/// 64-char bound by truncating the base name, never the tag.
fn append_ref_window(name: &str, start: f64, secs: u32) -> String {
    // Whole seconds only; NaN and negatives collapse to 0 (no `at…` part).
    let start_whole = start.round().max(0.0) as u64;
    let suffix = if start_whole > 0 {
        // ':' isn't in the safe voice-name charset, so 1:32 becomes 1m32s.
        let at = if start_whole >= 60 {
            format!("at{}m{:02}s", start_whole / 60, start_whole % 60)
        } else {
            format!("at{start_whole}s")
        };
        format!("-{at}-{secs}s")
    } else {
        format!("-{secs}s")
    };
    if name.ends_with(&suffix) {
        return name.to_string();
    }
    let max_base = 64usize.saturating_sub(suffix.len());
    let base: String = name.chars().take(max_base).collect();
    // Avoid a doubled dash where the truncated base meets the tag.
    let base = base.trim_end_matches('-');
    format!("{base}{suffix}")
}

/// Resolve a caller-supplied reference window into concrete `(start, duration)`
/// seconds for ffmpeg. Start defaults to 0; duration defaults to the
/// `tts_ref_seconds` cap and is clamped to it (the cap is the most audio the
/// TTS backend benefits from, so longer requests are quietly bounded rather
/// than rejected). Non-finite or negative values are the caller's bug → Err.
fn resolve_ref_window(
    start_seconds: Option<f64>,
    duration_seconds: Option<f64>,
) -> Result<(f64, f64), String> {
    let cap = f64::from(tts_ref_seconds());
    let start = start_seconds.unwrap_or(0.0);
    if !start.is_finite() || start < 0.0 {
        return Err("start_seconds must be a non-negative number".to_string());
    }
    let duration = duration_seconds.unwrap_or(cap);
    if !duration.is_finite() || duration <= 0.0 {
        return Err("duration_seconds must be a positive number".to_string());
    }
    Ok((start, duration.min(cap)))
}

/// Optional default voice for synthesis when the request doesn't name one.
/// Set `LLAMA_SWAP_TTS_VOICE=m` to read insights in a cloned voice by default.
/// Blank or whitespace-only values count as unset.
fn default_voice() -> Option<String> {
    std::env::var("LLAMA_SWAP_TTS_VOICE")
        .ok()
        .map(|s| s.trim().to_string())
        .filter(|s| !s.is_empty())
}

// Markdown / formatting strippers, compiled once. Insight text is markdown,
// which TTS would otherwise read literally ("star star bold star star").
static MD_IMAGE: LazyLock = LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\([^)]*\)").unwrap()); static MD_LINK: LazyLock = LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\([^)]*\)").unwrap()); static MD_HEADING: LazyLock = LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}#{1,6}\s*").unwrap()); static MD_BLOCKQUOTE: LazyLock = LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}>\s?").unwrap()); static MD_LIST: LazyLock = LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}([-*+]|\d+\.)\s+").unwrap()); static MD_EMPHASIS: LazyLock = LazyLock::new(|| Regex::new(r"[*_`~]+").unwrap()); static URL_RE: LazyLock = LazyLock::new(|| Regex::new(r"https?://\S+").unwrap()); static MULTISPACE: LazyLock = LazyLock::new(|| Regex::new(r"[ \t]{2,}").unwrap()); // Any run of 2+ newlines (incl. whitespace-only blank lines) collapses to ONE // newline: Chatterbox inserts a long pause (sometimes ~20s of silence) per // blank line, so paragraph breaks must reach it as a single line break at most. static MULTINEWLINE: LazyLock = LazyLock::new(|| Regex::new(r"\n(?:[ \t]*\n)+").unwrap()); /// True for emoji / pictographic symbols, which most TTS models either skip or /// mispronounce. Covers the main emoji blocks plus dingbats, misc-technical, /// variation selectors, and the ZWJ used to glue emoji sequences. We do NOT /// strip `[bracketed]` tags — non-turbo Chatterbox ignores them, and a future /// Turbo switch uses them as paralinguistic cues. fn is_emoji_like(c: char) -> bool { let u = c as u32; matches!(u, 0x1F000..=0x1FAFF // emoji, pictographs, supplemental symbols, flags | 0x2300..=0x23FF // misc technical (⌚ ⏰ ⏳ …) | 0x2600..=0x27BF // misc symbols + dingbats | 0x2B00..=0x2BFF // misc symbols & arrows (★ ⬆ …) | 0xFE00..=0xFE0F // variation selectors | 0x200D // zero-width joiner ) } /// Normalize insight text for speech: unwrap markdown links/images to their /// visible text, drop heading/list/blockquote/emphasis markers and URLs, strip /// emoji, and collapse whitespace. 
Centralized here so every caller (app, /// WebUI, curl) gets clean audio. fn clean_for_tts(input: &str) -> String { let s = MD_IMAGE.replace_all(input, "$1"); let s = MD_LINK.replace_all(&s, "$1"); let s = MD_HEADING.replace_all(&s, ""); let s = MD_BLOCKQUOTE.replace_all(&s, ""); let s = MD_LIST.replace_all(&s, ""); let s = MD_EMPHASIS.replace_all(&s, ""); let s = URL_RE.replace_all(&s, " "); let s: String = s.chars().filter(|c| !is_emoji_like(*c)).collect(); let s = MULTISPACE.replace_all(&s, " "); let s = MULTINEWLINE.replace_all(&s, "\n"); s.trim().to_string() } /// Full text-preparation pipeline for synthesis: markdown/emoji cleanup, then /// the user's pronunciation overrides (see [`crate::ai::pronunciation`]) on /// the resulting plain text — after cleanup so word boundaries aren't /// obscured by `**WSL**`-style markup. fn prepare_for_tts(input: &str) -> String { crate::ai::pronunciation::apply_pronunciations(&clean_for_tts(input)) } /// Decode an audio/video file to mono 24 kHz WAV via ffmpeg, returning the WAV /// bytes. Chatterbox validates the reference clip by file *extension* and /// rejects several formats (e.g. `.aac`, `.opus`), so we always normalize to /// WAV regardless of the source container. Extracts `duration` seconds starting /// at `start` (see resolve_ref_window) — references only need a few seconds of /// clean speech, which may sit anywhere in a long recording. async fn run_ffmpeg_to_wav(input_path: &str, start: f64, duration: f64) -> anyhow::Result> { let out = tempfile::Builder::new() .suffix(".wav") .tempfile() .context("creating temp wav")?; let out_s = out.path().to_string_lossy().to_string(); let start_s = format!("{start}"); let secs = format!("{duration}"); // -ss before -i is input seeking: fast, and frame accuracy doesn't matter // for picking a speech window. 
let mut args: Vec<&str> = vec!["-y"]; if start > 0.0 { args.extend(["-ss", &start_s]); } args.extend([ "-i", input_path, "-vn", "-ac", "1", "-ar", "24000", "-t", &secs, "-f", "wav", &out_s, ]); let output = tokio::process::Command::new("ffmpeg") .args(&args) .output() .await .context("spawning ffmpeg")?; if !output.status.success() { anyhow::bail!("ffmpeg failed: {}", String::from_utf8_lossy(&output.stderr)); } std::fs::read(&out_s).context("reading transcoded audio") } /// Normalize in-memory upload bytes to WAV: write to a temp file (keeping the /// source extension as an ffmpeg probe hint) then transcode. async fn transcode_bytes_to_wav( input: &[u8], src_ext: Option<&str>, start: f64, duration: f64, ) -> anyhow::Result> { let suffix = src_ext .filter(|e| !e.is_empty()) .map(|e| format!(".{e}")) .unwrap_or_else(|| ".bin".to_string()); let in_tmp = tempfile::Builder::new() .suffix(&suffix) .tempfile() .context("creating temp input")?; std::fs::write(in_tmp.path(), input).context("writing temp input")?; run_ffmpeg_to_wav(&in_tmp.path().to_string_lossy(), start, duration).await } #[derive(Debug, Deserialize)] pub struct TtsSpeechRequest { pub text: String, #[serde(default)] pub voice: Option, /// Audio container, e.g. `"mp3"` (default) or `"wav"`. #[serde(default)] pub format: Option, /// Chatterbox knobs (clamped server-side). exaggeration 0.25–2.0 (emotion), /// cfg_weight 0.0–1.0 (pace; ~0.3 for fast speakers, 0 to neutralize a /// reference accent), temperature 0.05–5.0 (randomness). #[serde(default)] pub exaggeration: Option, #[serde(default)] pub cfg_weight: Option, #[serde(default)] pub temperature: Option, } #[derive(Debug, Serialize)] pub struct TtsSpeechResponse { pub audio_base64: String, pub format: String, } /// POST /tts/speech — synthesize `text` (optionally in a named `voice`) and /// return base64-encoded audio for `data:` URI playback on the client. 
#[post("/tts/speech")]
pub async fn tts_speech_handler(
    http_request: HttpRequest,
    _claims: Claims,
    req: web::Json<TtsSpeechRequest>,
    app_state: web::Data<AppState>,
) -> impl Responder {
    let parent_context = extract_context_from_request(&http_request);
    let mut span = global_tracer().start_with_context("http.tts.speech", &parent_context);

    // Validate against the *cleaned* text: input that is only markup/emoji
    // leaves nothing to speak.
    let text = prepare_for_tts(&req.text);
    if text.is_empty() {
        span.set_status(Status::error("text is required"));
        return HttpResponse::BadRequest().json(json!({ "error": "text is required" }));
    }

    let Some(client) = app_state.llamacpp.as_ref() else {
        span.set_status(Status::error("tts backend not configured"));
        return HttpResponse::ServiceUnavailable()
            .json(json!({ "error": "TTS backend not configured (set LLAMA_SWAP_URL)" }));
    };

    let format = req
        .format
        .as_deref()
        .filter(|s| !s.is_empty())
        .unwrap_or("mp3");
    // Request voice wins; otherwise fall back to the env-configured default.
    let dv = default_voice();
    let voice = req
        .voice
        .as_deref()
        .filter(|s| !s.is_empty())
        .or(dv.as_deref());

    span.set_attribute(KeyValue::new("tts.model", client.tts_model.clone()));
    span.set_attribute(KeyValue::new("tts.format", format.to_string()));
    span.set_attribute(KeyValue::new("tts.has_voice", voice.is_some()));
    span.set_attribute(KeyValue::new("tts.text_len", text.len() as i64));

    // Clamp generation knobs to Chatterbox's documented ranges before forwarding.
    let exaggeration = req.exaggeration.map(|x| x.clamp(0.25, 2.0));
    let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0));
    let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0));

    // One synthesis at a time (see TTS_PERMIT) — fast-fail when busy.
    let Ok(_permit) = TTS_PERMIT.try_acquire() else {
        span.set_status(Status::error("tts busy"));
        return HttpResponse::TooManyRequests().json(json!({
            "error": "TTS is busy with another request — try again shortly"
        }));
    };

    // Wait for the LLM side to release the GPU before sending — the synthesis
    // timeout starts at send, not here (see ai::gpu).
    let _gpu = crate::ai::gpu::tts_lease().await;

    match client
        .text_to_speech(&text, voice, format, exaggeration, cfg_weight, temperature)
        .await
    {
        Ok(bytes) => {
            span.set_attribute(KeyValue::new("tts.audio_bytes", bytes.len() as i64));
            span.set_status(Status::Ok);
            let audio_base64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
            HttpResponse::Ok().json(TtsSpeechResponse {
                audio_base64,
                format: format.to_string(),
            })
        }
        Err(e) => {
            span.set_status(Status::error("tts synthesis failed"));
            log::error!("TTS synth failed: {:?}", e);
            HttpResponse::BadGateway().json(json!({ "error": format!("TTS failed: {e}") }))
        }
    }
}

/// Response body of `POST /tts/speech/jobs`.
#[derive(Debug, Serialize)]
pub struct TtsJobCreatedResponse {
    pub job_id: String,
    pub status: TtsJobStatus,
}

/// Response body of `GET /tts/speech/jobs/{id}`. `audio_base64` and `error`
/// are omitted from the JSON while unset.
#[derive(Debug, Serialize)]
pub struct TtsJobStatusResponse {
    pub job_id: String,
    pub status: TtsJobStatus,
    pub format: String,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_base64: Option<String>,
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
}

/// POST /tts/speech/jobs — durable variant of /tts/speech for long syntheses.
/// Returns 202 + a job id immediately; the synth queues on the single GPU
/// permit (instead of fast-failing 429) and the client polls the job until
/// the audio is ready.
#[post("/tts/speech/jobs")] pub async fn create_speech_job_handler( http_request: HttpRequest, _claims: Claims, req: web::Json, app_state: web::Data, ) -> impl Responder { let parent_context = extract_context_from_request(&http_request); let mut span = global_tracer().start_with_context("http.tts.speech_job.create", &parent_context); let text = prepare_for_tts(&req.text); if text.is_empty() { span.set_status(Status::error("text is required")); return HttpResponse::BadRequest().json(json!({ "error": "text is required" })); } if app_state.llamacpp.is_none() { span.set_status(Status::error("tts backend not configured")); return HttpResponse::ServiceUnavailable() .json(json!({ "error": "TTS backend not configured (set LLAMA_SWAP_URL)" })); } let format = req .format .as_deref() .filter(|s| !s.is_empty()) .unwrap_or("mp3") .to_string(); let voice = req .voice .clone() .filter(|s| !s.is_empty()) .or_else(default_voice); // Clamp generation knobs to Chatterbox's documented ranges before forwarding. let exaggeration = req.exaggeration.map(|x| x.clamp(0.25, 2.0)); let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0)); let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0)); span.set_attribute(KeyValue::new("tts.format", format.clone())); span.set_attribute(KeyValue::new("tts.has_voice", voice.is_some())); span.set_attribute(KeyValue::new("tts.text_len", text.len() as i64)); let job_id = Uuid::new_v4(); { let mut jobs = TTS_JOBS.lock().unwrap(); sweep_stale_jobs(&mut jobs, Instant::now()); jobs.insert( job_id, TtsJob { status: TtsJobStatus::Queued, format: format.clone(), audio_base64: None, error: None, created_at: Instant::now(), finished_at: None, abort: None, }, ); } let state = app_state.clone(); let handle = tokio::spawn(async move { // Queue rather than fast-fail: jobs wait their turn for the GPU. 
let _permit = match TTS_PERMIT.acquire().await { Ok(p) => p, Err(_) => { finish_job( job_id, TtsJobStatus::Error, None, Some("TTS queue closed".to_string()), ); return; } }; // Wait for the LLM side to release the GPU too (see ai::gpu) — only // then does the job count as running. The synthesis timeout starts at // the HTTP send below, so neither wait burns it, and the client can // anchor its own deadline to the queued→running transition. let _gpu = crate::ai::gpu::tts_lease().await; // Cancelled while queued — release the permits without synthesizing. let cancelled = with_job(job_id, |job| { if job.status == TtsJobStatus::Queued { job.status = TtsJobStatus::Running; false } else { true } }) .unwrap_or(true); if cancelled { return; } let Some(client) = state.llamacpp.as_ref() else { finish_job( job_id, TtsJobStatus::Error, None, Some("TTS backend not configured".to_string()), ); return; }; match client .text_to_speech( &text, voice.as_deref(), &format, exaggeration, cfg_weight, temperature, ) .await { Ok(bytes) => { let audio = base64::engine::general_purpose::STANDARD.encode(&bytes); finish_job(job_id, TtsJobStatus::Done, Some(audio), None); } Err(e) => { log::error!("TTS job {job_id} failed: {:?}", e); finish_job( job_id, TtsJobStatus::Error, None, Some(format!("TTS failed: {e}")), ); } } }); // Aborting an already-finished task is a no-op, so this late install is // safe even if the job raced to completion. with_job(job_id, |job| { if !job.status.is_terminal() { job.abort = Some(handle.abort_handle()); } }); span.set_status(Status::Ok); HttpResponse::Accepted().json(TtsJobCreatedResponse { job_id: job_id.to_string(), status: TtsJobStatus::Queued, }) } /// GET /tts/speech/jobs/{id} — poll a speech job; returns the audio once done. /// 404s after the job expires (results are kept ~10 min past completion). 
#[get("/tts/speech/jobs/{id}")]
pub async fn speech_job_status_handler(
    http_request: HttpRequest,
    _claims: Claims,
    path: web::Path<String>,
) -> impl Responder {
    let parent_context = extract_context_from_request(&http_request);
    let mut span =
        global_tracer().start_with_context("http.tts.speech_job.status", &parent_context);

    let Ok(id) = Uuid::parse_str(&path.into_inner()) else {
        span.set_status(Status::error("invalid job id"));
        return HttpResponse::BadRequest().json(json!({ "error": "invalid job id" }));
    };

    // Snapshot the job under the lock, then release it before serializing.
    let resp = {
        let jobs = TTS_JOBS.lock().unwrap();
        jobs.get(&id).map(|job| TtsJobStatusResponse {
            job_id: id.to_string(),
            status: job.status,
            format: job.format.clone(),
            audio_base64: job.audio_base64.clone(),
            error: job.error.clone(),
        })
    };

    match resp {
        Some(r) => {
            span.set_status(Status::Ok);
            HttpResponse::Ok().json(r)
        }
        None => {
            span.set_status(Status::error("job not found"));
            HttpResponse::NotFound()
                .json(json!({ "error": "TTS job not found (it may have expired)" }))
        }
    }
}

/// DELETE /tts/speech/jobs/{id} — cancel a queued/running speech job. Once the
/// upstream GPU job has started it can't be interrupted (same wrapper
/// limitation as the sync path); cancelling stops the wait and discards the
/// result. Cancelling an already-finished job leaves it terminal.
#[delete("/tts/speech/jobs/{id}")] pub async fn cancel_speech_job_handler( http_request: HttpRequest, _claims: Claims, path: web::Path, ) -> impl Responder { let parent_context = extract_context_from_request(&http_request); let mut span = global_tracer().start_with_context("http.tts.speech_job.cancel", &parent_context); let Ok(id) = Uuid::parse_str(&path.into_inner()) else { span.set_status(Status::error("invalid job id")); return HttpResponse::BadRequest().json(json!({ "error": "invalid job id" })); }; let status = with_job(id, |job| { if !job.status.is_terminal() { if let Some(h) = job.abort.take() { h.abort(); } job.status = TtsJobStatus::Cancelled; job.finished_at = Some(Instant::now()); } job.status }); match status { Some(s) => { span.set_status(Status::Ok); HttpResponse::Ok().json(json!({ "job_id": id.to_string(), "status": s })) } None => { span.set_status(Status::error("job not found")); HttpResponse::NotFound() .json(json!({ "error": "TTS job not found (it may have expired)" })) } } } #[derive(Debug, Deserialize)] pub struct ListVoicesQuery { /// `?refresh=1` bypasses the voice-list cache and re-queries upstream /// (which may spin up the TTS model). #[serde(default)] pub refresh: Option, } /// GET /tts/voices — list the Chatterbox voice library. Served from an /// in-memory cache when possible so browsing settings doesn't make llama-swap /// load the TTS model (and evict the resident LLM); see VOICES_CACHE. 
#[get("/tts/voices")] pub async fn list_voices_handler( http_request: HttpRequest, _claims: Claims, query: web::Query, app_state: web::Data, ) -> impl Responder { let parent_context = extract_context_from_request(&http_request); let mut span = global_tracer().start_with_context("http.tts.voices.list", &parent_context); let force = query .refresh .as_deref() .is_some_and(|v| matches!(v, "1" | "true" | "yes")); if !force && let Some(v) = cached_voices() { span.set_attribute(KeyValue::new("tts.voices_cache_hit", true)); span.set_status(Status::Ok); return HttpResponse::Ok().json(v); } let Some(client) = app_state.llamacpp.as_ref() else { span.set_status(Status::error("tts backend not configured")); return HttpResponse::ServiceUnavailable() .json(json!({ "error": "TTS backend not configured" })); }; match client.list_voices().await { Ok(v) => { store_voices_cache(&v); span.set_attribute(KeyValue::new("tts.voices_cache_hit", false)); span.set_status(Status::Ok); HttpResponse::Ok().json(v) } Err(e) => { span.set_status(Status::error("list_voices failed")); log::error!("list_voices failed: {:?}", e); HttpResponse::BadGateway().json(json!({ "error": format!("{e}") })) } } } /// DELETE /tts/voices/{name} — remove a cloned voice from the library. #[delete("/tts/voices/{name}")] pub async fn delete_voice_handler( http_request: HttpRequest, _claims: Claims, path: web::Path, app_state: web::Data, ) -> impl Responder { let parent_context = extract_context_from_request(&http_request); let mut span = global_tracer().start_with_context("http.tts.voices.delete", &parent_context); let Some(client) = app_state.llamacpp.as_ref() else { span.set_status(Status::error("tts backend not configured")); return HttpResponse::ServiceUnavailable() .json(json!({ "error": "TTS backend not configured" })); }; // Same charset rule as creation — a name that sanitizes differently was // never a voice we created, and must not reach the upstream URL. 
let raw = path.into_inner(); let name = match sanitize_voice_name(&raw) { Some(n) if n == raw => n, _ => { span.set_status(Status::error("invalid voice name")); return HttpResponse::BadRequest().json(json!({ "error": "invalid voice name" })); } }; span.set_attribute(KeyValue::new("tts.voice_name", name.clone())); match client.delete_voice(&name).await { Ok(v) => { invalidate_voices_cache(); span.set_status(Status::Ok); HttpResponse::Ok().json(v) } Err(e) => { span.set_status(Status::error("delete_voice failed")); log::error!("delete_voice failed: {:?}", e); HttpResponse::BadGateway().json(json!({ "error": format!("{e}") })) } } } /// POST /tts/voices/upload — register a cloned voice from an uploaded audio /// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`), /// plus optional `start_seconds` / `duration_seconds` (text) selecting which /// window of a longer recording becomes the reference clip. #[post("/tts/voices/upload")] pub async fn create_voice_upload_handler( http_request: HttpRequest, _claims: Claims, mut payload: Multipart, app_state: web::Data, ) -> impl Responder { let parent_context = extract_context_from_request(&http_request); let mut span = global_tracer().start_with_context("http.tts.voices.upload", &parent_context); let Some(client) = app_state.llamacpp.as_ref() else { span.set_status(Status::error("tts backend not configured")); return HttpResponse::ServiceUnavailable() .json(json!({ "error": "TTS backend not configured" })); }; let mut voice_name: Option = None; let mut start_field: Option = None; let mut duration_field: Option = None; let mut file_bytes = BytesMut::new(); let mut filename = "voice.wav".to_string(); while let Some(Ok(mut part)) = payload.next().await { // Capture disposition fields up front so the immutable borrow ends // before we mutably stream the part body (mirrors handlers/image.rs). 
let (fname_opt, name_opt) = { let cd = part.content_disposition(); ( cd.and_then(|c| c.get_filename()).map(|s| s.to_string()), cd.and_then(|c| c.get_name()).map(|s| s.to_string()), ) }; if let Some(fname) = fname_opt { filename = fname; while let Some(Ok(data)) = part.next().await { if file_bytes.len() + data.len() > MAX_VOICE_UPLOAD_BYTES { span.set_status(Status::error("voice clip exceeds limit")); return HttpResponse::PayloadTooLarge() .json(json!({ "error": "voice clip exceeds 25 MB" })); } file_bytes.put(data); } } else if matches!( name_opt.as_deref(), Some("voice_name" | "start_seconds" | "duration_seconds") ) { let field = name_opt.as_deref().unwrap().to_string(); let mut buf = BytesMut::new(); while let Some(Ok(data)) = part.next().await { buf.put(data); } let text = String::from_utf8_lossy(&buf).trim().to_string(); match field.as_str() { "voice_name" => voice_name = Some(text), "start_seconds" => start_field = Some(text), _ => duration_field = Some(text), } } else { while let Some(Ok(_)) = part.next().await {} } } // Empty text parts are treated as absent; anything else must parse, so a // client bug ("abc") fails loudly instead of silently cloning from 0s. 
let parse_secs = |field: Option<&String>, name: &str| -> Result, String> { match field.map(|s| s.as_str()).filter(|s| !s.is_empty()) { None => Ok(None), Some(s) => s .parse::() .map(Some) .map_err(|_| format!("{name} must be a number of seconds")), } }; let window = parse_secs(start_field.as_ref(), "start_seconds").and_then(|start| { parse_secs(duration_field.as_ref(), "duration_seconds") .and_then(|dur| resolve_ref_window(start, dur)) }); let (ref_start, ref_duration) = match window { Ok(w) => w, Err(msg) => { span.set_status(Status::error("invalid reference window")); return HttpResponse::BadRequest().json(json!({ "error": msg })); } }; let Some(name) = voice_name.as_deref().and_then(sanitize_voice_name) else { span.set_status(Status::error("voice_name is required")); return HttpResponse::BadRequest() .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" })); }; // Tag the name with the ref-clip length (e.g. `grandma-30s`) so the // library shows which reference length produced each clone. let name = append_ref_window(&name, ref_start, ref_duration.round().max(1.0) as u32); if file_bytes.is_empty() { span.set_status(Status::error("voice_file is required")); return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" })); } span.set_attribute(KeyValue::new("tts.voice_name", name.clone())); span.set_attribute(KeyValue::new("tts.upload_bytes", file_bytes.len() as i64)); // Normalize to WAV so any device format (e.g. .aac / .opus, which Chatterbox // rejects by extension) is accepted. 
let src_ext = Path::new(&filename).extension().and_then(|e| e.to_str()); let wav = match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext, ref_start, ref_duration).await { Ok(w) => w, Err(e) => { span.set_status(Status::error("audio decode failed")); log::error!("voice upload transcode failed: {:?}", e); return HttpResponse::BadRequest() .json(json!({ "error": "couldn't decode that audio file" })); } }; match client .create_voice(&name, wav, "reference.wav", "audio/wav") .await { Ok(v) => { invalidate_voices_cache(); span.set_status(Status::Ok); HttpResponse::Ok().json(v) } Err(e) => { span.set_status(Status::error("create_voice failed")); log::error!("create_voice (upload) failed: {:?}", e); HttpResponse::BadGateway().json(json!({ "error": format!("{e}") })) } } } #[derive(Debug, Deserialize)] pub struct CreateVoiceFromLibraryRequest { pub voice_name: String, /// Library-relative path to an audio or video file. pub path: String, #[serde(default)] pub library: Option, /// Offset into the source where the reference window begins (default 0) — /// lets the client pick the clean-speech section of a long recording. #[serde(default)] pub start_seconds: Option, /// Reference window length; clamped to LLAMA_SWAP_TTS_REF_SECONDS. #[serde(default)] pub duration_seconds: Option, } /// POST /tts/voices/from-library — register a cloned voice from a file already /// in a library. Audio and video alike are ffmpeg-normalized to a mono 24 kHz /// WAV reference clip (window selected by start/duration_seconds, length /// capped by LLAMA_SWAP_TTS_REF_SECONDS). 
#[post("/tts/voices/from-library")] pub async fn create_voice_from_library_handler( http_request: HttpRequest, _claims: Claims, req: web::Json, app_state: web::Data, ) -> impl Responder { let parent_context = extract_context_from_request(&http_request); let mut span = global_tracer().start_with_context("http.tts.voices.from_library", &parent_context); let Some(client) = app_state.llamacpp.as_ref() else { span.set_status(Status::error("tts backend not configured")); return HttpResponse::ServiceUnavailable() .json(json!({ "error": "TTS backend not configured" })); }; let Some(voice_name) = sanitize_voice_name(&req.voice_name) else { span.set_status(Status::error("voice_name is required")); return HttpResponse::BadRequest() .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" })); }; let (ref_start, ref_duration) = match resolve_ref_window(req.start_seconds, req.duration_seconds) { Ok(w) => w, Err(msg) => { span.set_status(Status::error("invalid reference window")); return HttpResponse::BadRequest().json(json!({ "error": msg })); } }; // Tag the name with the ref-clip length (e.g. `grandma-30s`) so the // library shows which reference length produced each clone. let voice_name = append_ref_window(&voice_name, ref_start, ref_duration.round().max(1.0) as u32); let library = match libraries::resolve_library_param(&app_state, req.library.as_deref()) { Ok(Some(l)) => l, Ok(None) => app_state.primary_library(), Err(msg) => { span.set_status(Status::error("invalid library")); return HttpResponse::BadRequest().json(json!({ "error": msg })); } }; // is_valid_full_path confines the path to the library root (no traversal). 
let abs = match is_valid_full_path(&library.root_path, &req.path, false) { Some(p) if p.exists() => p, _ => { span.set_status(Status::error("file not found")); return HttpResponse::NotFound().json(json!({ "error": "file not found in library" })); } }; // Only real audio/video sources are valid voice references — refuse to // slurp arbitrary library files into memory / ffmpeg. if !is_audio_file(&abs) && !is_video_file(&abs) { span.set_status(Status::error("not an audio/video file")); return HttpResponse::BadRequest() .json(json!({ "error": "file is not an audio or video file" })); } span.set_attribute(KeyValue::new("tts.voice_name", voice_name.clone())); let wav = match prepare_reference_audio(&abs, ref_start, ref_duration).await { Ok(b) => b, Err(e) => { span.set_status(Status::error("audio decode failed")); log::error!("voice reference prep failed for {:?}: {:?}", abs, e); return HttpResponse::BadRequest() .json(json!({ "error": "couldn't decode that file's audio" })); } }; match client .create_voice(&voice_name, wav, "reference.wav", "audio/wav") .await { Ok(v) => { invalidate_voices_cache(); span.set_status(Status::Ok); HttpResponse::Ok().json(v) } Err(e) => { span.set_status(Status::error("create_voice failed")); log::error!("create_voice (from-library) failed: {:?}", e); HttpResponse::BadGateway().json(json!({ "error": format!("{e}") })) } } } /// Read a library file (audio or video) as a Chatterbox-ready reference: ffmpeg /// decodes/extracts its audio to mono 24 kHz WAV. Reading straight from the /// library path avoids slurping a (possibly large) video into memory. 
async fn prepare_reference_audio(abs: &Path, start: f64, duration: f64) -> anyhow::Result> { run_ffmpeg_to_wav(&abs.to_string_lossy(), start, duration).await } #[cfg(test)] mod tests { use super::*; #[test] fn sanitize_voice_name_keeps_safe_chars() { assert_eq!(sanitize_voice_name("m").as_deref(), Some("m")); assert_eq!( sanitize_voice_name(" Cameron ").as_deref(), Some("Cameron") ); assert_eq!( sanitize_voice_name("voice_01-a").as_deref(), Some("voice_01-a") ); } #[test] fn sanitize_voice_name_strips_unsafe_chars() { // Path separators / dots / spaces become '-' and are trimmed at edges. assert_eq!(sanitize_voice_name("a b.c").as_deref(), Some("a-b-c")); assert_eq!( sanitize_voice_name("../etc/passwd").as_deref(), Some("etc-passwd") ); } #[test] fn sanitize_voice_name_rejects_empty_or_all_unsafe() { assert_eq!(sanitize_voice_name(""), None); assert_eq!(sanitize_voice_name(" "), None); assert_eq!(sanitize_voice_name("../../"), None); assert_eq!(sanitize_voice_name("...."), None); } #[test] fn sanitize_voice_name_bounds_length() { let long = "a".repeat(200); assert_eq!(sanitize_voice_name(&long).unwrap().len(), 64); } #[test] fn append_ref_window_tags_name() { assert_eq!(append_ref_window("grandma", 0.0, 30), "grandma-30s"); assert_eq!(append_ref_window("voice_01", 0.0, 15), "voice_01-15s"); } #[test] fn append_ref_window_includes_nonzero_start() { // Sub-minute starts stay in seconds; longer ones read as XmYYs since // ':' isn't allowed in voice names. assert_eq!(append_ref_window("grandma", 45.0, 30), "grandma-at45s-30s"); assert_eq!( append_ref_window("grandma", 92.4, 30), "grandma-at1m32s-30s" ); assert_eq!( append_ref_window("grandma", 600.0, 12), "grandma-at10m00s-12s" ); // A start that rounds to zero is "from the start". 
assert_eq!(append_ref_window("grandma", 0.3, 30), "grandma-30s"); } #[test] fn append_ref_window_is_idempotent_for_same_window() { assert_eq!(append_ref_window("grandma-30s", 0.0, 30), "grandma-30s"); assert_eq!( append_ref_window("grandma-at45s-30s", 45.0, 30), "grandma-at45s-30s" ); // A different window still appends — that's the comparison use-case. assert_eq!(append_ref_window("grandma-15s", 0.0, 30), "grandma-15s-30s"); assert_eq!( append_ref_window("grandma-30s", 45.0, 30), "grandma-30s-at45s-30s" ); } #[test] fn append_ref_window_keeps_64_char_bound() { let long = "a".repeat(64); let tagged = append_ref_window(&long, 0.0, 30); assert_eq!(tagged.len(), 64); assert!(tagged.ends_with("-30s")); let tagged = append_ref_window(&long, 92.0, 30); assert_eq!(tagged.len(), 64); assert!(tagged.ends_with("-at1m32s-30s")); } #[test] fn resolve_ref_window_defaults_to_start_of_clip_at_cap_length() { // Reads the live cap rather than mutating LLAMA_SWAP_TTS_REF_SECONDS: // env mutation flakes under the parallel suite (see env_dispatch). let cap = f64::from(tts_ref_seconds()); assert_eq!(resolve_ref_window(None, None), Ok((0.0, cap))); } #[test] fn resolve_ref_window_accepts_offset_and_clamps_duration() { let cap = f64::from(tts_ref_seconds()); assert_eq!(resolve_ref_window(Some(92.5), None), Ok((92.5, cap))); assert_eq!(resolve_ref_window(Some(10.0), Some(12.0)), Ok((10.0, 12.0))); // Longer-than-cap windows are bounded, not rejected. 
assert_eq!(resolve_ref_window(None, Some(cap + 100.0)), Ok((0.0, cap))); } #[test] fn resolve_ref_window_rejects_garbage() { assert!(resolve_ref_window(Some(-1.0), None).is_err()); assert!(resolve_ref_window(Some(f64::NAN), None).is_err()); assert!(resolve_ref_window(Some(f64::INFINITY), None).is_err()); assert!(resolve_ref_window(None, Some(0.0)).is_err()); assert!(resolve_ref_window(None, Some(-5.0)).is_err()); assert!(resolve_ref_window(None, Some(f64::NAN)).is_err()); } #[test] fn sweep_drops_expired_results_and_keeps_live_jobs() { let now = Instant::now(); let mk = |status: TtsJobStatus, created: Instant, finished: Option| TtsJob { status, format: "mp3".into(), audio_base64: None, error: None, created_at: created, finished_at: finished, abort: None, }; let mut jobs = HashMap::new(); let live = Uuid::new_v4(); let fresh_done = Uuid::new_v4(); let stale_done = Uuid::new_v4(); jobs.insert(live, mk(TtsJobStatus::Running, now, None)); jobs.insert( fresh_done, mk(TtsJobStatus::Done, now, Some(now - Duration::from_secs(60))), ); jobs.insert( stale_done, mk( TtsJobStatus::Done, now - TTS_JOB_MAX_AGE / 2, Some(now - TTS_JOB_RESULT_TTL), ), ); sweep_stale_jobs(&mut jobs, now); assert!(jobs.contains_key(&live)); assert!(jobs.contains_key(&fresh_done)); assert!(!jobs.contains_key(&stale_done)); } #[test] fn sweep_drops_jobs_past_max_age_even_if_unfinished() { let now = Instant::now(); let mut jobs = HashMap::new(); let ancient = Uuid::new_v4(); jobs.insert( ancient, TtsJob { status: TtsJobStatus::Running, format: "mp3".into(), audio_base64: None, error: None, created_at: now - TTS_JOB_MAX_AGE, finished_at: None, abort: None, }, ); sweep_stale_jobs(&mut jobs, now); assert!(jobs.is_empty()); } #[test] fn voices_cache_roundtrip_and_invalidation() { invalidate_voices_cache(); assert!(cached_voices().is_none()); let v = json!({ "voices": [{ "name": "m-30s" }], "count": 1 }); store_voices_cache(&v); assert_eq!(cached_voices(), Some(v)); invalidate_voices_cache(); 
assert!(cached_voices().is_none()); } #[test] fn clean_for_tts_strips_markdown() { assert_eq!( clean_for_tts("**Bold** and _italic_ and `code`"), "Bold and italic and code" ); assert_eq!(clean_for_tts("# Title\n\nbody"), "Title\nbody"); assert_eq!( clean_for_tts("See [docs](http://x.com) now"), "See docs now" ); assert_eq!(clean_for_tts("- one\n- two"), "one\ntwo"); } #[test] fn clean_for_tts_strips_emoji_and_urls() { assert_eq!(clean_for_tts("Hello 😀 world 🎉"), "Hello world"); assert_eq!( clean_for_tts("visit https://example.com today"), "visit today" ); // ZWJ-glued emoji sequence is fully removed. assert_eq!(clean_for_tts("family 👨‍👩‍👧 photo"), "family photo"); } #[test] fn clean_for_tts_collapses_blank_lines_to_single_break() { // Chatterbox pauses (sometimes ~20s) per blank line, so paragraph // breaks must collapse to a single newline. assert_eq!(clean_for_tts("para one\n\npara two"), "para one\npara two"); assert_eq!(clean_for_tts("a\n\n\n\nb"), "a\nb"); // Whitespace-only "blank" lines collapse too. assert_eq!(clean_for_tts("a\n \t \nb"), "a\nb"); // A single newline is left alone. assert_eq!(clean_for_tts("a\nb"), "a\nb"); } #[test] fn clean_for_tts_preserves_bracket_tags() { // Non-turbo Chatterbox ignores these; a future Turbo uses them as // paralinguistic cues — so we must not strip them. assert_eq!(clean_for_tts("hello [laugh] there"), "hello [laugh] there"); } }