1017fe73af
Clones that don't start at 0:00 are tagged with where the reference window begins (grandma-at1m32s-30s), so voices cloned from different sections of the same source are distinguishable in the voice list. Zero-start names keep the existing -30s form. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
1279 lines
49 KiB
Rust
1279 lines
49 KiB
Rust
// TTS endpoints: proxy text-to-speech + voice-library management to the
|
||
// Chatterbox server that sits behind llama-swap (via LlamaCppClient). Speech
|
||
// synthesis returns audio as base64-in-JSON so the mobile app can play it as a
|
||
// `data:` URI without a binary-fetch path. Voice cloning registers a named
|
||
// voice from either an uploaded clip (device) or an existing library file
|
||
// (audio read directly; video has its audio track extracted via ffmpeg).
|
||
|
||
use actix_multipart::Multipart;
|
||
use actix_web::{HttpRequest, HttpResponse, Responder, delete, get, post, web};
|
||
use anyhow::Context;
|
||
use base64::Engine;
|
||
use bytes::{BufMut, BytesMut};
|
||
use futures::StreamExt;
|
||
use opentelemetry::KeyValue;
|
||
use opentelemetry::trace::{Span, Status, Tracer};
|
||
use regex::Regex;
|
||
use serde::{Deserialize, Serialize};
|
||
use serde_json::{Value, json};
|
||
use std::collections::HashMap;
|
||
use std::path::Path;
|
||
use std::sync::{LazyLock, Mutex as StdMutex};
|
||
use std::time::{Duration, Instant};
|
||
use tokio::sync::Semaphore;
|
||
use uuid::Uuid;
|
||
|
||
use crate::data::Claims;
|
||
use crate::file_types::{is_audio_file, is_video_file};
|
||
use crate::files::is_valid_full_path;
|
||
use crate::libraries;
|
||
use crate::otel::{extract_context_from_request, global_tracer};
|
||
use crate::state::AppState;
|
||
|
||
/// Hard cap on an uploaded voice-reference clip. Chatterbox itself caps the
/// payload (~60s clip); this is a defensive ceiling so a hostile/oversized
/// upload can't balloon ImageApi memory before we ever forward it.
const MAX_VOICE_UPLOAD_BYTES: usize = 25 * 1024 * 1024; // 25 MB
|
||
|
||
/// Serialize speech synthesis: the Chatterbox server has no internal lock or
|
||
/// queue, so concurrent requests contend on the single GPU and cascade into
|
||
/// timeouts. One permit; when busy we fast-fail with 429 rather than queue —
|
||
/// the app surfaces "busy" immediately, and typical jobs clear in well under a
|
||
/// minute. (An abandoned upstream job can still occupy the GPU until it
|
||
/// finishes — that's a wrapper limitation; the chunked-queue plan fixes it.)
|
||
static TTS_PERMIT: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(1));
|
||
|
||
// --- Voice-list cache --------------------------------------------------------
|
||
|
||
/// Cached raw voice-library JSON. llama-swap's `/upstream/<model>/voices`
|
||
/// passthrough spins the TTS model up just to answer a listing — which can
|
||
/// evict the resident LLM — so we serve a cached copy and only hit upstream on
|
||
/// a cold cache, an explicit `?refresh=1`, or after a voice create/delete
|
||
/// invalidates it (the TTS model is already loaded right then anyway).
|
||
static VOICES_CACHE: LazyLock<StdMutex<Option<Value>>> = LazyLock::new(|| StdMutex::new(None));
|
||
|
||
fn cached_voices() -> Option<Value> {
|
||
VOICES_CACHE.lock().unwrap().clone()
|
||
}
|
||
|
||
fn store_voices_cache(v: &Value) {
|
||
*VOICES_CACHE.lock().unwrap() = Some(v.clone());
|
||
}
|
||
|
||
fn invalidate_voices_cache() {
|
||
*VOICES_CACHE.lock().unwrap() = None;
|
||
}
|
||
|
||
// --- Async speech jobs -------------------------------------------------------
|
||
//
|
||
// Synthesizing a long insight can take minutes — too long to hang one HTTP
|
||
// request from a phone that may background the app or drop the connection.
|
||
// Durable variant: POST /tts/speech/jobs returns a job id immediately, the
|
||
// synth runs in a spawned task (queuing on TTS_PERMIT instead of fast-failing
|
||
// 429), and the client polls GET /tts/speech/jobs/{id} until it collects the
|
||
// audio. State is in-memory only (deliberately lighter than the chat
|
||
// TurnRegistry): a restart loses jobs, the client surfaces that and retries.
|
||
|
||
#[derive(Clone, Copy, PartialEq, Eq, Debug, Serialize)]
|
||
#[serde(rename_all = "snake_case")]
|
||
pub enum TtsJobStatus {
|
||
Queued,
|
||
Running,
|
||
Done,
|
||
Error,
|
||
Cancelled,
|
||
}
|
||
|
||
impl TtsJobStatus {
|
||
fn is_terminal(self) -> bool {
|
||
matches!(self, Self::Done | Self::Error | Self::Cancelled)
|
||
}
|
||
}
|
||
|
||
struct TtsJob {
|
||
status: TtsJobStatus,
|
||
format: String,
|
||
audio_base64: Option<String>,
|
||
error: Option<String>,
|
||
created_at: Instant,
|
||
finished_at: Option<Instant>,
|
||
abort: Option<tokio::task::AbortHandle>,
|
||
}
|
||
|
||
/// Finished jobs linger so a client that lost connectivity can still collect
/// the result on a later poll; anything older than MAX_AGE is dropped outright
/// (aborted first if somehow still running). Swept lazily on each dispatch.
const TTS_JOB_RESULT_TTL: Duration = Duration::from_secs(10 * 60);
const TTS_JOB_MAX_AGE: Duration = Duration::from_secs(30 * 60);
|
||
|
||
static TTS_JOBS: LazyLock<StdMutex<HashMap<Uuid, TtsJob>>> =
|
||
LazyLock::new(|| StdMutex::new(HashMap::new()));
|
||
|
||
fn sweep_stale_jobs(jobs: &mut HashMap<Uuid, TtsJob>, now: Instant) {
|
||
jobs.retain(|_, job| {
|
||
let result_expired = job
|
||
.finished_at
|
||
.is_some_and(|t| now.duration_since(t) >= TTS_JOB_RESULT_TTL);
|
||
let too_old = now.duration_since(job.created_at) >= TTS_JOB_MAX_AGE;
|
||
if too_old && let Some(h) = job.abort.take() {
|
||
h.abort();
|
||
}
|
||
!(result_expired || too_old)
|
||
});
|
||
}
|
||
|
||
/// Run `f` against a job, if it still exists.
|
||
fn with_job<R>(id: Uuid, f: impl FnOnce(&mut TtsJob) -> R) -> Option<R> {
|
||
TTS_JOBS.lock().unwrap().get_mut(&id).map(f)
|
||
}
|
||
|
||
/// Move a job to a terminal state (first terminal write wins — a cancel that
|
||
/// raced a completion keeps the cancel).
|
||
fn finish_job(id: Uuid, status: TtsJobStatus, audio_base64: Option<String>, error: Option<String>) {
|
||
with_job(id, |job| {
|
||
if job.status.is_terminal() {
|
||
return;
|
||
}
|
||
job.status = status;
|
||
job.audio_base64 = audio_base64;
|
||
job.error = error;
|
||
job.finished_at = Some(Instant::now());
|
||
job.abort = None;
|
||
});
|
||
}
|
||
|
||
/// Sanitize a user-supplied voice name. The name is forwarded to Chatterbox
/// where it becomes a filename in the voice-library directory, so we restrict
/// it to a safe charset (alphanumerics, dash, underscore) — no path
/// separators, dots, or whitespace — and bound its length to 64 characters.
/// Every disallowed character becomes `-`; leading/trailing dashes are
/// dropped. Returns `None` when nothing usable remains.
///
/// The result is a fixed point: sanitizing an already-sanitized name returns
/// it unchanged, which callers that compare `sanitize(raw) == raw` rely on.
fn sanitize_voice_name(raw: &str) -> Option<String> {
    let cleaned: String = raw
        .trim()
        .chars()
        .map(|c| {
            if c.is_ascii_alphanumeric() || c == '-' || c == '_' {
                c
            } else {
                '-'
            }
        })
        .collect();
    // Truncate BEFORE trimming the tail: cutting at 64 chars can land on a
    // dash, and a name that ends in '-' would sanitize differently next time.
    let bounded: String = cleaned
        .trim_start_matches('-')
        .chars()
        .take(64)
        .collect();
    let bounded = bounded.trim_end_matches('-');
    if bounded.is_empty() {
        return None;
    }
    Some(bounded.to_string())
}
|
||
|
||
/// Reference-clip cap in seconds for voice cloning. Chatterbox is zero-shot —
/// a clean ~10–20s sample is the sweet spot and more rarely helps. Tune via
/// `LLAMA_SWAP_TTS_REF_SECONDS` (default 30). Unset, unparsable, or zero
/// values fall back to the default.
fn tts_ref_seconds() -> u32 {
    std::env::var("LLAMA_SWAP_TTS_REF_SECONDS")
        .ok()
        .and_then(|s| s.trim().parse::<u32>().ok())
        .filter(|n| *n > 0)
        .unwrap_or(30)
}
|
||
|
||
/// Tag a (sanitized) voice name with the reference window used to create it:
/// `grandma` → `grandma-30s` (from the start), or `grandma-at1m32s-30s` (30s
/// window starting at 1:32). The tag makes the window visible in the voice
/// list so clones of the same source from different sections can be compared.
/// Skips the append when the name already ends in the same tag; keeps the
/// 64-char bound by truncating the base name, never the tag.
///
/// `start` is rounded to whole seconds; values that round to 0 (or are
/// negative/NaN) produce the plain `-{secs}s` form.
fn append_ref_window(name: &str, start: f64, secs: u32) -> String {
    // Same bound the voice-name sanitizer enforces.
    const MAX_NAME_LEN: usize = 64;

    let start_whole = start.round().max(0.0) as u64;
    let suffix = if start_whole > 0 {
        // ':' isn't in the safe voice-name charset, so 1:32 becomes 1m32s.
        let at = if start_whole >= 60 {
            format!("at{}m{:02}s", start_whole / 60, start_whole % 60)
        } else {
            format!("at{start_whole}s")
        };
        format!("-{at}-{secs}s")
    } else {
        format!("-{secs}s")
    };
    if name.ends_with(&suffix) {
        return name.to_string();
    }
    let max_base = MAX_NAME_LEN.saturating_sub(suffix.len());
    let base: String = name.chars().take(max_base).collect();
    // Avoid a doubled dash where the truncated base meets the tag.
    let base = base.trim_end_matches('-');
    format!("{base}{suffix}")
}
|
||
|
||
/// Resolve a caller-supplied reference window into concrete `(start, duration)`
|
||
/// seconds for ffmpeg. Start defaults to 0; duration defaults to the
|
||
/// `tts_ref_seconds` cap and is clamped to it (the cap is the most audio the
|
||
/// TTS backend benefits from, so longer requests are quietly bounded rather
|
||
/// than rejected). Non-finite or negative values are the caller's bug → Err.
|
||
fn resolve_ref_window(
|
||
start_seconds: Option<f64>,
|
||
duration_seconds: Option<f64>,
|
||
) -> Result<(f64, f64), String> {
|
||
let cap = f64::from(tts_ref_seconds());
|
||
let start = start_seconds.unwrap_or(0.0);
|
||
if !start.is_finite() || start < 0.0 {
|
||
return Err("start_seconds must be a non-negative number".to_string());
|
||
}
|
||
let duration = duration_seconds.unwrap_or(cap);
|
||
if !duration.is_finite() || duration <= 0.0 {
|
||
return Err("duration_seconds must be a positive number".to_string());
|
||
}
|
||
Ok((start, duration.min(cap)))
|
||
}
|
||
|
||
/// Optional default voice for synthesis when the request doesn't name one.
/// Set `LLAMA_SWAP_TTS_VOICE=m` to read insights in a cloned voice by default.
/// The value is trimmed; an unset or blank variable yields `None`.
fn default_voice() -> Option<String> {
    std::env::var("LLAMA_SWAP_TTS_VOICE")
        .ok()
        .map(|s| s.trim().to_string())
        .filter(|s| !s.is_empty())
}
|
||
|
||
// Markdown / formatting strippers, compiled once. Insight text is markdown,
|
||
// which TTS would otherwise read literally ("star star bold star star").
|
||
static MD_IMAGE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\([^)]*\)").unwrap());
|
||
static MD_LINK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\([^)]*\)").unwrap());
|
||
static MD_HEADING: LazyLock<Regex> =
|
||
LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}#{1,6}\s*").unwrap());
|
||
static MD_BLOCKQUOTE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}>\s?").unwrap());
|
||
static MD_LIST: LazyLock<Regex> =
|
||
LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}([-*+]|\d+\.)\s+").unwrap());
|
||
static MD_EMPHASIS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[*_`~]+").unwrap());
|
||
static URL_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"https?://\S+").unwrap());
|
||
static MULTISPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[ \t]{2,}").unwrap());
|
||
// Any run of 2+ newlines (incl. whitespace-only blank lines) collapses to ONE
|
||
// newline: Chatterbox inserts a long pause (sometimes ~20s of silence) per
|
||
// blank line, so paragraph breaks must reach it as a single line break at most.
|
||
static MULTINEWLINE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n(?:[ \t]*\n)+").unwrap());
|
||
|
||
/// True for emoji / pictographic symbols, which most TTS models either skip or
/// mispronounce. Covers the main emoji blocks plus dingbats, misc-technical,
/// variation selectors, and the ZWJ used to glue emoji sequences. We do NOT
/// strip `[bracketed]` tags — non-turbo Chatterbox ignores them, and a future
/// Turbo switch uses them as paralinguistic cues.
fn is_emoji_like(c: char) -> bool {
    let u = c as u32;
    matches!(u,
        0x1F000..=0x1FAFF // emoji, pictographs, supplemental symbols, flags
        | 0x2300..=0x23FF // misc technical (⌚ ⏰ ⏳ …)
        | 0x2600..=0x27BF // misc symbols + dingbats (★ ✂ …)
        | 0x2B00..=0x2BFF // misc symbols & arrows (⭐ ⬆ …)
        | 0xFE00..=0xFE0F // variation selectors
        | 0x200D          // zero-width joiner
    )
}
|
||
|
||
/// Normalize insight text for speech: unwrap markdown links/images to their
|
||
/// visible text, drop heading/list/blockquote/emphasis markers and URLs, strip
|
||
/// emoji, and collapse whitespace. Centralized here so every caller (app,
|
||
/// WebUI, curl) gets clean audio.
|
||
fn clean_for_tts(input: &str) -> String {
|
||
let s = MD_IMAGE.replace_all(input, "$1");
|
||
let s = MD_LINK.replace_all(&s, "$1");
|
||
let s = MD_HEADING.replace_all(&s, "");
|
||
let s = MD_BLOCKQUOTE.replace_all(&s, "");
|
||
let s = MD_LIST.replace_all(&s, "");
|
||
let s = MD_EMPHASIS.replace_all(&s, "");
|
||
let s = URL_RE.replace_all(&s, " ");
|
||
let s: String = s.chars().filter(|c| !is_emoji_like(*c)).collect();
|
||
let s = MULTISPACE.replace_all(&s, " ");
|
||
let s = MULTINEWLINE.replace_all(&s, "\n");
|
||
s.trim().to_string()
|
||
}
|
||
|
||
/// Full text-preparation pipeline for synthesis: markdown/emoji cleanup, then
|
||
/// the user's pronunciation overrides (see [`crate::ai::pronunciation`]) on
|
||
/// the resulting plain text — after cleanup so word boundaries aren't
|
||
/// obscured by `**WSL**`-style markup.
|
||
fn prepare_for_tts(input: &str) -> String {
|
||
crate::ai::pronunciation::apply_pronunciations(&clean_for_tts(input))
|
||
}
|
||
|
||
/// Decode an audio/video file to mono 24 kHz WAV via ffmpeg, returning the WAV
|
||
/// bytes. Chatterbox validates the reference clip by file *extension* and
|
||
/// rejects several formats (e.g. `.aac`, `.opus`), so we always normalize to
|
||
/// WAV regardless of the source container. Extracts `duration` seconds starting
|
||
/// at `start` (see resolve_ref_window) — references only need a few seconds of
|
||
/// clean speech, which may sit anywhere in a long recording.
|
||
async fn run_ffmpeg_to_wav(input_path: &str, start: f64, duration: f64) -> anyhow::Result<Vec<u8>> {
|
||
let out = tempfile::Builder::new()
|
||
.suffix(".wav")
|
||
.tempfile()
|
||
.context("creating temp wav")?;
|
||
let out_s = out.path().to_string_lossy().to_string();
|
||
|
||
let start_s = format!("{start}");
|
||
let secs = format!("{duration}");
|
||
|
||
// -ss before -i is input seeking: fast, and frame accuracy doesn't matter
|
||
// for picking a speech window.
|
||
let mut args: Vec<&str> = vec!["-y"];
|
||
if start > 0.0 {
|
||
args.extend(["-ss", &start_s]);
|
||
}
|
||
args.extend([
|
||
"-i", input_path, "-vn", "-ac", "1", "-ar", "24000", "-t", &secs, "-f", "wav", &out_s,
|
||
]);
|
||
|
||
let output = tokio::process::Command::new("ffmpeg")
|
||
.args(&args)
|
||
.output()
|
||
.await
|
||
.context("spawning ffmpeg")?;
|
||
|
||
if !output.status.success() {
|
||
anyhow::bail!("ffmpeg failed: {}", String::from_utf8_lossy(&output.stderr));
|
||
}
|
||
std::fs::read(&out_s).context("reading transcoded audio")
|
||
}
|
||
|
||
/// Normalize in-memory upload bytes to WAV: write to a temp file (keeping the
|
||
/// source extension as an ffmpeg probe hint) then transcode.
|
||
async fn transcode_bytes_to_wav(
|
||
input: &[u8],
|
||
src_ext: Option<&str>,
|
||
start: f64,
|
||
duration: f64,
|
||
) -> anyhow::Result<Vec<u8>> {
|
||
let suffix = src_ext
|
||
.filter(|e| !e.is_empty())
|
||
.map(|e| format!(".{e}"))
|
||
.unwrap_or_else(|| ".bin".to_string());
|
||
let in_tmp = tempfile::Builder::new()
|
||
.suffix(&suffix)
|
||
.tempfile()
|
||
.context("creating temp input")?;
|
||
std::fs::write(in_tmp.path(), input).context("writing temp input")?;
|
||
run_ffmpeg_to_wav(&in_tmp.path().to_string_lossy(), start, duration).await
|
||
}
|
||
|
||
#[derive(Debug, Deserialize)]
|
||
pub struct TtsSpeechRequest {
|
||
pub text: String,
|
||
#[serde(default)]
|
||
pub voice: Option<String>,
|
||
/// Audio container, e.g. `"mp3"` (default) or `"wav"`.
|
||
#[serde(default)]
|
||
pub format: Option<String>,
|
||
/// Chatterbox knobs (clamped server-side). exaggeration 0.25–2.0 (emotion),
|
||
/// cfg_weight 0.0–1.0 (pace; ~0.3 for fast speakers, 0 to neutralize a
|
||
/// reference accent), temperature 0.05–5.0 (randomness).
|
||
#[serde(default)]
|
||
pub exaggeration: Option<f32>,
|
||
#[serde(default)]
|
||
pub cfg_weight: Option<f32>,
|
||
#[serde(default)]
|
||
pub temperature: Option<f32>,
|
||
}
|
||
|
||
#[derive(Debug, Serialize)]
|
||
pub struct TtsSpeechResponse {
|
||
pub audio_base64: String,
|
||
pub format: String,
|
||
}
|
||
|
||
/// POST /tts/speech — synthesize `text` (optionally in a named `voice`) and
|
||
/// return base64-encoded audio for `data:` URI playback on the client.
|
||
#[post("/tts/speech")]
|
||
pub async fn tts_speech_handler(
|
||
http_request: HttpRequest,
|
||
_claims: Claims,
|
||
req: web::Json<TtsSpeechRequest>,
|
||
app_state: web::Data<AppState>,
|
||
) -> impl Responder {
|
||
let parent_context = extract_context_from_request(&http_request);
|
||
let mut span = global_tracer().start_with_context("http.tts.speech", &parent_context);
|
||
|
||
let text = prepare_for_tts(&req.text);
|
||
if text.is_empty() {
|
||
span.set_status(Status::error("text is required"));
|
||
return HttpResponse::BadRequest().json(json!({ "error": "text is required" }));
|
||
}
|
||
let Some(client) = app_state.llamacpp.as_ref() else {
|
||
span.set_status(Status::error("tts backend not configured"));
|
||
return HttpResponse::ServiceUnavailable()
|
||
.json(json!({ "error": "TTS backend not configured (set LLAMA_SWAP_URL)" }));
|
||
};
|
||
|
||
let format = req
|
||
.format
|
||
.as_deref()
|
||
.filter(|s| !s.is_empty())
|
||
.unwrap_or("mp3");
|
||
let dv = default_voice();
|
||
let voice = req
|
||
.voice
|
||
.as_deref()
|
||
.filter(|s| !s.is_empty())
|
||
.or(dv.as_deref());
|
||
|
||
span.set_attribute(KeyValue::new("tts.model", client.tts_model.clone()));
|
||
span.set_attribute(KeyValue::new("tts.format", format.to_string()));
|
||
span.set_attribute(KeyValue::new("tts.has_voice", voice.is_some()));
|
||
span.set_attribute(KeyValue::new("tts.text_len", text.len() as i64));
|
||
|
||
// Clamp generation knobs to Chatterbox's documented ranges before forwarding.
|
||
let exaggeration = req.exaggeration.map(|x| x.clamp(0.25, 2.0));
|
||
let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0));
|
||
let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0));
|
||
|
||
// One synthesis at a time (see TTS_PERMIT) — fast-fail when busy.
|
||
let Ok(_permit) = TTS_PERMIT.try_acquire() else {
|
||
span.set_status(Status::error("tts busy"));
|
||
return HttpResponse::TooManyRequests().json(json!({
|
||
"error": "TTS is busy with another request — try again shortly"
|
||
}));
|
||
};
|
||
|
||
// Wait for the LLM side to release the GPU before sending — the synthesis
|
||
// timeout starts at send, not here (see ai::gpu).
|
||
let _gpu = crate::ai::gpu::tts_lease().await;
|
||
|
||
match client
|
||
.text_to_speech(&text, voice, format, exaggeration, cfg_weight, temperature)
|
||
.await
|
||
{
|
||
Ok(bytes) => {
|
||
span.set_attribute(KeyValue::new("tts.audio_bytes", bytes.len() as i64));
|
||
span.set_status(Status::Ok);
|
||
let audio_base64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
|
||
HttpResponse::Ok().json(TtsSpeechResponse {
|
||
audio_base64,
|
||
format: format.to_string(),
|
||
})
|
||
}
|
||
Err(e) => {
|
||
span.set_status(Status::error("tts synthesis failed"));
|
||
log::error!("TTS synth failed: {:?}", e);
|
||
HttpResponse::BadGateway().json(json!({ "error": format!("TTS failed: {e}") }))
|
||
}
|
||
}
|
||
}
|
||
|
||
#[derive(Debug, Serialize)]
|
||
pub struct TtsJobCreatedResponse {
|
||
pub job_id: String,
|
||
pub status: TtsJobStatus,
|
||
}
|
||
|
||
#[derive(Debug, Serialize)]
|
||
pub struct TtsJobStatusResponse {
|
||
pub job_id: String,
|
||
pub status: TtsJobStatus,
|
||
pub format: String,
|
||
#[serde(skip_serializing_if = "Option::is_none")]
|
||
pub audio_base64: Option<String>,
|
||
#[serde(skip_serializing_if = "Option::is_none")]
|
||
pub error: Option<String>,
|
||
}
|
||
|
||
/// POST /tts/speech/jobs — durable variant of /tts/speech for long syntheses.
|
||
/// Returns 202 + a job id immediately; the synth queues on the single GPU
|
||
/// permit (instead of fast-failing 429) and the client polls the job until
|
||
/// the audio is ready.
|
||
#[post("/tts/speech/jobs")]
|
||
pub async fn create_speech_job_handler(
|
||
http_request: HttpRequest,
|
||
_claims: Claims,
|
||
req: web::Json<TtsSpeechRequest>,
|
||
app_state: web::Data<AppState>,
|
||
) -> impl Responder {
|
||
let parent_context = extract_context_from_request(&http_request);
|
||
let mut span =
|
||
global_tracer().start_with_context("http.tts.speech_job.create", &parent_context);
|
||
|
||
let text = prepare_for_tts(&req.text);
|
||
if text.is_empty() {
|
||
span.set_status(Status::error("text is required"));
|
||
return HttpResponse::BadRequest().json(json!({ "error": "text is required" }));
|
||
}
|
||
if app_state.llamacpp.is_none() {
|
||
span.set_status(Status::error("tts backend not configured"));
|
||
return HttpResponse::ServiceUnavailable()
|
||
.json(json!({ "error": "TTS backend not configured (set LLAMA_SWAP_URL)" }));
|
||
}
|
||
|
||
let format = req
|
||
.format
|
||
.as_deref()
|
||
.filter(|s| !s.is_empty())
|
||
.unwrap_or("mp3")
|
||
.to_string();
|
||
let voice = req
|
||
.voice
|
||
.clone()
|
||
.filter(|s| !s.is_empty())
|
||
.or_else(default_voice);
|
||
// Clamp generation knobs to Chatterbox's documented ranges before forwarding.
|
||
let exaggeration = req.exaggeration.map(|x| x.clamp(0.25, 2.0));
|
||
let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0));
|
||
let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0));
|
||
|
||
span.set_attribute(KeyValue::new("tts.format", format.clone()));
|
||
span.set_attribute(KeyValue::new("tts.has_voice", voice.is_some()));
|
||
span.set_attribute(KeyValue::new("tts.text_len", text.len() as i64));
|
||
|
||
let job_id = Uuid::new_v4();
|
||
{
|
||
let mut jobs = TTS_JOBS.lock().unwrap();
|
||
sweep_stale_jobs(&mut jobs, Instant::now());
|
||
jobs.insert(
|
||
job_id,
|
||
TtsJob {
|
||
status: TtsJobStatus::Queued,
|
||
format: format.clone(),
|
||
audio_base64: None,
|
||
error: None,
|
||
created_at: Instant::now(),
|
||
finished_at: None,
|
||
abort: None,
|
||
},
|
||
);
|
||
}
|
||
|
||
let state = app_state.clone();
|
||
let handle = tokio::spawn(async move {
|
||
// Queue rather than fast-fail: jobs wait their turn for the GPU.
|
||
let _permit = match TTS_PERMIT.acquire().await {
|
||
Ok(p) => p,
|
||
Err(_) => {
|
||
finish_job(
|
||
job_id,
|
||
TtsJobStatus::Error,
|
||
None,
|
||
Some("TTS queue closed".to_string()),
|
||
);
|
||
return;
|
||
}
|
||
};
|
||
// Wait for the LLM side to release the GPU too (see ai::gpu) — only
|
||
// then does the job count as running. The synthesis timeout starts at
|
||
// the HTTP send below, so neither wait burns it, and the client can
|
||
// anchor its own deadline to the queued→running transition.
|
||
let _gpu = crate::ai::gpu::tts_lease().await;
|
||
|
||
// Cancelled while queued — release the permits without synthesizing.
|
||
let cancelled = with_job(job_id, |job| {
|
||
if job.status == TtsJobStatus::Queued {
|
||
job.status = TtsJobStatus::Running;
|
||
false
|
||
} else {
|
||
true
|
||
}
|
||
})
|
||
.unwrap_or(true);
|
||
if cancelled {
|
||
return;
|
||
}
|
||
|
||
let Some(client) = state.llamacpp.as_ref() else {
|
||
finish_job(
|
||
job_id,
|
||
TtsJobStatus::Error,
|
||
None,
|
||
Some("TTS backend not configured".to_string()),
|
||
);
|
||
return;
|
||
};
|
||
match client
|
||
.text_to_speech(
|
||
&text,
|
||
voice.as_deref(),
|
||
&format,
|
||
exaggeration,
|
||
cfg_weight,
|
||
temperature,
|
||
)
|
||
.await
|
||
{
|
||
Ok(bytes) => {
|
||
let audio = base64::engine::general_purpose::STANDARD.encode(&bytes);
|
||
finish_job(job_id, TtsJobStatus::Done, Some(audio), None);
|
||
}
|
||
Err(e) => {
|
||
log::error!("TTS job {job_id} failed: {:?}", e);
|
||
finish_job(
|
||
job_id,
|
||
TtsJobStatus::Error,
|
||
None,
|
||
Some(format!("TTS failed: {e}")),
|
||
);
|
||
}
|
||
}
|
||
});
|
||
// Aborting an already-finished task is a no-op, so this late install is
|
||
// safe even if the job raced to completion.
|
||
with_job(job_id, |job| {
|
||
if !job.status.is_terminal() {
|
||
job.abort = Some(handle.abort_handle());
|
||
}
|
||
});
|
||
|
||
span.set_status(Status::Ok);
|
||
HttpResponse::Accepted().json(TtsJobCreatedResponse {
|
||
job_id: job_id.to_string(),
|
||
status: TtsJobStatus::Queued,
|
||
})
|
||
}
|
||
|
||
/// GET /tts/speech/jobs/{id} — poll a speech job; returns the audio once done.
|
||
/// 404s after the job expires (results are kept ~10 min past completion).
|
||
#[get("/tts/speech/jobs/{id}")]
|
||
pub async fn speech_job_status_handler(
|
||
http_request: HttpRequest,
|
||
_claims: Claims,
|
||
path: web::Path<String>,
|
||
) -> impl Responder {
|
||
let parent_context = extract_context_from_request(&http_request);
|
||
let mut span =
|
||
global_tracer().start_with_context("http.tts.speech_job.status", &parent_context);
|
||
|
||
let Ok(id) = Uuid::parse_str(&path.into_inner()) else {
|
||
span.set_status(Status::error("invalid job id"));
|
||
return HttpResponse::BadRequest().json(json!({ "error": "invalid job id" }));
|
||
};
|
||
let resp = {
|
||
let jobs = TTS_JOBS.lock().unwrap();
|
||
jobs.get(&id).map(|job| TtsJobStatusResponse {
|
||
job_id: id.to_string(),
|
||
status: job.status,
|
||
format: job.format.clone(),
|
||
audio_base64: job.audio_base64.clone(),
|
||
error: job.error.clone(),
|
||
})
|
||
};
|
||
match resp {
|
||
Some(r) => {
|
||
span.set_status(Status::Ok);
|
||
HttpResponse::Ok().json(r)
|
||
}
|
||
None => {
|
||
span.set_status(Status::error("job not found"));
|
||
HttpResponse::NotFound()
|
||
.json(json!({ "error": "TTS job not found (it may have expired)" }))
|
||
}
|
||
}
|
||
}
|
||
|
||
/// DELETE /tts/speech/jobs/{id} — cancel a queued/running speech job. Once the
|
||
/// upstream GPU job has started it can't be interrupted (same wrapper
|
||
/// limitation as the sync path); cancelling stops the wait and discards the
|
||
/// result. Cancelling an already-finished job leaves it terminal.
|
||
#[delete("/tts/speech/jobs/{id}")]
|
||
pub async fn cancel_speech_job_handler(
|
||
http_request: HttpRequest,
|
||
_claims: Claims,
|
||
path: web::Path<String>,
|
||
) -> impl Responder {
|
||
let parent_context = extract_context_from_request(&http_request);
|
||
let mut span =
|
||
global_tracer().start_with_context("http.tts.speech_job.cancel", &parent_context);
|
||
|
||
let Ok(id) = Uuid::parse_str(&path.into_inner()) else {
|
||
span.set_status(Status::error("invalid job id"));
|
||
return HttpResponse::BadRequest().json(json!({ "error": "invalid job id" }));
|
||
};
|
||
let status = with_job(id, |job| {
|
||
if !job.status.is_terminal() {
|
||
if let Some(h) = job.abort.take() {
|
||
h.abort();
|
||
}
|
||
job.status = TtsJobStatus::Cancelled;
|
||
job.finished_at = Some(Instant::now());
|
||
}
|
||
job.status
|
||
});
|
||
match status {
|
||
Some(s) => {
|
||
span.set_status(Status::Ok);
|
||
HttpResponse::Ok().json(json!({ "job_id": id.to_string(), "status": s }))
|
||
}
|
||
None => {
|
||
span.set_status(Status::error("job not found"));
|
||
HttpResponse::NotFound()
|
||
.json(json!({ "error": "TTS job not found (it may have expired)" }))
|
||
}
|
||
}
|
||
}
|
||
|
||
#[derive(Debug, Deserialize)]
|
||
pub struct ListVoicesQuery {
|
||
/// `?refresh=1` bypasses the voice-list cache and re-queries upstream
|
||
/// (which may spin up the TTS model).
|
||
#[serde(default)]
|
||
pub refresh: Option<String>,
|
||
}
|
||
|
||
/// GET /tts/voices — list the Chatterbox voice library. Served from an
|
||
/// in-memory cache when possible so browsing settings doesn't make llama-swap
|
||
/// load the TTS model (and evict the resident LLM); see VOICES_CACHE.
|
||
#[get("/tts/voices")]
|
||
pub async fn list_voices_handler(
|
||
http_request: HttpRequest,
|
||
_claims: Claims,
|
||
query: web::Query<ListVoicesQuery>,
|
||
app_state: web::Data<AppState>,
|
||
) -> impl Responder {
|
||
let parent_context = extract_context_from_request(&http_request);
|
||
let mut span = global_tracer().start_with_context("http.tts.voices.list", &parent_context);
|
||
|
||
let force = query
|
||
.refresh
|
||
.as_deref()
|
||
.is_some_and(|v| matches!(v, "1" | "true" | "yes"));
|
||
if !force && let Some(v) = cached_voices() {
|
||
span.set_attribute(KeyValue::new("tts.voices_cache_hit", true));
|
||
span.set_status(Status::Ok);
|
||
return HttpResponse::Ok().json(v);
|
||
}
|
||
|
||
let Some(client) = app_state.llamacpp.as_ref() else {
|
||
span.set_status(Status::error("tts backend not configured"));
|
||
return HttpResponse::ServiceUnavailable()
|
||
.json(json!({ "error": "TTS backend not configured" }));
|
||
};
|
||
match client.list_voices().await {
|
||
Ok(v) => {
|
||
store_voices_cache(&v);
|
||
span.set_attribute(KeyValue::new("tts.voices_cache_hit", false));
|
||
span.set_status(Status::Ok);
|
||
HttpResponse::Ok().json(v)
|
||
}
|
||
Err(e) => {
|
||
span.set_status(Status::error("list_voices failed"));
|
||
log::error!("list_voices failed: {:?}", e);
|
||
HttpResponse::BadGateway().json(json!({ "error": format!("{e}") }))
|
||
}
|
||
}
|
||
}
|
||
|
||
/// DELETE /tts/voices/{name} — remove a cloned voice from the library.
|
||
#[delete("/tts/voices/{name}")]
|
||
pub async fn delete_voice_handler(
|
||
http_request: HttpRequest,
|
||
_claims: Claims,
|
||
path: web::Path<String>,
|
||
app_state: web::Data<AppState>,
|
||
) -> impl Responder {
|
||
let parent_context = extract_context_from_request(&http_request);
|
||
let mut span = global_tracer().start_with_context("http.tts.voices.delete", &parent_context);
|
||
|
||
let Some(client) = app_state.llamacpp.as_ref() else {
|
||
span.set_status(Status::error("tts backend not configured"));
|
||
return HttpResponse::ServiceUnavailable()
|
||
.json(json!({ "error": "TTS backend not configured" }));
|
||
};
|
||
// Same charset rule as creation — a name that sanitizes differently was
|
||
// never a voice we created, and must not reach the upstream URL.
|
||
let raw = path.into_inner();
|
||
let name = match sanitize_voice_name(&raw) {
|
||
Some(n) if n == raw => n,
|
||
_ => {
|
||
span.set_status(Status::error("invalid voice name"));
|
||
return HttpResponse::BadRequest().json(json!({ "error": "invalid voice name" }));
|
||
}
|
||
};
|
||
span.set_attribute(KeyValue::new("tts.voice_name", name.clone()));
|
||
|
||
match client.delete_voice(&name).await {
|
||
Ok(v) => {
|
||
invalidate_voices_cache();
|
||
span.set_status(Status::Ok);
|
||
HttpResponse::Ok().json(v)
|
||
}
|
||
Err(e) => {
|
||
span.set_status(Status::error("delete_voice failed"));
|
||
log::error!("delete_voice failed: {:?}", e);
|
||
HttpResponse::BadGateway().json(json!({ "error": format!("{e}") }))
|
||
}
|
||
}
|
||
}
|
||
|
||
/// POST /tts/voices/upload — register a cloned voice from an uploaded audio
|
||
/// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`),
|
||
/// plus optional `start_seconds` / `duration_seconds` (text) selecting which
|
||
/// window of a longer recording becomes the reference clip.
|
||
#[post("/tts/voices/upload")]
|
||
pub async fn create_voice_upload_handler(
|
||
http_request: HttpRequest,
|
||
_claims: Claims,
|
||
mut payload: Multipart,
|
||
app_state: web::Data<AppState>,
|
||
) -> impl Responder {
|
||
let parent_context = extract_context_from_request(&http_request);
|
||
let mut span = global_tracer().start_with_context("http.tts.voices.upload", &parent_context);
|
||
|
||
let Some(client) = app_state.llamacpp.as_ref() else {
|
||
span.set_status(Status::error("tts backend not configured"));
|
||
return HttpResponse::ServiceUnavailable()
|
||
.json(json!({ "error": "TTS backend not configured" }));
|
||
};
|
||
|
||
let mut voice_name: Option<String> = None;
|
||
let mut start_field: Option<String> = None;
|
||
let mut duration_field: Option<String> = None;
|
||
let mut file_bytes = BytesMut::new();
|
||
let mut filename = "voice.wav".to_string();
|
||
|
||
while let Some(Ok(mut part)) = payload.next().await {
|
||
// Capture disposition fields up front so the immutable borrow ends
|
||
// before we mutably stream the part body (mirrors handlers/image.rs).
|
||
let (fname_opt, name_opt) = {
|
||
let cd = part.content_disposition();
|
||
(
|
||
cd.and_then(|c| c.get_filename()).map(|s| s.to_string()),
|
||
cd.and_then(|c| c.get_name()).map(|s| s.to_string()),
|
||
)
|
||
};
|
||
|
||
if let Some(fname) = fname_opt {
|
||
filename = fname;
|
||
while let Some(Ok(data)) = part.next().await {
|
||
if file_bytes.len() + data.len() > MAX_VOICE_UPLOAD_BYTES {
|
||
span.set_status(Status::error("voice clip exceeds limit"));
|
||
return HttpResponse::PayloadTooLarge()
|
||
.json(json!({ "error": "voice clip exceeds 25 MB" }));
|
||
}
|
||
file_bytes.put(data);
|
||
}
|
||
} else if matches!(
|
||
name_opt.as_deref(),
|
||
Some("voice_name" | "start_seconds" | "duration_seconds")
|
||
) {
|
||
let field = name_opt.as_deref().unwrap().to_string();
|
||
let mut buf = BytesMut::new();
|
||
while let Some(Ok(data)) = part.next().await {
|
||
buf.put(data);
|
||
}
|
||
let text = String::from_utf8_lossy(&buf).trim().to_string();
|
||
match field.as_str() {
|
||
"voice_name" => voice_name = Some(text),
|
||
"start_seconds" => start_field = Some(text),
|
||
_ => duration_field = Some(text),
|
||
}
|
||
} else {
|
||
while let Some(Ok(_)) = part.next().await {}
|
||
}
|
||
}
|
||
|
||
// Empty text parts are treated as absent; anything else must parse, so a
|
||
// client bug ("abc") fails loudly instead of silently cloning from 0s.
|
||
let parse_secs = |field: Option<&String>, name: &str| -> Result<Option<f64>, String> {
|
||
match field.map(|s| s.as_str()).filter(|s| !s.is_empty()) {
|
||
None => Ok(None),
|
||
Some(s) => s
|
||
.parse::<f64>()
|
||
.map(Some)
|
||
.map_err(|_| format!("{name} must be a number of seconds")),
|
||
}
|
||
};
|
||
let window = parse_secs(start_field.as_ref(), "start_seconds").and_then(|start| {
|
||
parse_secs(duration_field.as_ref(), "duration_seconds")
|
||
.and_then(|dur| resolve_ref_window(start, dur))
|
||
});
|
||
let (ref_start, ref_duration) = match window {
|
||
Ok(w) => w,
|
||
Err(msg) => {
|
||
span.set_status(Status::error("invalid reference window"));
|
||
return HttpResponse::BadRequest().json(json!({ "error": msg }));
|
||
}
|
||
};
|
||
|
||
let Some(name) = voice_name.as_deref().and_then(sanitize_voice_name) else {
|
||
span.set_status(Status::error("voice_name is required"));
|
||
return HttpResponse::BadRequest()
|
||
.json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
|
||
};
|
||
// Tag the name with the ref-clip length (e.g. `grandma-30s`) so the
|
||
// library shows which reference length produced each clone.
|
||
let name = append_ref_window(&name, ref_start, ref_duration.round().max(1.0) as u32);
|
||
if file_bytes.is_empty() {
|
||
span.set_status(Status::error("voice_file is required"));
|
||
return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" }));
|
||
}
|
||
span.set_attribute(KeyValue::new("tts.voice_name", name.clone()));
|
||
span.set_attribute(KeyValue::new("tts.upload_bytes", file_bytes.len() as i64));
|
||
|
||
// Normalize to WAV so any device format (e.g. .aac / .opus, which Chatterbox
|
||
// rejects by extension) is accepted.
|
||
let src_ext = Path::new(&filename).extension().and_then(|e| e.to_str());
|
||
let wav =
|
||
match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext, ref_start, ref_duration).await {
|
||
Ok(w) => w,
|
||
Err(e) => {
|
||
span.set_status(Status::error("audio decode failed"));
|
||
log::error!("voice upload transcode failed: {:?}", e);
|
||
return HttpResponse::BadRequest()
|
||
.json(json!({ "error": "couldn't decode that audio file" }));
|
||
}
|
||
};
|
||
|
||
match client
|
||
.create_voice(&name, wav, "reference.wav", "audio/wav")
|
||
.await
|
||
{
|
||
Ok(v) => {
|
||
invalidate_voices_cache();
|
||
span.set_status(Status::Ok);
|
||
HttpResponse::Ok().json(v)
|
||
}
|
||
Err(e) => {
|
||
span.set_status(Status::error("create_voice failed"));
|
||
log::error!("create_voice (upload) failed: {:?}", e);
|
||
HttpResponse::BadGateway().json(json!({ "error": format!("{e}") }))
|
||
}
|
||
}
|
||
}
|
||
|
||
#[derive(Debug, Deserialize)]
|
||
pub struct CreateVoiceFromLibraryRequest {
|
||
pub voice_name: String,
|
||
/// Library-relative path to an audio or video file.
|
||
pub path: String,
|
||
#[serde(default)]
|
||
pub library: Option<String>,
|
||
/// Offset into the source where the reference window begins (default 0) —
|
||
/// lets the client pick the clean-speech section of a long recording.
|
||
#[serde(default)]
|
||
pub start_seconds: Option<f64>,
|
||
/// Reference window length; clamped to LLAMA_SWAP_TTS_REF_SECONDS.
|
||
#[serde(default)]
|
||
pub duration_seconds: Option<f64>,
|
||
}
|
||
|
||
/// POST /tts/voices/from-library — register a cloned voice from a file already
|
||
/// in a library. Audio and video alike are ffmpeg-normalized to a mono 24 kHz
|
||
/// WAV reference clip (window selected by start/duration_seconds, length
|
||
/// capped by LLAMA_SWAP_TTS_REF_SECONDS).
|
||
#[post("/tts/voices/from-library")]
|
||
pub async fn create_voice_from_library_handler(
|
||
http_request: HttpRequest,
|
||
_claims: Claims,
|
||
req: web::Json<CreateVoiceFromLibraryRequest>,
|
||
app_state: web::Data<AppState>,
|
||
) -> impl Responder {
|
||
let parent_context = extract_context_from_request(&http_request);
|
||
let mut span =
|
||
global_tracer().start_with_context("http.tts.voices.from_library", &parent_context);
|
||
|
||
let Some(client) = app_state.llamacpp.as_ref() else {
|
||
span.set_status(Status::error("tts backend not configured"));
|
||
return HttpResponse::ServiceUnavailable()
|
||
.json(json!({ "error": "TTS backend not configured" }));
|
||
};
|
||
let Some(voice_name) = sanitize_voice_name(&req.voice_name) else {
|
||
span.set_status(Status::error("voice_name is required"));
|
||
return HttpResponse::BadRequest()
|
||
.json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
|
||
};
|
||
let (ref_start, ref_duration) =
|
||
match resolve_ref_window(req.start_seconds, req.duration_seconds) {
|
||
Ok(w) => w,
|
||
Err(msg) => {
|
||
span.set_status(Status::error("invalid reference window"));
|
||
return HttpResponse::BadRequest().json(json!({ "error": msg }));
|
||
}
|
||
};
|
||
// Tag the name with the ref-clip length (e.g. `grandma-30s`) so the
|
||
// library shows which reference length produced each clone.
|
||
let voice_name =
|
||
append_ref_window(&voice_name, ref_start, ref_duration.round().max(1.0) as u32);
|
||
|
||
let library = match libraries::resolve_library_param(&app_state, req.library.as_deref()) {
|
||
Ok(Some(l)) => l,
|
||
Ok(None) => app_state.primary_library(),
|
||
Err(msg) => {
|
||
span.set_status(Status::error("invalid library"));
|
||
return HttpResponse::BadRequest().json(json!({ "error": msg }));
|
||
}
|
||
};
|
||
|
||
// is_valid_full_path confines the path to the library root (no traversal).
|
||
let abs = match is_valid_full_path(&library.root_path, &req.path, false) {
|
||
Some(p) if p.exists() => p,
|
||
_ => {
|
||
span.set_status(Status::error("file not found"));
|
||
return HttpResponse::NotFound().json(json!({ "error": "file not found in library" }));
|
||
}
|
||
};
|
||
|
||
// Only real audio/video sources are valid voice references — refuse to
|
||
// slurp arbitrary library files into memory / ffmpeg.
|
||
if !is_audio_file(&abs) && !is_video_file(&abs) {
|
||
span.set_status(Status::error("not an audio/video file"));
|
||
return HttpResponse::BadRequest()
|
||
.json(json!({ "error": "file is not an audio or video file" }));
|
||
}
|
||
span.set_attribute(KeyValue::new("tts.voice_name", voice_name.clone()));
|
||
|
||
let wav = match prepare_reference_audio(&abs, ref_start, ref_duration).await {
|
||
Ok(b) => b,
|
||
Err(e) => {
|
||
span.set_status(Status::error("audio decode failed"));
|
||
log::error!("voice reference prep failed for {:?}: {:?}", abs, e);
|
||
return HttpResponse::BadRequest()
|
||
.json(json!({ "error": "couldn't decode that file's audio" }));
|
||
}
|
||
};
|
||
|
||
match client
|
||
.create_voice(&voice_name, wav, "reference.wav", "audio/wav")
|
||
.await
|
||
{
|
||
Ok(v) => {
|
||
invalidate_voices_cache();
|
||
span.set_status(Status::Ok);
|
||
HttpResponse::Ok().json(v)
|
||
}
|
||
Err(e) => {
|
||
span.set_status(Status::error("create_voice failed"));
|
||
log::error!("create_voice (from-library) failed: {:?}", e);
|
||
HttpResponse::BadGateway().json(json!({ "error": format!("{e}") }))
|
||
}
|
||
}
|
||
}
|
||
|
||
/// Read a library file (audio or video) as a Chatterbox-ready reference: ffmpeg
|
||
/// decodes/extracts its audio to mono 24 kHz WAV. Reading straight from the
|
||
/// library path avoids slurping a (possibly large) video into memory.
|
||
async fn prepare_reference_audio(abs: &Path, start: f64, duration: f64) -> anyhow::Result<Vec<u8>> {
|
||
run_ffmpeg_to_wav(&abs.to_string_lossy(), start, duration).await
|
||
}
|
||
|
||
#[cfg(test)]
mod tests {
    use super::*;

    /// Reference "now" for the sweep tests, pushed far enough into the future
    /// that every `now - <duration>` below is representable. Subtracting from a
    /// bare `Instant::now()` can panic on platforms whose `Instant` cannot go
    /// below its epoch (e.g. shortly after boot). `sweep_stale_jobs` takes
    /// `now` as a parameter, so only relative offsets matter.
    fn sweep_reference_now() -> Instant {
        Instant::now() + TTS_JOB_MAX_AGE + TTS_JOB_RESULT_TTL
    }

    #[test]
    fn sanitize_voice_name_keeps_safe_chars() {
        assert_eq!(sanitize_voice_name("m").as_deref(), Some("m"));
        assert_eq!(
            sanitize_voice_name("  Cameron ").as_deref(),
            Some("Cameron")
        );
        assert_eq!(
            sanitize_voice_name("voice_01-a").as_deref(),
            Some("voice_01-a")
        );
    }

    #[test]
    fn sanitize_voice_name_strips_unsafe_chars() {
        // Path separators / dots / spaces become '-' and are trimmed at edges.
        assert_eq!(sanitize_voice_name("a b.c").as_deref(), Some("a-b-c"));
        assert_eq!(
            sanitize_voice_name("../etc/passwd").as_deref(),
            Some("etc-passwd")
        );
    }

    #[test]
    fn sanitize_voice_name_rejects_empty_or_all_unsafe() {
        assert_eq!(sanitize_voice_name(""), None);
        assert_eq!(sanitize_voice_name("   "), None);
        assert_eq!(sanitize_voice_name("../../"), None);
        assert_eq!(sanitize_voice_name("...."), None);
    }

    #[test]
    fn sanitize_voice_name_bounds_length() {
        let long = "a".repeat(200);
        assert_eq!(sanitize_voice_name(&long).unwrap().len(), 64);
    }

    #[test]
    fn append_ref_window_tags_name() {
        assert_eq!(append_ref_window("grandma", 0.0, 30), "grandma-30s");
        assert_eq!(append_ref_window("voice_01", 0.0, 15), "voice_01-15s");
    }

    #[test]
    fn append_ref_window_includes_nonzero_start() {
        // Sub-minute starts stay in seconds; longer ones read as XmYYs since
        // ':' isn't allowed in voice names.
        assert_eq!(append_ref_window("grandma", 45.0, 30), "grandma-at45s-30s");
        assert_eq!(
            append_ref_window("grandma", 92.4, 30),
            "grandma-at1m32s-30s"
        );
        assert_eq!(
            append_ref_window("grandma", 600.0, 12),
            "grandma-at10m00s-12s"
        );
        // A start that rounds to zero is "from the start".
        assert_eq!(append_ref_window("grandma", 0.3, 30), "grandma-30s");
    }

    #[test]
    fn append_ref_window_is_idempotent_for_same_window() {
        assert_eq!(append_ref_window("grandma-30s", 0.0, 30), "grandma-30s");
        assert_eq!(
            append_ref_window("grandma-at45s-30s", 45.0, 30),
            "grandma-at45s-30s"
        );
        // A different window still appends — that's the comparison use-case.
        assert_eq!(append_ref_window("grandma-15s", 0.0, 30), "grandma-15s-30s");
        assert_eq!(
            append_ref_window("grandma-30s", 45.0, 30),
            "grandma-30s-at45s-30s"
        );
    }

    #[test]
    fn append_ref_window_keeps_64_char_bound() {
        let long = "a".repeat(64);
        let tagged = append_ref_window(&long, 0.0, 30);
        assert_eq!(tagged.len(), 64);
        assert!(tagged.ends_with("-30s"));

        let tagged = append_ref_window(&long, 92.0, 30);
        assert_eq!(tagged.len(), 64);
        assert!(tagged.ends_with("-at1m32s-30s"));
    }

    #[test]
    fn resolve_ref_window_defaults_to_start_of_clip_at_cap_length() {
        // Reads the live cap rather than mutating LLAMA_SWAP_TTS_REF_SECONDS:
        // env mutation flakes under the parallel suite (see env_dispatch).
        let cap = f64::from(tts_ref_seconds());
        assert_eq!(resolve_ref_window(None, None), Ok((0.0, cap)));
    }

    #[test]
    fn resolve_ref_window_accepts_offset_and_clamps_duration() {
        let cap = f64::from(tts_ref_seconds());
        assert_eq!(resolve_ref_window(Some(92.5), None), Ok((92.5, cap)));
        assert_eq!(resolve_ref_window(Some(10.0), Some(12.0)), Ok((10.0, 12.0)));
        // Longer-than-cap windows are bounded, not rejected.
        assert_eq!(resolve_ref_window(None, Some(cap + 100.0)), Ok((0.0, cap)));
    }

    #[test]
    fn resolve_ref_window_rejects_garbage() {
        assert!(resolve_ref_window(Some(-1.0), None).is_err());
        assert!(resolve_ref_window(Some(f64::NAN), None).is_err());
        assert!(resolve_ref_window(Some(f64::INFINITY), None).is_err());
        assert!(resolve_ref_window(None, Some(0.0)).is_err());
        assert!(resolve_ref_window(None, Some(-5.0)).is_err());
        assert!(resolve_ref_window(None, Some(f64::NAN)).is_err());
    }

    #[test]
    fn sweep_drops_expired_results_and_keeps_live_jobs() {
        let now = sweep_reference_now();
        let mk = |status: TtsJobStatus, created: Instant, finished: Option<Instant>| TtsJob {
            status,
            format: "mp3".into(),
            audio_base64: None,
            error: None,
            created_at: created,
            finished_at: finished,
            abort: None,
        };
        let mut jobs = HashMap::new();
        let live = Uuid::new_v4();
        let fresh_done = Uuid::new_v4();
        let stale_done = Uuid::new_v4();
        jobs.insert(live, mk(TtsJobStatus::Running, now, None));
        jobs.insert(
            fresh_done,
            mk(TtsJobStatus::Done, now, Some(now - Duration::from_secs(60))),
        );
        jobs.insert(
            stale_done,
            mk(
                TtsJobStatus::Done,
                now - TTS_JOB_MAX_AGE / 2,
                Some(now - TTS_JOB_RESULT_TTL),
            ),
        );

        sweep_stale_jobs(&mut jobs, now);
        assert!(jobs.contains_key(&live));
        assert!(jobs.contains_key(&fresh_done));
        assert!(!jobs.contains_key(&stale_done));
    }

    #[test]
    fn sweep_drops_jobs_past_max_age_even_if_unfinished() {
        let now = sweep_reference_now();
        let mut jobs = HashMap::new();
        let ancient = Uuid::new_v4();
        jobs.insert(
            ancient,
            TtsJob {
                status: TtsJobStatus::Running,
                format: "mp3".into(),
                audio_base64: None,
                error: None,
                created_at: now - TTS_JOB_MAX_AGE,
                finished_at: None,
                abort: None,
            },
        );
        sweep_stale_jobs(&mut jobs, now);
        assert!(jobs.is_empty());
    }

    #[test]
    fn voices_cache_roundtrip_and_invalidation() {
        invalidate_voices_cache();
        assert!(cached_voices().is_none());
        let v = json!({ "voices": [{ "name": "m-30s" }], "count": 1 });
        store_voices_cache(&v);
        assert_eq!(cached_voices(), Some(v));
        invalidate_voices_cache();
        assert!(cached_voices().is_none());
    }

    #[test]
    fn clean_for_tts_strips_markdown() {
        assert_eq!(
            clean_for_tts("**Bold** and _italic_ and `code`"),
            "Bold and italic and code"
        );
        assert_eq!(clean_for_tts("# Title\n\nbody"), "Title\nbody");
        assert_eq!(
            clean_for_tts("See [docs](http://x.com) now"),
            "See docs now"
        );
        assert_eq!(clean_for_tts("- one\n- two"), "one\ntwo");
    }

    #[test]
    fn clean_for_tts_strips_emoji_and_urls() {
        assert_eq!(clean_for_tts("Hello 😀 world 🎉"), "Hello world");
        assert_eq!(
            clean_for_tts("visit https://example.com today"),
            "visit today"
        );
        // ZWJ-glued emoji sequence is fully removed. The joiners (U+200D) are
        // written as escapes because they are invisible in source and easily
        // lost when the file is copied or re-encoded.
        assert_eq!(
            clean_for_tts("family \u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467} photo"),
            "family photo"
        );
    }

    #[test]
    fn clean_for_tts_collapses_blank_lines_to_single_break() {
        // Chatterbox pauses (sometimes ~20s) per blank line, so paragraph
        // breaks must collapse to a single newline.
        assert_eq!(clean_for_tts("para one\n\npara two"), "para one\npara two");
        assert_eq!(clean_for_tts("a\n\n\n\nb"), "a\nb");
        // Whitespace-only "blank" lines collapse too.
        assert_eq!(clean_for_tts("a\n  \t \nb"), "a\nb");
        // A single newline is left alone.
        assert_eq!(clean_for_tts("a\nb"), "a\nb");
    }

    #[test]
    fn clean_for_tts_preserves_bracket_tags() {
        // Non-turbo Chatterbox ignores these; a future Turbo uses them as
        // paralinguistic cues — so we must not strip them.
        assert_eq!(clean_for_tts("hello [laugh] there"), "hello [laugh] there");
    }
}
|