From cab867da609a3c8356a9e36c62faf86da75a7da1 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Wed, 3 Jun 2026 14:02:56 -0400 Subject: [PATCH] Serialize /tts/speech with a single permit; 429 when busy The Chatterbox wrapper has no internal lock or cancellation, so concurrent synth requests contend on the single GPU and abandoned (timed-out) jobs cascade into stacked slowness. Gate synthesis behind a one-permit semaphore and fast-fail concurrent requests with 429 instead of queueing. Co-Authored-By: Claude Opus 4.8 (1M context) --- README.md | 3 ++- src/ai/tts.rs | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 58ddc81..39ebe30 100644 --- a/README.md +++ b/README.md @@ -154,7 +154,8 @@ is built whenever that's set — independent of `LLM_BACKEND`). Endpoints: - `POST /tts/speech` — body `{ text, voice?, format?, exaggeration?, cfg_weight?, temperature? }`; returns `{ audio_base64, format }`. Input is cleaned server-side (markdown + emoji stripped) and the generation knobs are clamped - to Chatterbox's ranges. + to Chatterbox's ranges. Synthesis is serialized (one at a time — the upstream + has no GPU lock of its own); a concurrent request gets a fast `429`. - `GET /tts/voices` — list the voice library. - `POST /tts/voices/upload` — multipart `voice_name` + `voice_file`; clone a voice from an uploaded clip (≤25 MB). diff --git a/src/ai/tts.rs b/src/ai/tts.rs index 9c98bee..2c2009b 100644 --- a/src/ai/tts.rs +++ b/src/ai/tts.rs @@ -18,6 +18,7 @@ use serde::{Deserialize, Serialize}; use serde_json::json; use std::path::Path; use std::sync::LazyLock; +use tokio::sync::Semaphore; use crate::data::Claims; use crate::file_types::{is_audio_file, is_video_file}; @@ -31,6 +32,14 @@ use crate::state::AppState; /// upload can't balloon ImageApi memory before we ever forward it. 
const MAX_VOICE_UPLOAD_BYTES: usize = 25 * 1024 * 1024; // 25 MB +/// Serialize speech synthesis: the Chatterbox server has no internal lock or +/// queue, so concurrent requests contend on the single GPU and cascade into +/// timeouts. One permit; when busy we fast-fail with 429 rather than queue — +/// the app surfaces "busy" immediately, and typical jobs clear in well under a +/// minute. (An abandoned upstream job can still occupy the GPU until it +/// finishes — that's a wrapper limitation; the chunked-queue plan fixes it.) +static TTS_PERMIT: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(1)); + /// Sanitize a user-supplied voice name. The name is forwarded to Chatterbox /// where it becomes a filename in the voice-library directory, so we restrict /// it to a safe charset (alphanumerics, dash, underscore) — no path @@ -235,6 +244,14 @@ pub async fn tts_speech_handler( let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0)); let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0)); + // One synthesis at a time (see TTS_PERMIT) — fast-fail when busy. + let Ok(_permit) = TTS_PERMIT.try_acquire() else { + span.set_status(Status::error("tts busy")); + return HttpResponse::TooManyRequests().json(json!({ + "error": "TTS is busy with another request — try again shortly" + })); + }; + match client .text_to_speech(&text, voice, format, exaggeration, cfg_weight, temperature) .await