Feature/tts integration #103
@@ -154,7 +154,8 @@ is built whenever that's set — independent of `LLM_BACKEND`). Endpoints:
|
|||||||
- `POST /tts/speech` — body `{ text, voice?, format?, exaggeration?, cfg_weight?,
|
- `POST /tts/speech` — body `{ text, voice?, format?, exaggeration?, cfg_weight?,
|
||||||
temperature? }`; returns `{ audio_base64, format }`. Input is cleaned
|
temperature? }`; returns `{ audio_base64, format }`. Input is cleaned
|
||||||
server-side (markdown + emoji stripped) and the generation knobs are clamped
|
server-side (markdown + emoji stripped) and the generation knobs are clamped
|
||||||
to Chatterbox's ranges.
|
to Chatterbox's ranges. Synthesis is serialized (one at a time — the upstream
|
||||||
|
has no GPU lock of its own); a concurrent request is rejected immediately with `429 Too Many Requests` instead of being queued.
|
||||||
- `GET /tts/voices` — list the voice library.
|
- `GET /tts/voices` — list the voice library.
|
||||||
- `POST /tts/voices/upload` — multipart `voice_name` + `voice_file`; clone a
|
- `POST /tts/voices/upload` — multipart `voice_name` + `voice_file`; clone a
|
||||||
voice from an uploaded clip (≤25 MB).
|
voice from an uploaded clip (≤25 MB).
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ use serde::{Deserialize, Serialize};
|
|||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::sync::LazyLock;
|
use std::sync::LazyLock;
|
||||||
|
use tokio::sync::Semaphore;
|
||||||
|
|
||||||
use crate::data::Claims;
|
use crate::data::Claims;
|
||||||
use crate::file_types::{is_audio_file, is_video_file};
|
use crate::file_types::{is_audio_file, is_video_file};
|
||||||
@@ -31,6 +32,14 @@ use crate::state::AppState;
|
|||||||
/// upload can't balloon ImageApi memory before we ever forward it.
|
/// upload can't balloon ImageApi memory before we ever forward it.
|
||||||
const MAX_VOICE_UPLOAD_BYTES: usize = 25 * 1024 * 1024; // 25 MB
|
const MAX_VOICE_UPLOAD_BYTES: usize = 25 * 1024 * 1024; // 25 MB
|
||||||
|
|
||||||
|
/// Serialize speech synthesis: the Chatterbox server has no internal lock or
|
||||||
|
/// queue, so concurrent requests contend on the single GPU and cascade into
|
||||||
|
/// timeouts. One permit; when busy we fast-fail with 429 rather than queue —
|
||||||
|
/// the app surfaces "busy" immediately, and typical jobs clear in well under a
|
||||||
|
/// minute. (An abandoned upstream job can still occupy the GPU until it
|
||||||
|
/// finishes — that's a wrapper limitation; the chunked-queue plan fixes it.)
|
||||||
|
// Single-permit semaphore shared by every request; `LazyLock` defers its
// construction until first use. The speech handler takes the permit with
// `try_acquire`, so a second caller is refused instead of waiting.
static TTS_PERMIT: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(1));
|
||||||
|
|
||||||
/// Sanitize a user-supplied voice name. The name is forwarded to Chatterbox
|
/// Sanitize a user-supplied voice name. The name is forwarded to Chatterbox
|
||||||
/// where it becomes a filename in the voice-library directory, so we restrict
|
/// where it becomes a filename in the voice-library directory, so we restrict
|
||||||
/// it to a safe charset (alphanumerics, dash, underscore) — no path
|
/// it to a safe charset (alphanumerics, dash, underscore) — no path
|
||||||
@@ -235,6 +244,14 @@ pub async fn tts_speech_handler(
|
|||||||
let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0));
|
let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0));
|
||||||
let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0));
|
let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0));
|
||||||
|
|
||||||
|
// One synthesis at a time (see TTS_PERMIT) — fast-fail when busy.
|
||||||
|
let Ok(_permit) = TTS_PERMIT.try_acquire() else {
|
||||||
|
span.set_status(Status::error("tts busy"));
|
||||||
|
return HttpResponse::TooManyRequests().json(json!({
|
||||||
|
"error": "TTS is busy with another request — try again shortly"
|
||||||
|
}));
|
||||||
|
};
|
||||||
|
|
||||||
match client
|
match client
|
||||||
.text_to_speech(&text, voice, format, exaggeration, cfg_weight, temperature)
|
.text_to_speech(&text, voice, format, exaggeration, cfg_weight, temperature)
|
||||||
.await
|
.await
|
||||||
|
|||||||
Reference in New Issue
Block a user