Add TTS voice deletion, async speech jobs, voice-list cache, ref-seconds name tags
- DELETE /tts/voices/{name}: remove a cloned voice via the llama-swap
passthrough (upstream chatterbox-tts-api exposes DELETE /voices/{name}).
- POST/GET/DELETE /tts/speech/jobs: durable job flow for long syntheses —
dispatch returns 202 + job id, the synth queues on the GPU permit instead
of fast-failing 429, and clients poll for the result (kept ~10 min).
- GET /tts/voices now serves an in-memory cache so listing voices doesn't
make llama-swap spin up the TTS model (evicting the resident LLM);
invalidated on create/delete, ?refresh=1 forces an upstream re-query.
- Created voice names are tagged with LLAMA_SWAP_TTS_REF_SECONDS (e.g.
grandma-30s) so the library shows which ref length produced each clone.
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -156,12 +156,26 @@ is built whenever that's set — independent of `LLM_BACKEND`). Endpoints:
|
|||||||
server-side (markdown + emoji stripped) and the generation knobs are clamped
|
server-side (markdown + emoji stripped) and the generation knobs are clamped
|
||||||
to Chatterbox's ranges. Synthesis is serialized (one at a time — the upstream
|
to Chatterbox's ranges. Synthesis is serialized (one at a time — the upstream
|
||||||
has no GPU lock of its own); a concurrent request gets a fast `429`.
|
has no GPU lock of its own); a concurrent request gets a fast `429`.
|
||||||
- `GET /tts/voices` — list the voice library.
|
- `POST /tts/speech/jobs` — durable variant for long syntheses: same body as
|
||||||
|
`/tts/speech`, returns `202 { job_id, status }` immediately. Jobs queue on the
|
||||||
|
GPU permit instead of fast-failing `429`.
|
||||||
|
- `GET /tts/speech/jobs/{id}` — poll a job: `{ job_id, status, format,
|
||||||
|
audio_base64?, error? }` with status `queued|running|done|error|cancelled`.
|
||||||
|
Results are kept in memory ~10 min after completion, then the job 404s.
|
||||||
|
- `DELETE /tts/speech/jobs/{id}` — cancel a queued/running job.
|
||||||
|
- `GET /tts/voices` — list the voice library. Served from an in-memory cache
|
||||||
|
(so the listing doesn't make llama-swap spin up the TTS model and evict the
|
||||||
|
resident LLM); pass `?refresh=1` to force an upstream re-query. The cache is
|
||||||
|
invalidated by voice create/delete.
|
||||||
- `POST /tts/voices/upload` — multipart `voice_name` + `voice_file`; clone a
|
- `POST /tts/voices/upload` — multipart `voice_name` + `voice_file`; clone a
|
||||||
voice from an uploaded clip (≤25 MB).
|
voice from an uploaded clip (≤25 MB).
|
||||||
- `POST /tts/voices/from-library` — body `{ voice_name, path, library? }`; clone
|
- `POST /tts/voices/from-library` — body `{ voice_name, path, library? }`; clone
|
||||||
from a library file (audio forwarded as-is; video has its audio extracted via
|
from a library file (audio forwarded as-is; video has its audio extracted via
|
||||||
ffmpeg).
|
ffmpeg).
|
||||||
|
- `DELETE /tts/voices/{name}` — remove a cloned voice from the library.
|
||||||
|
|
||||||
|
Created voice names are tagged with the ref-clip cap in effect (e.g.
|
||||||
|
`grandma-30s`) so the library shows which reference length produced each clone.
|
||||||
|
|
||||||
Env:
|
Env:
|
||||||
- `LLAMA_SWAP_TTS_MODEL` - TTS model id in llama-swap's `config.yaml` [default: `chatterbox`]
|
- `LLAMA_SWAP_TTS_MODEL` - TTS model id in llama-swap's `config.yaml` [default: `chatterbox`]
|
||||||
|
|||||||
@@ -253,6 +253,34 @@ impl LlamaCppClient {
|
|||||||
resp.json().await.context("parsing create_voice response")
|
resp.json().await.context("parsing create_voice response")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Delete a cloned voice from the Chatterbox voice library
|
||||||
|
/// (`DELETE /voices/{name}` on the upstream, via llama-swap passthrough).
|
||||||
|
pub async fn delete_voice(&self, voice_name: &str) -> Result<Value> {
|
||||||
|
let url = format!(
|
||||||
|
"{}/upstream/{}/voices/{}",
|
||||||
|
self.swap_root(),
|
||||||
|
self.tts_model,
|
||||||
|
voice_name
|
||||||
|
);
|
||||||
|
let resp = self
|
||||||
|
.client
|
||||||
|
.delete(&url)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("DELETE {} failed", url))?;
|
||||||
|
|
||||||
|
if !resp.status().is_success() {
|
||||||
|
let status = resp.status();
|
||||||
|
let text = resp.text().await.unwrap_or_default();
|
||||||
|
bail!("llama-swap delete_voice failed: {} — {}", status, text);
|
||||||
|
}
|
||||||
|
// Some upstreams reply with an empty body on delete.
|
||||||
|
Ok(resp
|
||||||
|
.json()
|
||||||
|
.await
|
||||||
|
.unwrap_or_else(|_| json!({ "status": "deleted" })))
|
||||||
|
}
|
||||||
|
|
||||||
/// Translate canonical messages to the OpenAI-compatible wire shape.
|
/// Translate canonical messages to the OpenAI-compatible wire shape.
|
||||||
/// Behaviorally identical to `OpenRouterClient::messages_to_openai` —
|
/// Behaviorally identical to `OpenRouterClient::messages_to_openai` —
|
||||||
/// stringify tool-call arguments, rewrite images into content-parts, attach
|
/// stringify tool-call arguments, rewrite images into content-parts, attach
|
||||||
|
|||||||
+3
-2
@@ -37,8 +37,9 @@ pub use llm_client::{
|
|||||||
pub use ollama::{EMBEDDING_MODEL, OllamaClient};
|
pub use ollama::{EMBEDDING_MODEL, OllamaClient};
|
||||||
pub use sms_client::{SmsApiClient, SmsMessage};
|
pub use sms_client::{SmsApiClient, SmsMessage};
|
||||||
pub use tts::{
|
pub use tts::{
|
||||||
create_voice_from_library_handler, create_voice_upload_handler, list_voices_handler,
|
cancel_speech_job_handler, create_speech_job_handler, create_voice_from_library_handler,
|
||||||
tts_speech_handler,
|
create_voice_upload_handler, delete_voice_handler, list_voices_handler,
|
||||||
|
speech_job_status_handler, tts_speech_handler,
|
||||||
};
|
};
|
||||||
|
|
||||||
/// Display name used for the user in message transcripts and first-person
|
/// Display name used for the user in message transcripts and first-person
|
||||||
|
|||||||
+538
-13
@@ -6,7 +6,7 @@
|
|||||||
// (audio read directly; video has its audio track extracted via ffmpeg).
|
// (audio read directly; video has its audio track extracted via ffmpeg).
|
||||||
|
|
||||||
use actix_multipart::Multipart;
|
use actix_multipart::Multipart;
|
||||||
use actix_web::{HttpRequest, HttpResponse, Responder, get, post, web};
|
use actix_web::{HttpRequest, HttpResponse, Responder, delete, get, post, web};
|
||||||
use anyhow::Context;
|
use anyhow::Context;
|
||||||
use base64::Engine;
|
use base64::Engine;
|
||||||
use bytes::{BufMut, BytesMut};
|
use bytes::{BufMut, BytesMut};
|
||||||
@@ -15,10 +15,13 @@ use opentelemetry::KeyValue;
|
|||||||
use opentelemetry::trace::{Span, Status, Tracer};
|
use opentelemetry::trace::{Span, Status, Tracer};
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use serde_json::json;
|
use serde_json::{Value, json};
|
||||||
|
use std::collections::HashMap;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
use std::sync::LazyLock;
|
use std::sync::{LazyLock, Mutex as StdMutex};
|
||||||
|
use std::time::{Duration, Instant};
|
||||||
use tokio::sync::Semaphore;
|
use tokio::sync::Semaphore;
|
||||||
|
use uuid::Uuid;
|
||||||
|
|
||||||
use crate::data::Claims;
|
use crate::data::Claims;
|
||||||
use crate::file_types::{is_audio_file, is_video_file};
|
use crate::file_types::{is_audio_file, is_video_file};
|
||||||
@@ -40,6 +43,105 @@ const MAX_VOICE_UPLOAD_BYTES: usize = 25 * 1024 * 1024; // 25 MB
|
|||||||
/// finishes — that's a wrapper limitation; the chunked-queue plan fixes it.)
|
/// finishes — that's a wrapper limitation; the chunked-queue plan fixes it.)
|
||||||
static TTS_PERMIT: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(1));
|
static TTS_PERMIT: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(1));
|
||||||
|
|
||||||
|
// --- Voice-list cache --------------------------------------------------------
|
||||||
|
|
||||||
|
/// Cached raw voice-library JSON (`None` while the cache is cold or has just
/// been invalidated). llama-swap's `/upstream/<model>/voices` passthrough spins
/// the TTS model up just to answer a listing — which can evict the resident
/// LLM — so we serve a cached copy and only hit upstream on a cold cache, an
/// explicit `?refresh=1`, or after a voice create/delete invalidates it (the
/// TTS model is already loaded right then anyway).
static VOICES_CACHE: LazyLock<StdMutex<Option<Value>>> = LazyLock::new(|| StdMutex::new(None));
|
||||||
|
|
||||||
|
fn cached_voices() -> Option<Value> {
|
||||||
|
VOICES_CACHE.lock().unwrap().clone()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Replace the cached voice list with a copy of `v`.
fn store_voices_cache(v: &Value) {
    // Copy first so the lock is only held for the swap itself.
    let fresh = v.clone();
    let mut slot = VOICES_CACHE.lock().unwrap();
    *slot = Some(fresh);
}
|
||||||
|
|
||||||
|
fn invalidate_voices_cache() {
|
||||||
|
*VOICES_CACHE.lock().unwrap() = None;
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- Async speech jobs -------------------------------------------------------
|
||||||
|
//
|
||||||
|
// Synthesizing a long insight can take minutes — too long to hang one HTTP
|
||||||
|
// request from a phone that may background the app or drop the connection.
|
||||||
|
// Durable variant: POST /tts/speech/jobs returns a job id immediately, the
|
||||||
|
// synth runs in a spawned task (queuing on TTS_PERMIT instead of fast-failing
|
||||||
|
// 429), and the client polls GET /tts/speech/jobs/{id} until it collects the
|
||||||
|
// audio. State is in-memory only (deliberately lighter than the chat
|
||||||
|
// TurnRegistry): a restart loses jobs, the client surfaces that and retries.
|
||||||
|
|
||||||
|
#[derive(Clone, Copy, PartialEq, Eq, Debug, Serialize)]
|
||||||
|
#[serde(rename_all = "snake_case")]
|
||||||
|
pub enum TtsJobStatus {
|
||||||
|
Queued,
|
||||||
|
Running,
|
||||||
|
Done,
|
||||||
|
Error,
|
||||||
|
Cancelled,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl TtsJobStatus {
|
||||||
|
fn is_terminal(self) -> bool {
|
||||||
|
matches!(self, Self::Done | Self::Error | Self::Cancelled)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Bookkeeping for one async speech job held in the in-memory job registry.
struct TtsJob {
    /// Current lifecycle state.
    status: TtsJobStatus,
    /// Audio format recorded at dispatch and echoed back when the job is polled.
    format: String,
    /// Base64-encoded audio; populated when the job finishes as `Done`.
    audio_base64: Option<String>,
    /// Failure message; populated when the job finishes as `Error`.
    error: Option<String>,
    /// Dispatch time; the sweep compares it against `TTS_JOB_MAX_AGE`.
    created_at: Instant,
    /// When the job reached a terminal state; the sweep compares it against
    /// `TTS_JOB_RESULT_TTL`.
    finished_at: Option<Instant>,
    /// Abort handle for the spawned synth task; cleared once the job is
    /// finished or cancelled.
    abort: Option<tokio::task::AbortHandle>,
}
|
||||||
|
|
||||||
|
/// Finished jobs linger so a client that lost connectivity can still collect
|
||||||
|
/// the result on a later poll; anything older than MAX_AGE is dropped outright
|
||||||
|
/// (aborted first if somehow still running). Swept lazily on each dispatch.
|
||||||
|
const TTS_JOB_RESULT_TTL: Duration = Duration::from_secs(10 * 60);
|
||||||
|
const TTS_JOB_MAX_AGE: Duration = Duration::from_secs(30 * 60);
|
||||||
|
|
||||||
|
static TTS_JOBS: LazyLock<StdMutex<HashMap<Uuid, TtsJob>>> =
|
||||||
|
LazyLock::new(|| StdMutex::new(HashMap::new()));
|
||||||
|
|
||||||
|
fn sweep_stale_jobs(jobs: &mut HashMap<Uuid, TtsJob>, now: Instant) {
|
||||||
|
jobs.retain(|_, job| {
|
||||||
|
let result_expired = job
|
||||||
|
.finished_at
|
||||||
|
.is_some_and(|t| now.duration_since(t) >= TTS_JOB_RESULT_TTL);
|
||||||
|
let too_old = now.duration_since(job.created_at) >= TTS_JOB_MAX_AGE;
|
||||||
|
if too_old && let Some(h) = job.abort.take() {
|
||||||
|
h.abort();
|
||||||
|
}
|
||||||
|
!(result_expired || too_old)
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Run `f` against a job, if it still exists.
|
||||||
|
fn with_job<R>(id: Uuid, f: impl FnOnce(&mut TtsJob) -> R) -> Option<R> {
|
||||||
|
TTS_JOBS.lock().unwrap().get_mut(&id).map(f)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Move a job to a terminal state (first terminal write wins — a cancel that
|
||||||
|
/// raced a completion keeps the cancel).
|
||||||
|
fn finish_job(id: Uuid, status: TtsJobStatus, audio_base64: Option<String>, error: Option<String>) {
|
||||||
|
with_job(id, |job| {
|
||||||
|
if job.status.is_terminal() {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
job.status = status;
|
||||||
|
job.audio_base64 = audio_base64;
|
||||||
|
job.error = error;
|
||||||
|
job.finished_at = Some(Instant::now());
|
||||||
|
job.abort = None;
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
/// Sanitize a user-supplied voice name. The name is forwarded to Chatterbox
|
/// Sanitize a user-supplied voice name. The name is forwarded to Chatterbox
|
||||||
/// where it becomes a filename in the voice-library directory, so we restrict
|
/// where it becomes a filename in the voice-library directory, so we restrict
|
||||||
/// it to a safe charset (alphanumerics, dash, underscore) — no path
|
/// it to a safe charset (alphanumerics, dash, underscore) — no path
|
||||||
@@ -64,6 +166,33 @@ fn sanitize_voice_name(raw: &str) -> Option<String> {
|
|||||||
Some(cleaned.chars().take(64).collect())
|
Some(cleaned.chars().take(64).collect())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Cap, in seconds, on the reference clip used for voice cloning. Chatterbox
/// is zero-shot — a clean ~10–20s sample is the sweet spot and more rarely
/// helps. Read from `LLAMA_SWAP_TTS_REF_SECONDS`; falls back to 30 when the
/// variable is unset, not a valid `u32`, or zero.
fn tts_ref_seconds() -> u32 {
    const DEFAULT_REF_SECONDS: u32 = 30;
    match std::env::var("LLAMA_SWAP_TTS_REF_SECONDS") {
        Ok(raw) => match raw.trim().parse::<u32>() {
            Ok(secs) if secs > 0 => secs,
            _ => DEFAULT_REF_SECONDS,
        },
        Err(_) => DEFAULT_REF_SECONDS,
    }
}
|
||||||
|
|
||||||
|
/// Tag a (sanitized) voice name with the reference-clip cap used to create it,
/// e.g. `grandma` → `grandma-30s`. The tag makes the ref length visible in the
/// voice list so clones of the same source at different caps can be compared.
///
/// * A name that already ends in the same tag and fits the bound is returned
///   unchanged; a different cap still appends (`grandma-15s` → `grandma-15s-30s`).
/// * The result never exceeds 64 characters: the base name is truncated, never
///   the tag. This also holds for an over-long name that already carries the
///   tag — its stem is truncated and the tag re-attached.
/// * Trailing dashes left on the base are trimmed before the tag is attached.
fn append_ref_seconds(name: &str, secs: u32) -> String {
    const MAX_NAME_CHARS: usize = 64;

    let suffix = format!("-{secs}s");
    let stem = match name.strip_suffix(suffix.as_str()) {
        // Already tagged and within the bound: nothing to do.
        Some(_) if name.chars().count() <= MAX_NAME_CHARS => return name.to_string(),
        // Already tagged but too long: re-tag the truncated stem so the
        // 64-char bound holds on this path too.
        Some(stem) => stem,
        None => name,
    };

    // The suffix is pure ASCII, so its byte length equals its char count.
    let max_base = MAX_NAME_CHARS.saturating_sub(suffix.len());
    let base: String = stem.chars().take(max_base).collect();
    let base = base.trim_end_matches('-');
    format!("{base}{suffix}")
}
|
||||||
|
|
||||||
/// Optional default voice for synthesis when the request doesn't name one.
|
/// Optional default voice for synthesis when the request doesn't name one.
|
||||||
/// Set `LLAMA_SWAP_TTS_VOICE=m` to read insights in a cloned voice by default.
|
/// Set `LLAMA_SWAP_TTS_VOICE=m` to read insights in a cloned voice by default.
|
||||||
fn default_voice() -> Option<String> {
|
fn default_voice() -> Option<String> {
|
||||||
@@ -137,15 +266,9 @@ async fn run_ffmpeg_to_wav(input_path: &str) -> anyhow::Result<Vec<u8>> {
|
|||||||
.context("creating temp wav")?;
|
.context("creating temp wav")?;
|
||||||
let out_s = out.path().to_string_lossy().to_string();
|
let out_s = out.path().to_string_lossy().to_string();
|
||||||
|
|
||||||
// Cap the reference clip length. Chatterbox is zero-shot — a clean ~10–20s
|
// Cap the reference clip length — we use the first N seconds (see
|
||||||
// sample is the sweet spot and more rarely helps — so we use the first N
|
// tts_ref_seconds).
|
||||||
// seconds. Tune via LLAMA_SWAP_TTS_REF_SECONDS (default 30).
|
let secs = tts_ref_seconds().to_string();
|
||||||
let secs = std::env::var("LLAMA_SWAP_TTS_REF_SECONDS")
|
|
||||||
.ok()
|
|
||||||
.and_then(|s| s.trim().parse::<u32>().ok())
|
|
||||||
.filter(|n| *n > 0)
|
|
||||||
.unwrap_or(30)
|
|
||||||
.to_string();
|
|
||||||
|
|
||||||
let output = tokio::process::Command::new("ffmpeg")
|
let output = tokio::process::Command::new("ffmpeg")
|
||||||
.args([
|
.args([
|
||||||
@@ -276,16 +399,277 @@ pub async fn tts_speech_handler(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// GET /tts/voices — list the Chatterbox voice library (raw passthrough).
|
/// Body of the `202 Accepted` reply to `POST /tts/speech/jobs`.
#[derive(Debug, Serialize)]
pub struct TtsJobCreatedResponse {
    /// Id of the newly created job, as a UUID string; used to poll or cancel it.
    pub job_id: String,
    /// State reported at dispatch time (`queued`).
    pub status: TtsJobStatus,
}
|
||||||
|
|
||||||
|
/// Body of the reply to `GET /tts/speech/jobs/{id}`.
#[derive(Debug, Serialize)]
pub struct TtsJobStatusResponse {
    /// Id of the polled job, as a UUID string.
    pub job_id: String,
    /// Current lifecycle state of the job.
    pub status: TtsJobStatus,
    /// Audio format recorded for the job at dispatch.
    pub format: String,
    /// Base64-encoded audio; omitted from the JSON while absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub audio_base64: Option<String>,
    /// Failure message; omitted from the JSON while absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub error: Option<String>,
}
|
||||||
|
|
||||||
|
/// POST /tts/speech/jobs — durable variant of /tts/speech for long syntheses.
|
||||||
|
/// Returns 202 + a job id immediately; the synth queues on the single GPU
|
||||||
|
/// permit (instead of fast-failing 429) and the client polls the job until
|
||||||
|
/// the audio is ready.
|
||||||
|
#[post("/tts/speech/jobs")]
|
||||||
|
pub async fn create_speech_job_handler(
|
||||||
|
http_request: HttpRequest,
|
||||||
|
_claims: Claims,
|
||||||
|
req: web::Json<TtsSpeechRequest>,
|
||||||
|
app_state: web::Data<AppState>,
|
||||||
|
) -> impl Responder {
|
||||||
|
let parent_context = extract_context_from_request(&http_request);
|
||||||
|
let mut span =
|
||||||
|
global_tracer().start_with_context("http.tts.speech_job.create", &parent_context);
|
||||||
|
|
||||||
|
let text = clean_for_tts(&req.text);
|
||||||
|
if text.is_empty() {
|
||||||
|
span.set_status(Status::error("text is required"));
|
||||||
|
return HttpResponse::BadRequest().json(json!({ "error": "text is required" }));
|
||||||
|
}
|
||||||
|
if app_state.llamacpp.is_none() {
|
||||||
|
span.set_status(Status::error("tts backend not configured"));
|
||||||
|
return HttpResponse::ServiceUnavailable()
|
||||||
|
.json(json!({ "error": "TTS backend not configured (set LLAMA_SWAP_URL)" }));
|
||||||
|
}
|
||||||
|
|
||||||
|
let format = req
|
||||||
|
.format
|
||||||
|
.as_deref()
|
||||||
|
.filter(|s| !s.is_empty())
|
||||||
|
.unwrap_or("mp3")
|
||||||
|
.to_string();
|
||||||
|
let voice = req
|
||||||
|
.voice
|
||||||
|
.clone()
|
||||||
|
.filter(|s| !s.is_empty())
|
||||||
|
.or_else(default_voice);
|
||||||
|
// Clamp generation knobs to Chatterbox's documented ranges before forwarding.
|
||||||
|
let exaggeration = req.exaggeration.map(|x| x.clamp(0.25, 2.0));
|
||||||
|
let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0));
|
||||||
|
let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0));
|
||||||
|
|
||||||
|
span.set_attribute(KeyValue::new("tts.format", format.clone()));
|
||||||
|
span.set_attribute(KeyValue::new("tts.has_voice", voice.is_some()));
|
||||||
|
span.set_attribute(KeyValue::new("tts.text_len", text.len() as i64));
|
||||||
|
|
||||||
|
let job_id = Uuid::new_v4();
|
||||||
|
{
|
||||||
|
let mut jobs = TTS_JOBS.lock().unwrap();
|
||||||
|
sweep_stale_jobs(&mut jobs, Instant::now());
|
||||||
|
jobs.insert(
|
||||||
|
job_id,
|
||||||
|
TtsJob {
|
||||||
|
status: TtsJobStatus::Queued,
|
||||||
|
format: format.clone(),
|
||||||
|
audio_base64: None,
|
||||||
|
error: None,
|
||||||
|
created_at: Instant::now(),
|
||||||
|
finished_at: None,
|
||||||
|
abort: None,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
let state = app_state.clone();
|
||||||
|
let handle = tokio::spawn(async move {
|
||||||
|
// Queue rather than fast-fail: jobs wait their turn for the GPU.
|
||||||
|
let _permit = match TTS_PERMIT.acquire().await {
|
||||||
|
Ok(p) => p,
|
||||||
|
Err(_) => {
|
||||||
|
finish_job(
|
||||||
|
job_id,
|
||||||
|
TtsJobStatus::Error,
|
||||||
|
None,
|
||||||
|
Some("TTS queue closed".to_string()),
|
||||||
|
);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
// Cancelled while queued — release the permit without synthesizing.
|
||||||
|
let cancelled = with_job(job_id, |job| {
|
||||||
|
if job.status == TtsJobStatus::Queued {
|
||||||
|
job.status = TtsJobStatus::Running;
|
||||||
|
false
|
||||||
|
} else {
|
||||||
|
true
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.unwrap_or(true);
|
||||||
|
if cancelled {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
let Some(client) = state.llamacpp.as_ref() else {
|
||||||
|
finish_job(
|
||||||
|
job_id,
|
||||||
|
TtsJobStatus::Error,
|
||||||
|
None,
|
||||||
|
Some("TTS backend not configured".to_string()),
|
||||||
|
);
|
||||||
|
return;
|
||||||
|
};
|
||||||
|
match client
|
||||||
|
.text_to_speech(
|
||||||
|
&text,
|
||||||
|
voice.as_deref(),
|
||||||
|
&format,
|
||||||
|
exaggeration,
|
||||||
|
cfg_weight,
|
||||||
|
temperature,
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(bytes) => {
|
||||||
|
let audio = base64::engine::general_purpose::STANDARD.encode(&bytes);
|
||||||
|
finish_job(job_id, TtsJobStatus::Done, Some(audio), None);
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
log::error!("TTS job {job_id} failed: {:?}", e);
|
||||||
|
finish_job(
|
||||||
|
job_id,
|
||||||
|
TtsJobStatus::Error,
|
||||||
|
None,
|
||||||
|
Some(format!("TTS failed: {e}")),
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
// Aborting an already-finished task is a no-op, so this late install is
|
||||||
|
// safe even if the job raced to completion.
|
||||||
|
with_job(job_id, |job| {
|
||||||
|
if !job.status.is_terminal() {
|
||||||
|
job.abort = Some(handle.abort_handle());
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
span.set_status(Status::Ok);
|
||||||
|
HttpResponse::Accepted().json(TtsJobCreatedResponse {
|
||||||
|
job_id: job_id.to_string(),
|
||||||
|
status: TtsJobStatus::Queued,
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
/// GET /tts/speech/jobs/{id} — poll a speech job; returns the audio once done.
|
||||||
|
/// 404s after the job expires (results are kept ~10 min past completion).
|
||||||
|
#[get("/tts/speech/jobs/{id}")]
|
||||||
|
pub async fn speech_job_status_handler(
|
||||||
|
http_request: HttpRequest,
|
||||||
|
_claims: Claims,
|
||||||
|
path: web::Path<String>,
|
||||||
|
) -> impl Responder {
|
||||||
|
let parent_context = extract_context_from_request(&http_request);
|
||||||
|
let mut span =
|
||||||
|
global_tracer().start_with_context("http.tts.speech_job.status", &parent_context);
|
||||||
|
|
||||||
|
let Ok(id) = Uuid::parse_str(&path.into_inner()) else {
|
||||||
|
span.set_status(Status::error("invalid job id"));
|
||||||
|
return HttpResponse::BadRequest().json(json!({ "error": "invalid job id" }));
|
||||||
|
};
|
||||||
|
let resp = {
|
||||||
|
let jobs = TTS_JOBS.lock().unwrap();
|
||||||
|
jobs.get(&id).map(|job| TtsJobStatusResponse {
|
||||||
|
job_id: id.to_string(),
|
||||||
|
status: job.status,
|
||||||
|
format: job.format.clone(),
|
||||||
|
audio_base64: job.audio_base64.clone(),
|
||||||
|
error: job.error.clone(),
|
||||||
|
})
|
||||||
|
};
|
||||||
|
match resp {
|
||||||
|
Some(r) => {
|
||||||
|
span.set_status(Status::Ok);
|
||||||
|
HttpResponse::Ok().json(r)
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
span.set_status(Status::error("job not found"));
|
||||||
|
HttpResponse::NotFound()
|
||||||
|
.json(json!({ "error": "TTS job not found (it may have expired)" }))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// DELETE /tts/speech/jobs/{id} — cancel a queued/running speech job. Once the
|
||||||
|
/// upstream GPU job has started it can't be interrupted (same wrapper
|
||||||
|
/// limitation as the sync path); cancelling stops the wait and discards the
|
||||||
|
/// result. Cancelling an already-finished job leaves it terminal.
|
||||||
|
#[delete("/tts/speech/jobs/{id}")]
|
||||||
|
pub async fn cancel_speech_job_handler(
|
||||||
|
http_request: HttpRequest,
|
||||||
|
_claims: Claims,
|
||||||
|
path: web::Path<String>,
|
||||||
|
) -> impl Responder {
|
||||||
|
let parent_context = extract_context_from_request(&http_request);
|
||||||
|
let mut span =
|
||||||
|
global_tracer().start_with_context("http.tts.speech_job.cancel", &parent_context);
|
||||||
|
|
||||||
|
let Ok(id) = Uuid::parse_str(&path.into_inner()) else {
|
||||||
|
span.set_status(Status::error("invalid job id"));
|
||||||
|
return HttpResponse::BadRequest().json(json!({ "error": "invalid job id" }));
|
||||||
|
};
|
||||||
|
let status = with_job(id, |job| {
|
||||||
|
if !job.status.is_terminal() {
|
||||||
|
if let Some(h) = job.abort.take() {
|
||||||
|
h.abort();
|
||||||
|
}
|
||||||
|
job.status = TtsJobStatus::Cancelled;
|
||||||
|
job.finished_at = Some(Instant::now());
|
||||||
|
}
|
||||||
|
job.status
|
||||||
|
});
|
||||||
|
match status {
|
||||||
|
Some(s) => {
|
||||||
|
span.set_status(Status::Ok);
|
||||||
|
HttpResponse::Ok().json(json!({ "job_id": id.to_string(), "status": s }))
|
||||||
|
}
|
||||||
|
None => {
|
||||||
|
span.set_status(Status::error("job not found"));
|
||||||
|
HttpResponse::NotFound()
|
||||||
|
.json(json!({ "error": "TTS job not found (it may have expired)" }))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Query-string parameters accepted by `GET /tts/voices`.
#[derive(Debug, Deserialize)]
pub struct ListVoicesQuery {
    /// `?refresh=1` bypasses the voice-list cache and re-queries upstream
    /// (which may spin up the TTS model). The handler also accepts `true` and
    /// `yes`; any other value, or omitting the parameter, leaves the cache in
    /// use.
    #[serde(default)]
    pub refresh: Option<String>,
}
|
||||||
|
|
||||||
|
/// GET /tts/voices — list the Chatterbox voice library. Served from an
|
||||||
|
/// in-memory cache when possible so browsing settings doesn't make llama-swap
|
||||||
|
/// load the TTS model (and evict the resident LLM); see VOICES_CACHE.
|
||||||
#[get("/tts/voices")]
|
#[get("/tts/voices")]
|
||||||
pub async fn list_voices_handler(
|
pub async fn list_voices_handler(
|
||||||
http_request: HttpRequest,
|
http_request: HttpRequest,
|
||||||
_claims: Claims,
|
_claims: Claims,
|
||||||
|
query: web::Query<ListVoicesQuery>,
|
||||||
app_state: web::Data<AppState>,
|
app_state: web::Data<AppState>,
|
||||||
) -> impl Responder {
|
) -> impl Responder {
|
||||||
let parent_context = extract_context_from_request(&http_request);
|
let parent_context = extract_context_from_request(&http_request);
|
||||||
let mut span = global_tracer().start_with_context("http.tts.voices.list", &parent_context);
|
let mut span = global_tracer().start_with_context("http.tts.voices.list", &parent_context);
|
||||||
|
|
||||||
|
let force = query
|
||||||
|
.refresh
|
||||||
|
.as_deref()
|
||||||
|
.is_some_and(|v| matches!(v, "1" | "true" | "yes"));
|
||||||
|
if !force && let Some(v) = cached_voices() {
|
||||||
|
span.set_attribute(KeyValue::new("tts.voices_cache_hit", true));
|
||||||
|
span.set_status(Status::Ok);
|
||||||
|
return HttpResponse::Ok().json(v);
|
||||||
|
}
|
||||||
|
|
||||||
let Some(client) = app_state.llamacpp.as_ref() else {
|
let Some(client) = app_state.llamacpp.as_ref() else {
|
||||||
span.set_status(Status::error("tts backend not configured"));
|
span.set_status(Status::error("tts backend not configured"));
|
||||||
return HttpResponse::ServiceUnavailable()
|
return HttpResponse::ServiceUnavailable()
|
||||||
@@ -293,6 +677,8 @@ pub async fn list_voices_handler(
|
|||||||
};
|
};
|
||||||
match client.list_voices().await {
|
match client.list_voices().await {
|
||||||
Ok(v) => {
|
Ok(v) => {
|
||||||
|
store_voices_cache(&v);
|
||||||
|
span.set_attribute(KeyValue::new("tts.voices_cache_hit", false));
|
||||||
span.set_status(Status::Ok);
|
span.set_status(Status::Ok);
|
||||||
HttpResponse::Ok().json(v)
|
HttpResponse::Ok().json(v)
|
||||||
}
|
}
|
||||||
@@ -304,6 +690,48 @@ pub async fn list_voices_handler(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// DELETE /tts/voices/{name} — remove a cloned voice from the library.
|
||||||
|
#[delete("/tts/voices/{name}")]
|
||||||
|
pub async fn delete_voice_handler(
|
||||||
|
http_request: HttpRequest,
|
||||||
|
_claims: Claims,
|
||||||
|
path: web::Path<String>,
|
||||||
|
app_state: web::Data<AppState>,
|
||||||
|
) -> impl Responder {
|
||||||
|
let parent_context = extract_context_from_request(&http_request);
|
||||||
|
let mut span = global_tracer().start_with_context("http.tts.voices.delete", &parent_context);
|
||||||
|
|
||||||
|
let Some(client) = app_state.llamacpp.as_ref() else {
|
||||||
|
span.set_status(Status::error("tts backend not configured"));
|
||||||
|
return HttpResponse::ServiceUnavailable()
|
||||||
|
.json(json!({ "error": "TTS backend not configured" }));
|
||||||
|
};
|
||||||
|
// Same charset rule as creation — a name that sanitizes differently was
|
||||||
|
// never a voice we created, and must not reach the upstream URL.
|
||||||
|
let raw = path.into_inner();
|
||||||
|
let name = match sanitize_voice_name(&raw) {
|
||||||
|
Some(n) if n == raw => n,
|
||||||
|
_ => {
|
||||||
|
span.set_status(Status::error("invalid voice name"));
|
||||||
|
return HttpResponse::BadRequest().json(json!({ "error": "invalid voice name" }));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
span.set_attribute(KeyValue::new("tts.voice_name", name.clone()));
|
||||||
|
|
||||||
|
match client.delete_voice(&name).await {
|
||||||
|
Ok(v) => {
|
||||||
|
invalidate_voices_cache();
|
||||||
|
span.set_status(Status::Ok);
|
||||||
|
HttpResponse::Ok().json(v)
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
span.set_status(Status::error("delete_voice failed"));
|
||||||
|
log::error!("delete_voice failed: {:?}", e);
|
||||||
|
HttpResponse::BadGateway().json(json!({ "error": format!("{e}") }))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// POST /tts/voices/upload — register a cloned voice from an uploaded audio
|
/// POST /tts/voices/upload — register a cloned voice from an uploaded audio
|
||||||
/// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`).
|
/// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`).
|
||||||
#[post("/tts/voices/upload")]
|
#[post("/tts/voices/upload")]
|
||||||
@@ -363,6 +791,9 @@ pub async fn create_voice_upload_handler(
|
|||||||
return HttpResponse::BadRequest()
|
return HttpResponse::BadRequest()
|
||||||
.json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
|
.json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
|
||||||
};
|
};
|
||||||
|
// Tag the name with the ref-clip cap (e.g. `grandma-30s`) so the library
|
||||||
|
// shows which reference length produced each clone.
|
||||||
|
let name = append_ref_seconds(&name, tts_ref_seconds());
|
||||||
if file_bytes.is_empty() {
|
if file_bytes.is_empty() {
|
||||||
span.set_status(Status::error("voice_file is required"));
|
span.set_status(Status::error("voice_file is required"));
|
||||||
return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" }));
|
return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" }));
|
||||||
@@ -388,6 +819,7 @@ pub async fn create_voice_upload_handler(
|
|||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
Ok(v) => {
|
Ok(v) => {
|
||||||
|
invalidate_voices_cache();
|
||||||
span.set_status(Status::Ok);
|
span.set_status(Status::Ok);
|
||||||
HttpResponse::Ok().json(v)
|
HttpResponse::Ok().json(v)
|
||||||
}
|
}
|
||||||
@@ -432,6 +864,9 @@ pub async fn create_voice_from_library_handler(
|
|||||||
return HttpResponse::BadRequest()
|
return HttpResponse::BadRequest()
|
||||||
.json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
|
.json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
|
||||||
};
|
};
|
||||||
|
// Tag the name with the ref-clip cap (e.g. `grandma-30s`) so the library
|
||||||
|
// shows which reference length produced each clone.
|
||||||
|
let voice_name = append_ref_seconds(&voice_name, tts_ref_seconds());
|
||||||
|
|
||||||
let library = match libraries::resolve_library_param(&app_state, req.library.as_deref()) {
|
let library = match libraries::resolve_library_param(&app_state, req.library.as_deref()) {
|
||||||
Ok(Some(l)) => l,
|
Ok(Some(l)) => l,
|
||||||
@@ -475,6 +910,7 @@ pub async fn create_voice_from_library_handler(
|
|||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
Ok(v) => {
|
Ok(v) => {
|
||||||
|
invalidate_voices_cache();
|
||||||
span.set_status(Status::Ok);
|
span.set_status(Status::Ok);
|
||||||
HttpResponse::Ok().json(v)
|
HttpResponse::Ok().json(v)
|
||||||
}
|
}
|
||||||
@@ -534,6 +970,95 @@ mod tests {
|
|||||||
assert_eq!(sanitize_voice_name(&long).unwrap().len(), 64);
|
assert_eq!(sanitize_voice_name(&long).unwrap().len(), 64);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn append_ref_seconds_tags_name() {
|
||||||
|
assert_eq!(append_ref_seconds("grandma", 30), "grandma-30s");
|
||||||
|
assert_eq!(append_ref_seconds("voice_01", 15), "voice_01-15s");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn append_ref_seconds_is_idempotent_for_same_cap() {
|
||||||
|
assert_eq!(append_ref_seconds("grandma-30s", 30), "grandma-30s");
|
||||||
|
// A different cap still appends — that's the comparison use-case.
|
||||||
|
assert_eq!(append_ref_seconds("grandma-15s", 30), "grandma-15s-30s");
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn append_ref_seconds_keeps_64_char_bound() {
|
||||||
|
let long = "a".repeat(64);
|
||||||
|
let tagged = append_ref_seconds(&long, 30);
|
||||||
|
assert_eq!(tagged.len(), 64);
|
||||||
|
assert!(tagged.ends_with("-30s"));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn sweep_drops_expired_results_and_keeps_live_jobs() {
|
||||||
|
let now = Instant::now();
|
||||||
|
let mk = |status: TtsJobStatus, created: Instant, finished: Option<Instant>| TtsJob {
|
||||||
|
status,
|
||||||
|
format: "mp3".into(),
|
||||||
|
audio_base64: None,
|
||||||
|
error: None,
|
||||||
|
created_at: created,
|
||||||
|
finished_at: finished,
|
||||||
|
abort: None,
|
||||||
|
};
|
||||||
|
let mut jobs = HashMap::new();
|
||||||
|
let live = Uuid::new_v4();
|
||||||
|
let fresh_done = Uuid::new_v4();
|
||||||
|
let stale_done = Uuid::new_v4();
|
||||||
|
jobs.insert(live, mk(TtsJobStatus::Running, now, None));
|
||||||
|
jobs.insert(
|
||||||
|
fresh_done,
|
||||||
|
mk(TtsJobStatus::Done, now, Some(now - Duration::from_secs(60))),
|
||||||
|
);
|
||||||
|
jobs.insert(
|
||||||
|
stale_done,
|
||||||
|
mk(
|
||||||
|
TtsJobStatus::Done,
|
||||||
|
now - TTS_JOB_MAX_AGE / 2,
|
||||||
|
Some(now - TTS_JOB_RESULT_TTL),
|
||||||
|
),
|
||||||
|
);
|
||||||
|
|
||||||
|
sweep_stale_jobs(&mut jobs, now);
|
||||||
|
assert!(jobs.contains_key(&live));
|
||||||
|
assert!(jobs.contains_key(&fresh_done));
|
||||||
|
assert!(!jobs.contains_key(&stale_done));
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn sweep_drops_jobs_past_max_age_even_if_unfinished() {
|
||||||
|
let now = Instant::now();
|
||||||
|
let mut jobs = HashMap::new();
|
||||||
|
let ancient = Uuid::new_v4();
|
||||||
|
jobs.insert(
|
||||||
|
ancient,
|
||||||
|
TtsJob {
|
||||||
|
status: TtsJobStatus::Running,
|
||||||
|
format: "mp3".into(),
|
||||||
|
audio_base64: None,
|
||||||
|
error: None,
|
||||||
|
created_at: now - TTS_JOB_MAX_AGE,
|
||||||
|
finished_at: None,
|
||||||
|
abort: None,
|
||||||
|
},
|
||||||
|
);
|
||||||
|
sweep_stale_jobs(&mut jobs, now);
|
||||||
|
assert!(jobs.is_empty());
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn voices_cache_roundtrip_and_invalidation() {
|
||||||
|
invalidate_voices_cache();
|
||||||
|
assert!(cached_voices().is_none());
|
||||||
|
let v = json!({ "voices": [{ "name": "m-30s" }], "count": 1 });
|
||||||
|
store_voices_cache(&v);
|
||||||
|
assert_eq!(cached_voices(), Some(v));
|
||||||
|
invalidate_voices_cache();
|
||||||
|
assert!(cached_voices().is_none());
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn clean_for_tts_strips_markdown() {
|
fn clean_for_tts_strips_markdown() {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
|
|||||||
@@ -364,9 +364,13 @@ fn main() -> std::io::Result<()> {
|
|||||||
.service(ai::rate_insight_handler)
|
.service(ai::rate_insight_handler)
|
||||||
.service(ai::export_training_data_handler)
|
.service(ai::export_training_data_handler)
|
||||||
.service(ai::tts_speech_handler)
|
.service(ai::tts_speech_handler)
|
||||||
|
.service(ai::create_speech_job_handler)
|
||||||
|
.service(ai::speech_job_status_handler)
|
||||||
|
.service(ai::cancel_speech_job_handler)
|
||||||
.service(ai::list_voices_handler)
|
.service(ai::list_voices_handler)
|
||||||
.service(ai::create_voice_upload_handler)
|
.service(ai::create_voice_upload_handler)
|
||||||
.service(ai::create_voice_from_library_handler)
|
.service(ai::create_voice_from_library_handler)
|
||||||
|
.service(ai::delete_voice_handler)
|
||||||
.service(libraries::list_libraries)
|
.service(libraries::list_libraries)
|
||||||
.service(libraries::patch_library)
|
.service(libraries::patch_library)
|
||||||
.add_feature(add_tag_services::<_, SqliteTagDao>)
|
.add_feature(add_tag_services::<_, SqliteTagDao>)
|
||||||
|
|||||||
Reference in New Issue
Block a user