From 03699f7413596ebe9f8321b4ab511355af05bf79 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Wed, 10 Jun 2026 17:36:15 -0400 Subject: [PATCH] Add TTS voice deletion, async speech jobs, voice-list cache, ref-seconds name tags MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - DELETE /tts/voices/{name}: remove a cloned voice via the llama-swap passthrough (upstream chatterbox-tts-api exposes DELETE /voices/{name}). - POST/GET/DELETE /tts/speech/jobs: durable job flow for long syntheses — dispatch returns 202 + job id, the synth queues on the GPU permit instead of fast-failing 429, and clients poll for the result (kept ~10 min). - GET /tts/voices now serves an in-memory cache so listing voices doesn't make llama-swap spin up the TTS model (evicting the resident LLM); invalidated on create/delete, ?refresh=1 forces an upstream re-query. - Created voice names are tagged with LLAMA_SWAP_TTS_REF_SECONDS (e.g. grandma-30s) so the library shows which ref length produced each clone. Co-Authored-By: Claude Fable 5 --- README.md | 16 +- src/ai/llamacpp.rs | 28 +++ src/ai/mod.rs | 5 +- src/ai/tts.rs | 551 +++++++++++++++++++++++++++++++++++++++++++-- src/main.rs | 4 + 5 files changed, 588 insertions(+), 16 deletions(-) diff --git a/README.md b/README.md index 39ebe30..f355e32 100644 --- a/README.md +++ b/README.md @@ -156,12 +156,26 @@ is built whenever that's set — independent of `LLM_BACKEND`). Endpoints: server-side (markdown + emoji stripped) and the generation knobs are clamped to Chatterbox's ranges. Synthesis is serialized (one at a time — the upstream has no GPU lock of its own); a concurrent request gets a fast `429`. -- `GET /tts/voices` — list the voice library. +- `POST /tts/speech/jobs` — durable variant for long syntheses: same body as + `/tts/speech`, returns `202 { job_id, status }` immediately. Jobs queue on the + GPU permit instead of fast-failing `429`. 
+- `GET /tts/speech/jobs/{id}` — poll a job: `{ job_id, status, format, + audio_base64?, error? }` with status `queued|running|done|error|cancelled`. + Results are kept in memory ~10 min after completion, then the job 404s. +- `DELETE /tts/speech/jobs/{id}` — cancel a queued/running job. +- `GET /tts/voices` — list the voice library. Served from an in-memory cache + (so the listing doesn't make llama-swap spin up the TTS model and evict the + resident LLM); pass `?refresh=1` to force an upstream re-query. The cache is + invalidated by voice create/delete. - `POST /tts/voices/upload` — multipart `voice_name` + `voice_file`; clone a voice from an uploaded clip (≤25 MB). - `POST /tts/voices/from-library` — body `{ voice_name, path, library? }`; clone from a library file (audio forwarded as-is; video has its audio extracted via ffmpeg). +- `DELETE /tts/voices/{name}` — remove a cloned voice from the library. + +Created voice names are tagged with the ref-clip cap in effect (e.g. +`grandma-30s`) so the library shows which reference length produced each clone. Env: - `LLAMA_SWAP_TTS_MODEL` - TTS model id in llama-swap's `config.yaml` [default: `chatterbox`] diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs index 6227e2f..820f5f8 100644 --- a/src/ai/llamacpp.rs +++ b/src/ai/llamacpp.rs @@ -253,6 +253,34 @@ impl LlamaCppClient { resp.json().await.context("parsing create_voice response") } + /// Delete a cloned voice from the Chatterbox voice library + /// (`DELETE /voices/{name}` on the upstream, via llama-swap passthrough). 
+ pub async fn delete_voice(&self, voice_name: &str) -> Result { + let url = format!( + "{}/upstream/{}/voices/{}", + self.swap_root(), + self.tts_model, + voice_name + ); + let resp = self + .client + .delete(&url) + .send() + .await + .with_context(|| format!("DELETE {} failed", url))?; + + if !resp.status().is_success() { + let status = resp.status(); + let text = resp.text().await.unwrap_or_default(); + bail!("llama-swap delete_voice failed: {} — {}", status, text); + } + // Some upstreams reply with an empty body on delete. + Ok(resp + .json() + .await + .unwrap_or_else(|_| json!({ "status": "deleted" }))) + } + /// Translate canonical messages to the OpenAI-compatible wire shape. /// Behaviorally identical to `OpenRouterClient::messages_to_openai` — /// stringify tool-call arguments, rewrite images into content-parts, attach diff --git a/src/ai/mod.rs b/src/ai/mod.rs index 40a3f21..e083e1d 100644 --- a/src/ai/mod.rs +++ b/src/ai/mod.rs @@ -37,8 +37,9 @@ pub use llm_client::{ pub use ollama::{EMBEDDING_MODEL, OllamaClient}; pub use sms_client::{SmsApiClient, SmsMessage}; pub use tts::{ - create_voice_from_library_handler, create_voice_upload_handler, list_voices_handler, - tts_speech_handler, + cancel_speech_job_handler, create_speech_job_handler, create_voice_from_library_handler, + create_voice_upload_handler, delete_voice_handler, list_voices_handler, + speech_job_status_handler, tts_speech_handler, }; /// Display name used for the user in message transcripts and first-person diff --git a/src/ai/tts.rs b/src/ai/tts.rs index b94be36..02cfc88 100644 --- a/src/ai/tts.rs +++ b/src/ai/tts.rs @@ -6,7 +6,7 @@ // (audio read directly; video has its audio track extracted via ffmpeg). 
use actix_multipart::Multipart; -use actix_web::{HttpRequest, HttpResponse, Responder, get, post, web}; +use actix_web::{HttpRequest, HttpResponse, Responder, delete, get, post, web}; use anyhow::Context; use base64::Engine; use bytes::{BufMut, BytesMut}; @@ -15,10 +15,13 @@ use opentelemetry::KeyValue; use opentelemetry::trace::{Span, Status, Tracer}; use regex::Regex; use serde::{Deserialize, Serialize}; -use serde_json::json; +use serde_json::{Value, json}; +use std::collections::HashMap; use std::path::Path; -use std::sync::LazyLock; +use std::sync::{LazyLock, Mutex as StdMutex}; +use std::time::{Duration, Instant}; use tokio::sync::Semaphore; +use uuid::Uuid; use crate::data::Claims; use crate::file_types::{is_audio_file, is_video_file}; @@ -40,6 +43,105 @@ const MAX_VOICE_UPLOAD_BYTES: usize = 25 * 1024 * 1024; // 25 MB /// finishes — that's a wrapper limitation; the chunked-queue plan fixes it.) static TTS_PERMIT: LazyLock = LazyLock::new(|| Semaphore::new(1)); +// --- Voice-list cache -------------------------------------------------------- + +/// Cached raw voice-library JSON. llama-swap's `/upstream//voices` +/// passthrough spins the TTS model up just to answer a listing — which can +/// evict the resident LLM — so we serve a cached copy and only hit upstream on +/// a cold cache, an explicit `?refresh=1`, or after a voice create/delete +/// invalidates it (the TTS model is already loaded right then anyway). 
+static VOICES_CACHE: LazyLock>> = LazyLock::new(|| StdMutex::new(None)); + +fn cached_voices() -> Option { + VOICES_CACHE.lock().unwrap().clone() +} + +fn store_voices_cache(v: &Value) { + *VOICES_CACHE.lock().unwrap() = Some(v.clone()); +} + +fn invalidate_voices_cache() { + *VOICES_CACHE.lock().unwrap() = None; +} + +// --- Async speech jobs ------------------------------------------------------- +// +// Synthesizing a long insight can take minutes — too long to hang one HTTP +// request from a phone that may background the app or drop the connection. +// Durable variant: POST /tts/speech/jobs returns a job id immediately, the +// synth runs in a spawned task (queuing on TTS_PERMIT instead of fast-failing +// 429), and the client polls GET /tts/speech/jobs/{id} until it collects the +// audio. State is in-memory only (deliberately lighter than the chat +// TurnRegistry): a restart loses jobs, the client surfaces that and retries. + +#[derive(Clone, Copy, PartialEq, Eq, Debug, Serialize)] +#[serde(rename_all = "snake_case")] +pub enum TtsJobStatus { + Queued, + Running, + Done, + Error, + Cancelled, +} + +impl TtsJobStatus { + fn is_terminal(self) -> bool { + matches!(self, Self::Done | Self::Error | Self::Cancelled) + } +} + +struct TtsJob { + status: TtsJobStatus, + format: String, + audio_base64: Option, + error: Option, + created_at: Instant, + finished_at: Option, + abort: Option, +} + +/// Finished jobs linger so a client that lost connectivity can still collect +/// the result on a later poll; anything older than MAX_AGE is dropped outright +/// (aborted first if somehow still running). Swept lazily on each dispatch. 
+const TTS_JOB_RESULT_TTL: Duration = Duration::from_secs(10 * 60); +const TTS_JOB_MAX_AGE: Duration = Duration::from_secs(30 * 60); + +static TTS_JOBS: LazyLock>> = + LazyLock::new(|| StdMutex::new(HashMap::new())); + +fn sweep_stale_jobs(jobs: &mut HashMap, now: Instant) { + jobs.retain(|_, job| { + let result_expired = job + .finished_at + .is_some_and(|t| now.duration_since(t) >= TTS_JOB_RESULT_TTL); + let too_old = now.duration_since(job.created_at) >= TTS_JOB_MAX_AGE; + if too_old && let Some(h) = job.abort.take() { + h.abort(); + } + !(result_expired || too_old) + }); +} + +/// Run `f` against a job, if it still exists. +fn with_job(id: Uuid, f: impl FnOnce(&mut TtsJob) -> R) -> Option { + TTS_JOBS.lock().unwrap().get_mut(&id).map(f) +} + +/// Move a job to a terminal state (first terminal write wins — a cancel that +/// raced a completion keeps the cancel). +fn finish_job(id: Uuid, status: TtsJobStatus, audio_base64: Option, error: Option) { + with_job(id, |job| { + if job.status.is_terminal() { + return; + } + job.status = status; + job.audio_base64 = audio_base64; + job.error = error; + job.finished_at = Some(Instant::now()); + job.abort = None; + }); +} + /// Sanitize a user-supplied voice name. The name is forwarded to Chatterbox /// where it becomes a filename in the voice-library directory, so we restrict /// it to a safe charset (alphanumerics, dash, underscore) — no path @@ -64,6 +166,33 @@ fn sanitize_voice_name(raw: &str) -> Option { Some(cleaned.chars().take(64).collect()) } +/// Reference-clip cap in seconds for voice cloning. Chatterbox is zero-shot — +/// a clean ~10–20s sample is the sweet spot and more rarely helps. Tune via +/// `LLAMA_SWAP_TTS_REF_SECONDS` (default 30). +fn tts_ref_seconds() -> u32 { + std::env::var("LLAMA_SWAP_TTS_REF_SECONDS") + .ok() + .and_then(|s| s.trim().parse::().ok()) + .filter(|n| *n > 0) + .unwrap_or(30) +} + +/// Tag a (sanitized) voice name with the reference-clip cap used to create it, +/// e.g. 
`grandma` → `grandma-30s`. The tag makes the ref length visible in the +/// voice list so clones of the same source at different caps can be compared. +/// Skips the append when the name already ends in the same tag; keeps the +/// 64-char bound by truncating the base name, never the tag. +fn append_ref_seconds(name: &str, secs: u32) -> String { + let suffix = format!("-{secs}s"); + if name.ends_with(&suffix) { + return name.to_string(); + } + let max_base = 64usize.saturating_sub(suffix.len()); + let base: String = name.chars().take(max_base).collect(); + let base = base.trim_end_matches('-'); + format!("{base}{suffix}") +} + /// Optional default voice for synthesis when the request doesn't name one. /// Set `LLAMA_SWAP_TTS_VOICE=m` to read insights in a cloned voice by default. fn default_voice() -> Option { @@ -137,15 +266,9 @@ async fn run_ffmpeg_to_wav(input_path: &str) -> anyhow::Result> { .context("creating temp wav")?; let out_s = out.path().to_string_lossy().to_string(); - // Cap the reference clip length. Chatterbox is zero-shot — a clean ~10–20s - // sample is the sweet spot and more rarely helps — so we use the first N - // seconds. Tune via LLAMA_SWAP_TTS_REF_SECONDS (default 30). - let secs = std::env::var("LLAMA_SWAP_TTS_REF_SECONDS") - .ok() - .and_then(|s| s.trim().parse::().ok()) - .filter(|n| *n > 0) - .unwrap_or(30) - .to_string(); + // Cap the reference clip length — we use the first N seconds (see + // tts_ref_seconds). + let secs = tts_ref_seconds().to_string(); let output = tokio::process::Command::new("ffmpeg") .args([ @@ -276,16 +399,277 @@ pub async fn tts_speech_handler( } } -/// GET /tts/voices — list the Chatterbox voice library (raw passthrough). 
+#[derive(Debug, Serialize)] +pub struct TtsJobCreatedResponse { + pub job_id: String, + pub status: TtsJobStatus, +} + +#[derive(Debug, Serialize)] +pub struct TtsJobStatusResponse { + pub job_id: String, + pub status: TtsJobStatus, + pub format: String, + #[serde(skip_serializing_if = "Option::is_none")] + pub audio_base64: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub error: Option, +} + +/// POST /tts/speech/jobs — durable variant of /tts/speech for long syntheses. +/// Returns 202 + a job id immediately; the synth queues on the single GPU +/// permit (instead of fast-failing 429) and the client polls the job until +/// the audio is ready. +#[post("/tts/speech/jobs")] +pub async fn create_speech_job_handler( + http_request: HttpRequest, + _claims: Claims, + req: web::Json, + app_state: web::Data, +) -> impl Responder { + let parent_context = extract_context_from_request(&http_request); + let mut span = + global_tracer().start_with_context("http.tts.speech_job.create", &parent_context); + + let text = clean_for_tts(&req.text); + if text.is_empty() { + span.set_status(Status::error("text is required")); + return HttpResponse::BadRequest().json(json!({ "error": "text is required" })); + } + if app_state.llamacpp.is_none() { + span.set_status(Status::error("tts backend not configured")); + return HttpResponse::ServiceUnavailable() + .json(json!({ "error": "TTS backend not configured (set LLAMA_SWAP_URL)" })); + } + + let format = req + .format + .as_deref() + .filter(|s| !s.is_empty()) + .unwrap_or("mp3") + .to_string(); + let voice = req + .voice + .clone() + .filter(|s| !s.is_empty()) + .or_else(default_voice); + // Clamp generation knobs to Chatterbox's documented ranges before forwarding. 
+ let exaggeration = req.exaggeration.map(|x| x.clamp(0.25, 2.0)); + let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0)); + let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0)); + + span.set_attribute(KeyValue::new("tts.format", format.clone())); + span.set_attribute(KeyValue::new("tts.has_voice", voice.is_some())); + span.set_attribute(KeyValue::new("tts.text_len", text.len() as i64)); + + let job_id = Uuid::new_v4(); + { + let mut jobs = TTS_JOBS.lock().unwrap(); + sweep_stale_jobs(&mut jobs, Instant::now()); + jobs.insert( + job_id, + TtsJob { + status: TtsJobStatus::Queued, + format: format.clone(), + audio_base64: None, + error: None, + created_at: Instant::now(), + finished_at: None, + abort: None, + }, + ); + } + + let state = app_state.clone(); + let handle = tokio::spawn(async move { + // Queue rather than fast-fail: jobs wait their turn for the GPU. + let _permit = match TTS_PERMIT.acquire().await { + Ok(p) => p, + Err(_) => { + finish_job( + job_id, + TtsJobStatus::Error, + None, + Some("TTS queue closed".to_string()), + ); + return; + } + }; + // Cancelled while queued — release the permit without synthesizing. 
+ let cancelled = with_job(job_id, |job| { + if job.status == TtsJobStatus::Queued { + job.status = TtsJobStatus::Running; + false + } else { + true + } + }) + .unwrap_or(true); + if cancelled { + return; + } + + let Some(client) = state.llamacpp.as_ref() else { + finish_job( + job_id, + TtsJobStatus::Error, + None, + Some("TTS backend not configured".to_string()), + ); + return; + }; + match client + .text_to_speech( + &text, + voice.as_deref(), + &format, + exaggeration, + cfg_weight, + temperature, + ) + .await + { + Ok(bytes) => { + let audio = base64::engine::general_purpose::STANDARD.encode(&bytes); + finish_job(job_id, TtsJobStatus::Done, Some(audio), None); + } + Err(e) => { + log::error!("TTS job {job_id} failed: {:?}", e); + finish_job( + job_id, + TtsJobStatus::Error, + None, + Some(format!("TTS failed: {e}")), + ); + } + } + }); + // Aborting an already-finished task is a no-op, so this late install is + // safe even if the job raced to completion. + with_job(job_id, |job| { + if !job.status.is_terminal() { + job.abort = Some(handle.abort_handle()); + } + }); + + span.set_status(Status::Ok); + HttpResponse::Accepted().json(TtsJobCreatedResponse { + job_id: job_id.to_string(), + status: TtsJobStatus::Queued, + }) +} + +/// GET /tts/speech/jobs/{id} — poll a speech job; returns the audio once done. +/// 404s after the job expires (results are kept ~10 min past completion). 
+#[get("/tts/speech/jobs/{id}")]
+pub async fn speech_job_status_handler(
+    http_request: HttpRequest,
+    _claims: Claims,
+    path: web::Path<String>,
+) -> impl Responder {
+    let parent_context = extract_context_from_request(&http_request);
+    let mut span =
+        global_tracer().start_with_context("http.tts.speech_job.status", &parent_context);
+
+    let Ok(id) = Uuid::parse_str(&path.into_inner()) else {
+        span.set_status(Status::error("invalid job id"));
+        return HttpResponse::BadRequest().json(json!({ "error": "invalid job id" }));
+    };
+    let mut jobs = TTS_JOBS.lock().unwrap();
+    sweep_stale_jobs(&mut jobs, Instant::now()); // expire on poll, not only on dispatch
+    let resp = jobs.get(&id).map(|job| TtsJobStatusResponse {
+        job_id: id.to_string(),
+        status: job.status,
+        format: job.format.clone(),
+        audio_base64: job.audio_base64.clone(),
+        error: job.error.clone(),
+    });
+    drop(jobs); // release the job registry before building the response
+    match resp {
+        Some(r) => {
+            span.set_status(Status::Ok);
+            HttpResponse::Ok().json(r)
+        }
+        None => {
+            span.set_status(Status::error("job not found"));
+            HttpResponse::NotFound()
+                .json(json!({ "error": "TTS job not found (it may have expired)" }))
+        }
+    }
+}
+
+/// DELETE /tts/speech/jobs/{id} — cancel a queued/running speech job. Once the
+/// upstream GPU job has started it can't be interrupted (same wrapper
+/// limitation as the sync path); cancelling stops the wait and discards the
+/// result. Cancelling an already-finished job leaves it terminal.
+#[delete("/tts/speech/jobs/{id}")] +pub async fn cancel_speech_job_handler( + http_request: HttpRequest, + _claims: Claims, + path: web::Path, +) -> impl Responder { + let parent_context = extract_context_from_request(&http_request); + let mut span = + global_tracer().start_with_context("http.tts.speech_job.cancel", &parent_context); + + let Ok(id) = Uuid::parse_str(&path.into_inner()) else { + span.set_status(Status::error("invalid job id")); + return HttpResponse::BadRequest().json(json!({ "error": "invalid job id" })); + }; + let status = with_job(id, |job| { + if !job.status.is_terminal() { + if let Some(h) = job.abort.take() { + h.abort(); + } + job.status = TtsJobStatus::Cancelled; + job.finished_at = Some(Instant::now()); + } + job.status + }); + match status { + Some(s) => { + span.set_status(Status::Ok); + HttpResponse::Ok().json(json!({ "job_id": id.to_string(), "status": s })) + } + None => { + span.set_status(Status::error("job not found")); + HttpResponse::NotFound() + .json(json!({ "error": "TTS job not found (it may have expired)" })) + } + } +} + +#[derive(Debug, Deserialize)] +pub struct ListVoicesQuery { + /// `?refresh=1` bypasses the voice-list cache and re-queries upstream + /// (which may spin up the TTS model). + #[serde(default)] + pub refresh: Option, +} + +/// GET /tts/voices — list the Chatterbox voice library. Served from an +/// in-memory cache when possible so browsing settings doesn't make llama-swap +/// load the TTS model (and evict the resident LLM); see VOICES_CACHE. 
#[get("/tts/voices")] pub async fn list_voices_handler( http_request: HttpRequest, _claims: Claims, + query: web::Query, app_state: web::Data, ) -> impl Responder { let parent_context = extract_context_from_request(&http_request); let mut span = global_tracer().start_with_context("http.tts.voices.list", &parent_context); + let force = query + .refresh + .as_deref() + .is_some_and(|v| matches!(v, "1" | "true" | "yes")); + if !force && let Some(v) = cached_voices() { + span.set_attribute(KeyValue::new("tts.voices_cache_hit", true)); + span.set_status(Status::Ok); + return HttpResponse::Ok().json(v); + } + let Some(client) = app_state.llamacpp.as_ref() else { span.set_status(Status::error("tts backend not configured")); return HttpResponse::ServiceUnavailable() @@ -293,6 +677,8 @@ pub async fn list_voices_handler( }; match client.list_voices().await { Ok(v) => { + store_voices_cache(&v); + span.set_attribute(KeyValue::new("tts.voices_cache_hit", false)); span.set_status(Status::Ok); HttpResponse::Ok().json(v) } @@ -304,6 +690,48 @@ pub async fn list_voices_handler( } } +/// DELETE /tts/voices/{name} — remove a cloned voice from the library. +#[delete("/tts/voices/{name}")] +pub async fn delete_voice_handler( + http_request: HttpRequest, + _claims: Claims, + path: web::Path, + app_state: web::Data, +) -> impl Responder { + let parent_context = extract_context_from_request(&http_request); + let mut span = global_tracer().start_with_context("http.tts.voices.delete", &parent_context); + + let Some(client) = app_state.llamacpp.as_ref() else { + span.set_status(Status::error("tts backend not configured")); + return HttpResponse::ServiceUnavailable() + .json(json!({ "error": "TTS backend not configured" })); + }; + // Same charset rule as creation — a name that sanitizes differently was + // never a voice we created, and must not reach the upstream URL. 
+ let raw = path.into_inner(); + let name = match sanitize_voice_name(&raw) { + Some(n) if n == raw => n, + _ => { + span.set_status(Status::error("invalid voice name")); + return HttpResponse::BadRequest().json(json!({ "error": "invalid voice name" })); + } + }; + span.set_attribute(KeyValue::new("tts.voice_name", name.clone())); + + match client.delete_voice(&name).await { + Ok(v) => { + invalidate_voices_cache(); + span.set_status(Status::Ok); + HttpResponse::Ok().json(v) + } + Err(e) => { + span.set_status(Status::error("delete_voice failed")); + log::error!("delete_voice failed: {:?}", e); + HttpResponse::BadGateway().json(json!({ "error": format!("{e}") })) + } + } +} + /// POST /tts/voices/upload — register a cloned voice from an uploaded audio /// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`). #[post("/tts/voices/upload")] @@ -363,6 +791,9 @@ pub async fn create_voice_upload_handler( return HttpResponse::BadRequest() .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" })); }; + // Tag the name with the ref-clip cap (e.g. `grandma-30s`) so the library + // shows which reference length produced each clone. + let name = append_ref_seconds(&name, tts_ref_seconds()); if file_bytes.is_empty() { span.set_status(Status::error("voice_file is required")); return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" })); @@ -388,6 +819,7 @@ pub async fn create_voice_upload_handler( .await { Ok(v) => { + invalidate_voices_cache(); span.set_status(Status::Ok); HttpResponse::Ok().json(v) } @@ -432,6 +864,9 @@ pub async fn create_voice_from_library_handler( return HttpResponse::BadRequest() .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" })); }; + // Tag the name with the ref-clip cap (e.g. `grandma-30s`) so the library + // shows which reference length produced each clone. 
+ let voice_name = append_ref_seconds(&voice_name, tts_ref_seconds()); let library = match libraries::resolve_library_param(&app_state, req.library.as_deref()) { Ok(Some(l)) => l, @@ -475,6 +910,7 @@ pub async fn create_voice_from_library_handler( .await { Ok(v) => { + invalidate_voices_cache(); span.set_status(Status::Ok); HttpResponse::Ok().json(v) } @@ -534,6 +970,95 @@ mod tests { assert_eq!(sanitize_voice_name(&long).unwrap().len(), 64); } + #[test] + fn append_ref_seconds_tags_name() { + assert_eq!(append_ref_seconds("grandma", 30), "grandma-30s"); + assert_eq!(append_ref_seconds("voice_01", 15), "voice_01-15s"); + } + + #[test] + fn append_ref_seconds_is_idempotent_for_same_cap() { + assert_eq!(append_ref_seconds("grandma-30s", 30), "grandma-30s"); + // A different cap still appends — that's the comparison use-case. + assert_eq!(append_ref_seconds("grandma-15s", 30), "grandma-15s-30s"); + } + + #[test] + fn append_ref_seconds_keeps_64_char_bound() { + let long = "a".repeat(64); + let tagged = append_ref_seconds(&long, 30); + assert_eq!(tagged.len(), 64); + assert!(tagged.ends_with("-30s")); + } + + #[test] + fn sweep_drops_expired_results_and_keeps_live_jobs() { + let now = Instant::now(); + let mk = |status: TtsJobStatus, created: Instant, finished: Option| TtsJob { + status, + format: "mp3".into(), + audio_base64: None, + error: None, + created_at: created, + finished_at: finished, + abort: None, + }; + let mut jobs = HashMap::new(); + let live = Uuid::new_v4(); + let fresh_done = Uuid::new_v4(); + let stale_done = Uuid::new_v4(); + jobs.insert(live, mk(TtsJobStatus::Running, now, None)); + jobs.insert( + fresh_done, + mk(TtsJobStatus::Done, now, Some(now - Duration::from_secs(60))), + ); + jobs.insert( + stale_done, + mk( + TtsJobStatus::Done, + now - TTS_JOB_MAX_AGE / 2, + Some(now - TTS_JOB_RESULT_TTL), + ), + ); + + sweep_stale_jobs(&mut jobs, now); + assert!(jobs.contains_key(&live)); + assert!(jobs.contains_key(&fresh_done)); + 
assert!(!jobs.contains_key(&stale_done)); + } + + #[test] + fn sweep_drops_jobs_past_max_age_even_if_unfinished() { + let now = Instant::now(); + let mut jobs = HashMap::new(); + let ancient = Uuid::new_v4(); + jobs.insert( + ancient, + TtsJob { + status: TtsJobStatus::Running, + format: "mp3".into(), + audio_base64: None, + error: None, + created_at: now - TTS_JOB_MAX_AGE, + finished_at: None, + abort: None, + }, + ); + sweep_stale_jobs(&mut jobs, now); + assert!(jobs.is_empty()); + } + + #[test] + fn voices_cache_roundtrip_and_invalidation() { + invalidate_voices_cache(); + assert!(cached_voices().is_none()); + let v = json!({ "voices": [{ "name": "m-30s" }], "count": 1 }); + store_voices_cache(&v); + assert_eq!(cached_voices(), Some(v)); + invalidate_voices_cache(); + assert!(cached_voices().is_none()); + } + #[test] fn clean_for_tts_strips_markdown() { assert_eq!( diff --git a/src/main.rs b/src/main.rs index f27cf8f..8b56efd 100644 --- a/src/main.rs +++ b/src/main.rs @@ -364,9 +364,13 @@ fn main() -> std::io::Result<()> { .service(ai::rate_insight_handler) .service(ai::export_training_data_handler) .service(ai::tts_speech_handler) + .service(ai::create_speech_job_handler) + .service(ai::speech_job_status_handler) + .service(ai::cancel_speech_job_handler) .service(ai::list_voices_handler) .service(ai::create_voice_upload_handler) .service(ai::create_voice_from_library_handler) + .service(ai::delete_voice_handler) .service(libraries::list_libraries) .service(libraries::patch_library) .add_feature(add_tag_services::<_, SqliteTagDao>)