Feature/tts integration #103
@@ -80,6 +80,16 @@ AGENTIC_CHAT_MAX_ITERATIONS=6
|
|||||||
# LLAMA_SWAP_ALLOWED_MODELS=chat,vision,embed
|
# LLAMA_SWAP_ALLOWED_MODELS=chat,vision,embed
|
||||||
# LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180
|
# LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180
|
||||||
|
|
||||||
|
# ── Text-to-speech (optional, requires LLAMA_SWAP_URL) ───────────────────
|
||||||
|
# TTS routes through the same llama-swap proxy (a Chatterbox model id), so it
|
||||||
|
# only needs LLAMA_SWAP_URL — it does NOT require LLM_BACKEND=llamacpp.
|
||||||
|
# Powers POST /tts/speech and the /tts/voices* endpoints (read-aloud insights
|
||||||
|
# + voice cloning in the mobile app).
|
||||||
|
# LLAMA_SWAP_TTS_MODEL=chatterbox # TTS model id in config.yaml
|
||||||
|
# LLAMA_SWAP_TTS_VOICE=m # default voice when a request omits one
|
||||||
|
# LLAMA_SWAP_TTS_REF_SECONDS=30 # max voice-clone reference clip length (s)
|
||||||
|
# LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS=600 # synth timeout (long chunked text)
|
||||||
|
|
||||||
# ── AI Insights — sibling services (optional) ───────────────────────────
|
# ── AI Insights — sibling services (optional) ───────────────────────────
|
||||||
# Apollo (places, face inference, CLIP encoders). Single-Apollo deploys
|
# Apollo (places, face inference, CLIP encoders). Single-Apollo deploys
|
||||||
# typically set only APOLLO_API_BASE_URL and let the face + CLIP
|
# typically set only APOLLO_API_BASE_URL and let the face + CLIP
|
||||||
|
|||||||
@@ -477,6 +477,12 @@ GET /insights/models (local-backend models + capabilities; Ollam
|
|||||||
GET /insights/openrouter/models (curated OpenRouter allowlist)
|
GET /insights/openrouter/models (curated OpenRouter allowlist)
|
||||||
POST /insights/rate (thumbs up/down for training data)
|
POST /insights/rate (thumbs up/down for training data)
|
||||||
|
|
||||||
|
// Text-to-Speech (Chatterbox via llama-swap; needs LLAMA_SWAP_URL)
|
||||||
|
POST /tts/speech (read-aloud: { text, voice?, ... } -> { audio_base64, format })
|
||||||
|
GET /tts/voices (Chatterbox voice library)
|
||||||
|
POST /tts/voices/upload (clone a voice from an uploaded clip; multipart)
|
||||||
|
POST /tts/voices/from-library (clone a voice from a library audio/video file)
|
||||||
|
|
||||||
// Insight Chat Continuation
|
// Insight Chat Continuation
|
||||||
POST /insights/chat (single-turn reply, non-streaming)
|
POST /insights/chat (single-turn reply, non-streaming)
|
||||||
POST /insights/chat/stream (SSE: text / tool_call / tool_result / truncated / done)
|
POST /insights/chat/stream (SSE: text / tool_call / tool_result / truncated / done)
|
||||||
@@ -652,6 +658,17 @@ LLAMA_SWAP_ALLOWED_MODELS=chat,coder # Curated allowlist surfaced by
|
|||||||
# Empty = picker shows only the configured primary model.
|
# Empty = picker shows only the configured primary model.
|
||||||
LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180 # Per-request timeout; bump for slow CPU offload
|
LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180 # Per-request timeout; bump for slow CPU offload
|
||||||
|
|
||||||
|
# Text-to-speech (Chatterbox served behind llama-swap). Only needs
|
||||||
|
# LLAMA_SWAP_URL — independent of LLM_BACKEND. Powers /tts/speech (read-aloud)
|
||||||
|
# and /tts/voices* (voice cloning). Reference audio is ffmpeg-normalized to WAV
|
||||||
|
# server-side, so any source format works.
|
||||||
|
LLAMA_SWAP_TTS_MODEL=chatterbox # TTS model id in config.yaml (default: chatterbox)
|
||||||
|
LLAMA_SWAP_TTS_VOICE=m # Default voice when /tts/speech omits one (optional)
|
||||||
|
LLAMA_SWAP_TTS_REF_SECONDS=30 # Max voice-clone reference clip length, seconds
|
||||||
|
# (Chatterbox is zero-shot; ~10-20s clean ref is ideal)
|
||||||
|
LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS=600 # Per-request synth timeout (long chunked insights take
|
||||||
|
# minutes); overrides the shared client timeout for /tts/speech
|
||||||
|
|
||||||
# Insight Chat Continuation
|
# Insight Chat Continuation
|
||||||
AGENTIC_CHAT_MAX_ITERATIONS=6 # Cap on tool-calling iterations per chat turn (default 6)
|
AGENTIC_CHAT_MAX_ITERATIONS=6 # Cap on tool-calling iterations per chat turn (default 6)
|
||||||
```
|
```
|
||||||
|
|||||||
Generated
+1
-1
@@ -2051,7 +2051,7 @@ dependencies = [
|
|||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "image-api"
|
name = "image-api"
|
||||||
version = "1.2.0"
|
version = "1.3.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"actix",
|
"actix",
|
||||||
"actix-cors",
|
"actix-cors",
|
||||||
|
|||||||
+1
-1
@@ -1,6 +1,6 @@
|
|||||||
[package]
|
[package]
|
||||||
name = "image-api"
|
name = "image-api"
|
||||||
version = "1.2.0"
|
version = "1.3.0"
|
||||||
authors = ["Cameron Cordes <cameronc.dev@gmail.com>"]
|
authors = ["Cameron Cordes <cameronc.dev@gmail.com>"]
|
||||||
edition = "2024"
|
edition = "2024"
|
||||||
|
|
||||||
|
|||||||
@@ -147,6 +147,34 @@ so you can rewrite the saved summary from within chat.
|
|||||||
- `AGENTIC_CHAT_MAX_ITERATIONS` - Cap on tool-calling iterations per chat turn [default: `6`]
|
- `AGENTIC_CHAT_MAX_ITERATIONS` - Cap on tool-calling iterations per chat turn [default: `6`]
|
||||||
- Per-request `max_iterations` (when sent by the client) is clamped to this cap
|
- Per-request `max_iterations` (when sent by the client) is clamped to this cap
|
||||||
|
|
||||||
|
#### Text-to-Speech (Optional)
|
||||||
|
Reads insights aloud and manages cloned voices via a Chatterbox model served
|
||||||
|
behind the same llama-swap proxy. Only requires `LLAMA_SWAP_URL` (the TTS client
|
||||||
|
is built whenever that's set — independent of `LLM_BACKEND`). Endpoints:
|
||||||
|
- `POST /tts/speech` — body `{ text, voice?, format?, exaggeration?, cfg_weight?,
|
||||||
|
temperature? }`; returns `{ audio_base64, format }`. Input is cleaned
|
||||||
|
server-side (markdown + emoji stripped) and the generation knobs are clamped
|
||||||
|
to Chatterbox's ranges. Synthesis is serialized (one at a time — the upstream
|
||||||
|
has no GPU lock of its own); a concurrent request gets a fast `429`.
|
||||||
|
- `GET /tts/voices` — list the voice library.
|
||||||
|
- `POST /tts/voices/upload` — multipart `voice_name` + `voice_file`; clone a
|
||||||
|
voice from an uploaded clip (≤25 MB).
|
||||||
|
- `POST /tts/voices/from-library` — body `{ voice_name, path, library? }`; clone
|
||||||
|
from a library file (audio forwarded as-is; video has its audio extracted via
|
||||||
|
ffmpeg).
|
||||||
|
|
||||||
|
Env:
|
||||||
|
- `LLAMA_SWAP_TTS_MODEL` - TTS model id in llama-swap's `config.yaml` [default: `chatterbox`]
|
||||||
|
- `LLAMA_SWAP_TTS_VOICE` - default voice used when a `/tts/speech` request omits `voice` (optional)
|
||||||
|
- `LLAMA_SWAP_TTS_REF_SECONDS` - max voice-clone reference clip length in seconds
|
||||||
|
[default: `30`]. Reference audio is ffmpeg-normalized to mono 24 kHz WAV (so any
|
||||||
|
source format works); Chatterbox is zero-shot, so a clean ~10–20s sample is the
|
||||||
|
sweet spot — more rarely helps.
|
||||||
|
- `LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS` - per-request synthesis timeout in
|
||||||
|
seconds [default: `600`]. Long insights are chunked + synthesized server-side
|
||||||
|
and can take minutes; this is separate from (and overrides, for `/tts/speech`)
|
||||||
|
the shared `LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS`.
|
||||||
|
|
||||||
#### Fallback Behavior
|
#### Fallback Behavior
|
||||||
- Primary server is tried first with 5-second connection timeout
|
- Primary server is tried first with 5-second connection timeout
|
||||||
- On failure, automatically falls back to secondary server (if configured)
|
- On failure, automatically falls back to secondary server (if configured)
|
||||||
|
|||||||
@@ -36,6 +36,7 @@ const DEFAULT_BASE_URL: &str = "http://localhost:9292/v1";
|
|||||||
const DEFAULT_PRIMARY_MODEL: &str = "chat";
|
const DEFAULT_PRIMARY_MODEL: &str = "chat";
|
||||||
const DEFAULT_VISION_MODEL: &str = "vision";
|
const DEFAULT_VISION_MODEL: &str = "vision";
|
||||||
const DEFAULT_EMBEDDING_MODEL: &str = "embed";
|
const DEFAULT_EMBEDDING_MODEL: &str = "embed";
|
||||||
|
/// Initial value of `LlamaCppClient::tts_model`; stays in effect until
/// `set_tts_model` replaces it.
const DEFAULT_TTS_MODEL: &str = "chatterbox";
|
||||||
const DEFAULT_REQUEST_TIMEOUT_SECS: u64 = 180;
|
const DEFAULT_REQUEST_TIMEOUT_SECS: u64 = 180;
|
||||||
|
|
||||||
/// OpenAI-compatible client targeting a llama-swap proxy in front of one or
|
/// OpenAI-compatible client targeting a llama-swap proxy in front of one or
|
||||||
@@ -54,6 +55,10 @@ pub struct LlamaCppClient {
|
|||||||
/// to `primary_model` so describe_image works out of the box; override
|
/// to `primary_model` so describe_image works out of the box; override
|
||||||
/// via `LLAMA_SWAP_VISION_MODEL` for a dedicated vision slot.
|
/// via `LLAMA_SWAP_VISION_MODEL` for a dedicated vision slot.
|
||||||
pub vision_model: String,
|
pub vision_model: String,
|
||||||
|
/// TTS model slot id (e.g. `"chatterbox"`). Routes `text_to_speech` and
|
||||||
|
/// is the `/upstream/<id>/voices` path segment for the voice library.
|
||||||
|
/// Override via `LLAMA_SWAP_TTS_MODEL`.
|
||||||
|
pub tts_model: String,
|
||||||
num_ctx: Option<i32>,
|
num_ctx: Option<i32>,
|
||||||
temperature: Option<f32>,
|
temperature: Option<f32>,
|
||||||
top_p: Option<f32>,
|
top_p: Option<f32>,
|
||||||
@@ -78,6 +83,7 @@ impl LlamaCppClient {
|
|||||||
primary_model: pm.clone(),
|
primary_model: pm.clone(),
|
||||||
embedding_model: DEFAULT_EMBEDDING_MODEL.to_string(),
|
embedding_model: DEFAULT_EMBEDDING_MODEL.to_string(),
|
||||||
vision_model: pm,
|
vision_model: pm,
|
||||||
|
tts_model: DEFAULT_TTS_MODEL.to_string(),
|
||||||
num_ctx: None,
|
num_ctx: None,
|
||||||
temperature: None,
|
temperature: None,
|
||||||
top_p: None,
|
top_p: None,
|
||||||
@@ -111,6 +117,142 @@ impl LlamaCppClient {
|
|||||||
self.min_p = min_p;
|
self.min_p = min_p;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Replace the TTS model slot id. The id is sent as the `model` field of
/// speech requests and used as the `/upstream/<id>/voices` path segment for
/// the voice library.
pub fn set_tts_model(&mut self, model: String) {
|
||||||
|
self.tts_model = model;
|
||||||
|
}
|
||||||
|
|
||||||
|
// --- TTS (Chatterbox behind llama-swap) ---------------------------------
|
||||||
|
//
|
||||||
|
// Speech synthesis uses the OpenAI-compatible `{base_url}/audio/speech`
|
||||||
|
// endpoint (llama-swap routes by the `model` field). The voice *library*
|
||||||
|
// (list / create cloned voices) is NOT an OpenAI endpoint — it lives on the
|
||||||
|
// upstream server directly, reached via llama-swap's passthrough at
|
||||||
|
// `{swap_root}/upstream/<tts_model>/voices`.
|
||||||
|
|
||||||
|
/// Root of the llama-swap proxy: `base_url` with a trailing `/v1` removed.
|
||||||
|
/// The `/upstream/...` passthrough lives here, not under `/v1`.
|
||||||
|
fn swap_root(&self) -> &str {
|
||||||
|
let b = self.base_url.trim_end_matches('/');
|
||||||
|
b.strip_suffix("/v1").unwrap_or(b)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Synthesize speech for `input` in an optional named `voice`, returning
|
||||||
|
/// the raw audio bytes (format per `response_format`, e.g. `"mp3"`/`"wav"`).
|
||||||
|
///
|
||||||
|
/// Chatterbox generation knobs are forwarded when set (caller is expected
|
||||||
|
/// to have range-clamped them): `exaggeration` (0.25–2.0, emotion),
|
||||||
|
/// `cfg_weight` (0.0–1.0, pace), `temperature` (0.05–5.0, randomness).
|
||||||
|
pub async fn text_to_speech(
|
||||||
|
&self,
|
||||||
|
input: &str,
|
||||||
|
voice: Option<&str>,
|
||||||
|
response_format: &str,
|
||||||
|
exaggeration: Option<f32>,
|
||||||
|
cfg_weight: Option<f32>,
|
||||||
|
temperature: Option<f32>,
|
||||||
|
) -> Result<Vec<u8>> {
|
||||||
|
let url = format!("{}/audio/speech", self.base_url);
|
||||||
|
let mut body = json!({
|
||||||
|
"model": self.tts_model,
|
||||||
|
"input": input,
|
||||||
|
"response_format": response_format,
|
||||||
|
});
|
||||||
|
if let Some(v) = voice {
|
||||||
|
body["voice"] = Value::String(v.to_string());
|
||||||
|
}
|
||||||
|
if let Some(x) = exaggeration {
|
||||||
|
body["exaggeration"] = json!(x);
|
||||||
|
}
|
||||||
|
if let Some(x) = cfg_weight {
|
||||||
|
body["cfg_weight"] = json!(x);
|
||||||
|
}
|
||||||
|
if let Some(x) = temperature {
|
||||||
|
body["temperature"] = json!(x);
|
||||||
|
}
|
||||||
|
|
||||||
|
// TTS gets its own (longer) timeout: synthesizing a long, internally
|
||||||
|
// chunked insight can take minutes, well past the shared chat/embedding
|
||||||
|
// client timeout. Per-request `.timeout()` overrides the client default.
|
||||||
|
let tts_timeout = std::env::var("LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS")
|
||||||
|
.ok()
|
||||||
|
.and_then(|v| v.parse::<u64>().ok())
|
||||||
|
.filter(|n| *n > 0)
|
||||||
|
.unwrap_or(600);
|
||||||
|
|
||||||
|
let resp = self
|
||||||
|
.client
|
||||||
|
.post(&url)
|
||||||
|
.timeout(Duration::from_secs(tts_timeout))
|
||||||
|
.json(&body)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("POST {} failed", url))?;
|
||||||
|
|
||||||
|
if !resp.status().is_success() {
|
||||||
|
let status = resp.status();
|
||||||
|
let text = resp.text().await.unwrap_or_default();
|
||||||
|
bail!("llama-swap TTS request failed: {} — {}", status, text);
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(resp
|
||||||
|
.bytes()
|
||||||
|
.await
|
||||||
|
.context("reading TTS audio bytes")?
|
||||||
|
.to_vec())
|
||||||
|
}
|
||||||
|
|
||||||
|
/// List voices in the Chatterbox voice library (raw JSON passthrough).
|
||||||
|
pub async fn list_voices(&self) -> Result<Value> {
|
||||||
|
let url = format!("{}/upstream/{}/voices", self.swap_root(), self.tts_model);
|
||||||
|
let resp = self
|
||||||
|
.client
|
||||||
|
.get(&url)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("GET {} failed", url))?;
|
||||||
|
|
||||||
|
if !resp.status().is_success() {
|
||||||
|
let status = resp.status();
|
||||||
|
let text = resp.text().await.unwrap_or_default();
|
||||||
|
bail!("llama-swap list_voices failed: {} — {}", status, text);
|
||||||
|
}
|
||||||
|
resp.json().await.context("parsing voices response")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Register a cloned voice from raw audio bytes (multipart `voice_name` +
|
||||||
|
/// `voice_file`). Returns the upstream JSON response.
|
||||||
|
pub async fn create_voice(
|
||||||
|
&self,
|
||||||
|
voice_name: &str,
|
||||||
|
audio_bytes: Vec<u8>,
|
||||||
|
filename: &str,
|
||||||
|
mime: &str,
|
||||||
|
) -> Result<Value> {
|
||||||
|
let url = format!("{}/upstream/{}/voices", self.swap_root(), self.tts_model);
|
||||||
|
let part = reqwest::multipart::Part::bytes(audio_bytes)
|
||||||
|
.file_name(filename.to_string())
|
||||||
|
.mime_str(mime)
|
||||||
|
.context("invalid audio mime type")?;
|
||||||
|
let form = reqwest::multipart::Form::new()
|
||||||
|
.text("voice_name", voice_name.to_string())
|
||||||
|
.part("voice_file", part);
|
||||||
|
|
||||||
|
let resp = self
|
||||||
|
.client
|
||||||
|
.post(&url)
|
||||||
|
.multipart(form)
|
||||||
|
.send()
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("POST {} (multipart) failed", url))?;
|
||||||
|
|
||||||
|
if !resp.status().is_success() {
|
||||||
|
let status = resp.status();
|
||||||
|
let text = resp.text().await.unwrap_or_default();
|
||||||
|
bail!("llama-swap create_voice failed: {} — {}", status, text);
|
||||||
|
}
|
||||||
|
resp.json().await.context("parsing create_voice response")
|
||||||
|
}
|
||||||
|
|
||||||
/// Translate canonical messages to the OpenAI-compatible wire shape.
|
/// Translate canonical messages to the OpenAI-compatible wire shape.
|
||||||
/// Behaviorally identical to `OpenRouterClient::messages_to_openai` —
|
/// Behaviorally identical to `OpenRouterClient::messages_to_openai` —
|
||||||
/// stringify tool-call arguments, rewrite images into content-parts, attach
|
/// stringify tool-call arguments, rewrite images into content-parts, attach
|
||||||
@@ -1140,4 +1282,24 @@ mod tests {
|
|||||||
let wire = LlamaCppClient::messages_to_openai(&[msg]);
|
let wire = LlamaCppClient::messages_to_openai(&[msg]);
|
||||||
assert_eq!(wire[0]["content"], "");
|
assert_eq!(wire[0]["content"], "");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn swap_root_strips_v1_suffix() {
    // (configured base URL, expected proxy root)
    let cases = [
        // Plain `/v1` suffix is removed.
        ("http://localhost:9292/v1", "http://localhost:9292"),
        // A trailing slash after `/v1` is tolerated.
        ("http://localhost:9292/v1/", "http://localhost:9292"),
        // Without a `/v1` suffix the URL comes back unchanged.
        ("http://host:1234", "http://host:1234"),
    ];

    for (base_url, expected_root) in cases {
        let client = LlamaCppClient::new(Some(base_url.to_string()), None);
        assert_eq!(client.swap_root(), expected_root);
    }
}
|
||||||
|
|
||||||
|
#[test]
fn tts_model_defaults_to_chatterbox() {
    // A client built with no overrides must target the stock TTS slot.
    let default_client = LlamaCppClient::new(None, None);
    let configured_model = default_client.tts_model.as_str();
    assert_eq!(configured_model, "chatterbox");
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -11,6 +11,7 @@ pub mod llm_client;
|
|||||||
pub mod ollama;
|
pub mod ollama;
|
||||||
pub mod openrouter;
|
pub mod openrouter;
|
||||||
pub mod sms_client;
|
pub mod sms_client;
|
||||||
|
pub mod tts;
|
||||||
pub mod turn_registry;
|
pub mod turn_registry;
|
||||||
|
|
||||||
// strip_summary_boilerplate is used by binaries (test_daily_summary), not the library
|
// strip_summary_boilerplate is used by binaries (test_daily_summary), not the library
|
||||||
@@ -34,6 +35,10 @@ pub use llm_client::{
|
|||||||
};
|
};
|
||||||
pub use ollama::{EMBEDDING_MODEL, OllamaClient};
|
pub use ollama::{EMBEDDING_MODEL, OllamaClient};
|
||||||
pub use sms_client::{SmsApiClient, SmsMessage};
|
pub use sms_client::{SmsApiClient, SmsMessage};
|
||||||
|
pub use tts::{
|
||||||
|
create_voice_from_library_handler, create_voice_upload_handler, list_voices_handler,
|
||||||
|
tts_speech_handler,
|
||||||
|
};
|
||||||
|
|
||||||
/// Display name used for the user in message transcripts and first-person
|
/// Display name used for the user in message transcripts and first-person
|
||||||
/// prompt text. Reads the `USER_NAME` env var; defaults to `"Me"`. Models
|
/// prompt text. Reads the `USER_NAME` env var; defaults to `"Me"`. Models
|
||||||
|
|||||||
+580
@@ -0,0 +1,580 @@
|
|||||||
|
// TTS endpoints: proxy text-to-speech + voice-library management to the
|
||||||
|
// Chatterbox server that sits behind llama-swap (via LlamaCppClient). Speech
|
||||||
|
// synthesis returns audio as base64-in-JSON so the mobile app can play it as a
|
||||||
|
// `data:` URI without a binary-fetch path. Voice cloning registers a named
|
||||||
|
// voice from either an uploaded clip (device) or an existing library file
|
||||||
|
// (audio read directly; video has its audio track extracted via ffmpeg).
|
||||||
|
|
||||||
|
use actix_multipart::Multipart;
|
||||||
|
use actix_web::{HttpRequest, HttpResponse, Responder, get, post, web};
|
||||||
|
use anyhow::Context;
|
||||||
|
use base64::Engine;
|
||||||
|
use bytes::{BufMut, BytesMut};
|
||||||
|
use futures::StreamExt;
|
||||||
|
use opentelemetry::KeyValue;
|
||||||
|
use opentelemetry::trace::{Span, Status, Tracer};
|
||||||
|
use regex::Regex;
|
||||||
|
use serde::{Deserialize, Serialize};
|
||||||
|
use serde_json::json;
|
||||||
|
use std::path::Path;
|
||||||
|
use std::sync::LazyLock;
|
||||||
|
use tokio::sync::Semaphore;
|
||||||
|
|
||||||
|
use crate::data::Claims;
|
||||||
|
use crate::file_types::{is_audio_file, is_video_file};
|
||||||
|
use crate::files::is_valid_full_path;
|
||||||
|
use crate::libraries;
|
||||||
|
use crate::otel::{extract_context_from_request, global_tracer};
|
||||||
|
use crate::state::AppState;
|
||||||
|
|
||||||
|
/// Upper bound, in bytes, on an uploaded voice-reference clip (25 MiB).
/// Chatterbox applies its own payload cap (~60s clip); this ceiling exists so
/// an oversized or hostile upload cannot inflate ImageApi memory before the
/// clip is ever forwarded.
const MAX_VOICE_UPLOAD_BYTES: usize = 25 << 20;
|
||||||
|
|
||||||
|
/// Serialize speech synthesis: the Chatterbox server has no internal lock or
|
||||||
|
/// queue, so concurrent requests contend on the single GPU and cascade into
|
||||||
|
/// timeouts. One permit; when busy we fast-fail with 429 rather than queue —
|
||||||
|
/// the app surfaces "busy" immediately, and typical jobs clear in well under a
|
||||||
|
/// minute. (An abandoned upstream job can still occupy the GPU until it
|
||||||
|
/// finishes — that's a wrapper limitation; the chunked-queue plan fixes it.)
|
||||||
|
static TTS_PERMIT: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(1));
|
||||||
|
|
||||||
|
/// Sanitize a user-supplied voice name. The name is forwarded to Chatterbox
/// where it becomes a filename in the voice-library directory, so it is
/// restricted to a safe charset (ASCII alphanumerics, dash, underscore) — no
/// path separators, dots, or whitespace — and bounded to 64 characters.
///
/// Every disallowed character is replaced by `-`; leading and trailing dashes
/// are then removed. Trailing dashes are removed again *after* the length
/// cap, so truncation can never leave a dangling `-` at the end.
///
/// Returns `None` when nothing usable remains.
fn sanitize_voice_name(raw: &str) -> Option<String> {
    const MAX_VOICE_NAME_LEN: usize = 64;

    let mapped: String = raw
        .trim()
        .chars()
        .map(|c| {
            if c.is_ascii_alphanumeric() || c == '-' || c == '_' {
                c
            } else {
                '-'
            }
        })
        .collect();

    // `mapped` is pure ASCII at this point (everything else became '-'),
    // so slicing by byte index cannot split a character.
    let trimmed = mapped.trim_matches('-');
    let capped = &trimmed[..trimmed.len().min(MAX_VOICE_NAME_LEN)];

    // The cut may land right after a replaced character; drop that dash too.
    // The first character is never a dash, so a non-empty input stays non-empty.
    let name = capped.trim_end_matches('-');

    if name.is_empty() {
        None
    } else {
        Some(name.to_string())
    }
}
|
||||||
|
|
||||||
|
/// Voice to fall back on when a synthesis request names none. Read from the
/// `LLAMA_SWAP_TTS_VOICE` environment variable (e.g. `LLAMA_SWAP_TTS_VOICE=m`
/// to read insights in a cloned voice by default); surrounding whitespace is
/// trimmed and an unset or blank value yields `None`.
fn default_voice() -> Option<String> {
    let raw = std::env::var("LLAMA_SWAP_TTS_VOICE").ok()?;
    let trimmed = raw.trim();
    if trimmed.is_empty() {
        None
    } else {
        Some(trimmed.to_string())
    }
}
|
||||||
|
|
||||||
|
// Markdown / formatting strippers, compiled once. Insight text is markdown,
|
||||||
|
// which TTS would otherwise read literally ("star star bold star star").
|
||||||
|
static MD_IMAGE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\([^)]*\)").unwrap());
|
||||||
|
static MD_LINK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\([^)]*\)").unwrap());
|
||||||
|
static MD_HEADING: LazyLock<Regex> =
|
||||||
|
LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}#{1,6}\s*").unwrap());
|
||||||
|
static MD_BLOCKQUOTE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}>\s?").unwrap());
|
||||||
|
static MD_LIST: LazyLock<Regex> =
|
||||||
|
LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}([-*+]|\d+\.)\s+").unwrap());
|
||||||
|
static MD_EMPHASIS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[*_`~]+").unwrap());
|
||||||
|
static URL_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"https?://\S+").unwrap());
|
||||||
|
static MULTISPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[ \t]{2,}").unwrap());
|
||||||
|
// Any run of 2+ newlines (incl. whitespace-only blank lines) collapses to ONE
|
||||||
|
// newline: Chatterbox inserts a long pause (sometimes ~20s of silence) per
|
||||||
|
// blank line, so paragraph breaks must reach it as a single line break at most.
|
||||||
|
static MULTINEWLINE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n(?:[ \t]*\n)+").unwrap());
|
||||||
|
|
||||||
|
/// True for emoji / pictographic symbols, which most TTS models either skip or
/// mispronounce. Covers the main emoji blocks plus dingbats, misc-technical,
/// and the invisible characters that glue emoji sequences together:
/// variation selectors, the zero-width joiner, the combining keycap
/// (`1️⃣` = `1` + U+FE0F + U+20E3) and the tag characters used by subdivision
/// flags. Without the last two, stripping the visible parts of a sequence
/// would leave invisible residue in the text sent to the synthesizer.
///
/// `[bracketed]` tags are deliberately NOT matched — non-turbo Chatterbox
/// ignores them, and a future Turbo switch uses them as paralinguistic cues.
fn is_emoji_like(c: char) -> bool {
    matches!(
        u32::from(c),
        0x1F000..=0x1FAFF       // emoji, pictographs, supplemental symbols, flags
            | 0x2300..=0x23FF   // misc technical (⌚ ⏰ ⏳ …)
            | 0x2600..=0x27BF   // misc symbols + dingbats
            | 0x2B00..=0x2BFF   // misc symbols & arrows (★ ⬆ …)
            | 0xFE00..=0xFE0F   // variation selectors
            | 0x200D            // zero-width joiner
            | 0x20E3            // combining enclosing keycap
            | 0xE0020..=0xE007F // tag characters (incl. cancel tag)
    )
}
|
||||||
|
|
||||||
|
/// Normalize insight text for speech: unwrap markdown links/images to their
|
||||||
|
/// visible text, drop heading/list/blockquote/emphasis markers and URLs, strip
|
||||||
|
/// emoji, and collapse whitespace. Centralized here so every caller (app,
|
||||||
|
/// WebUI, curl) gets clean audio.
|
||||||
|
fn clean_for_tts(input: &str) -> String {
|
||||||
|
let s = MD_IMAGE.replace_all(input, "$1");
|
||||||
|
let s = MD_LINK.replace_all(&s, "$1");
|
||||||
|
let s = MD_HEADING.replace_all(&s, "");
|
||||||
|
let s = MD_BLOCKQUOTE.replace_all(&s, "");
|
||||||
|
let s = MD_LIST.replace_all(&s, "");
|
||||||
|
let s = MD_EMPHASIS.replace_all(&s, "");
|
||||||
|
let s = URL_RE.replace_all(&s, " ");
|
||||||
|
let s: String = s.chars().filter(|c| !is_emoji_like(*c)).collect();
|
||||||
|
let s = MULTISPACE.replace_all(&s, " ");
|
||||||
|
let s = MULTINEWLINE.replace_all(&s, "\n");
|
||||||
|
s.trim().to_string()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Decode an audio/video file to mono 24 kHz WAV via ffmpeg, returning the WAV
|
||||||
|
/// bytes. Chatterbox validates the reference clip by file *extension* and
|
||||||
|
/// rejects several formats (e.g. `.aac`, `.opus`), so we always normalize to
|
||||||
|
/// WAV regardless of the source container. Capped at 30s — references only need
|
||||||
|
/// a few seconds of clean speech.
|
||||||
|
async fn run_ffmpeg_to_wav(input_path: &str) -> anyhow::Result<Vec<u8>> {
|
||||||
|
let out = tempfile::Builder::new()
|
||||||
|
.suffix(".wav")
|
||||||
|
.tempfile()
|
||||||
|
.context("creating temp wav")?;
|
||||||
|
let out_s = out.path().to_string_lossy().to_string();
|
||||||
|
|
||||||
|
// Cap the reference clip length. Chatterbox is zero-shot — a clean ~10–20s
|
||||||
|
// sample is the sweet spot and more rarely helps — so we use the first N
|
||||||
|
// seconds. Tune via LLAMA_SWAP_TTS_REF_SECONDS (default 30).
|
||||||
|
let secs = std::env::var("LLAMA_SWAP_TTS_REF_SECONDS")
|
||||||
|
.ok()
|
||||||
|
.and_then(|s| s.trim().parse::<u32>().ok())
|
||||||
|
.filter(|n| *n > 0)
|
||||||
|
.unwrap_or(30)
|
||||||
|
.to_string();
|
||||||
|
|
||||||
|
let output = tokio::process::Command::new("ffmpeg")
|
||||||
|
.args([
|
||||||
|
"-y", "-i", input_path, "-vn", "-ac", "1", "-ar", "24000", "-t", &secs, "-f", "wav",
|
||||||
|
&out_s,
|
||||||
|
])
|
||||||
|
.output()
|
||||||
|
.await
|
||||||
|
.context("spawning ffmpeg")?;
|
||||||
|
|
||||||
|
if !output.status.success() {
|
||||||
|
anyhow::bail!("ffmpeg failed: {}", String::from_utf8_lossy(&output.stderr));
|
||||||
|
}
|
||||||
|
std::fs::read(&out_s).context("reading transcoded audio")
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Normalize in-memory upload bytes to WAV: write to a temp file (keeping the
|
||||||
|
/// source extension as an ffmpeg probe hint) then transcode.
|
||||||
|
async fn transcode_bytes_to_wav(input: &[u8], src_ext: Option<&str>) -> anyhow::Result<Vec<u8>> {
|
||||||
|
let suffix = src_ext
|
||||||
|
.filter(|e| !e.is_empty())
|
||||||
|
.map(|e| format!(".{e}"))
|
||||||
|
.unwrap_or_else(|| ".bin".to_string());
|
||||||
|
let in_tmp = tempfile::Builder::new()
|
||||||
|
.suffix(&suffix)
|
||||||
|
.tempfile()
|
||||||
|
.context("creating temp input")?;
|
||||||
|
std::fs::write(in_tmp.path(), input).context("writing temp input")?;
|
||||||
|
run_ffmpeg_to_wav(&in_tmp.path().to_string_lossy()).await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// JSON body accepted by `POST /tts/speech`. Only `text` is required; every
/// other field may be omitted.
#[derive(Debug, Deserialize)]
|
||||||
|
pub struct TtsSpeechRequest {
|
||||||
|
/// Text to read aloud. The handler passes it through `clean_for_tts`
/// before synthesis and rejects the request if nothing remains.
pub text: String,
|
||||||
|
/// Voice to use. When absent or empty the handler falls back to
/// `default_voice()`.
#[serde(default)]
|
||||||
|
pub voice: Option<String>,
|
||||||
|
/// Audio container, e.g. `"mp3"` (default) or `"wav"`.
|
||||||
|
#[serde(default)]
|
||||||
|
pub format: Option<String>,
|
||||||
|
/// Chatterbox knobs (clamped server-side). exaggeration 0.25–2.0 (emotion),
|
||||||
|
/// cfg_weight 0.0–1.0 (pace; ~0.3 for fast speakers, 0 to neutralize a
|
||||||
|
/// reference accent), temperature 0.05–5.0 (randomness).
|
||||||
|
#[serde(default)]
|
||||||
|
pub exaggeration: Option<f32>,
|
||||||
|
#[serde(default)]
|
||||||
|
pub cfg_weight: Option<f32>,
|
||||||
|
#[serde(default)]
|
||||||
|
pub temperature: Option<f32>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// JSON body returned by a successful `POST /tts/speech`.
#[derive(Debug, Serialize)]
|
||||||
|
pub struct TtsSpeechResponse {
|
||||||
|
/// Synthesized audio, encoded with the standard base64 alphabet.
pub audio_base64: String,
|
||||||
|
/// Audio container that was requested from the backend (e.g. `"mp3"`).
pub format: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// POST /tts/speech — synthesize `text` (optionally in a named `voice`) and
|
||||||
|
/// return base64-encoded audio for `data:` URI playback on the client.
|
||||||
|
#[post("/tts/speech")]
|
||||||
|
pub async fn tts_speech_handler(
|
||||||
|
http_request: HttpRequest,
|
||||||
|
_claims: Claims,
|
||||||
|
req: web::Json<TtsSpeechRequest>,
|
||||||
|
app_state: web::Data<AppState>,
|
||||||
|
) -> impl Responder {
|
||||||
|
let parent_context = extract_context_from_request(&http_request);
|
||||||
|
let mut span = global_tracer().start_with_context("http.tts.speech", &parent_context);
|
||||||
|
|
||||||
|
let text = clean_for_tts(&req.text);
|
||||||
|
if text.is_empty() {
|
||||||
|
span.set_status(Status::error("text is required"));
|
||||||
|
return HttpResponse::BadRequest().json(json!({ "error": "text is required" }));
|
||||||
|
}
|
||||||
|
let Some(client) = app_state.llamacpp.as_ref() else {
|
||||||
|
span.set_status(Status::error("tts backend not configured"));
|
||||||
|
return HttpResponse::ServiceUnavailable()
|
||||||
|
.json(json!({ "error": "TTS backend not configured (set LLAMA_SWAP_URL)" }));
|
||||||
|
};
|
||||||
|
|
||||||
|
let format = req
|
||||||
|
.format
|
||||||
|
.as_deref()
|
||||||
|
.filter(|s| !s.is_empty())
|
||||||
|
.unwrap_or("mp3");
|
||||||
|
let dv = default_voice();
|
||||||
|
let voice = req
|
||||||
|
.voice
|
||||||
|
.as_deref()
|
||||||
|
.filter(|s| !s.is_empty())
|
||||||
|
.or(dv.as_deref());
|
||||||
|
|
||||||
|
span.set_attribute(KeyValue::new("tts.model", client.tts_model.clone()));
|
||||||
|
span.set_attribute(KeyValue::new("tts.format", format.to_string()));
|
||||||
|
span.set_attribute(KeyValue::new("tts.has_voice", voice.is_some()));
|
||||||
|
span.set_attribute(KeyValue::new("tts.text_len", text.len() as i64));
|
||||||
|
|
||||||
|
// Clamp generation knobs to Chatterbox's documented ranges before forwarding.
|
||||||
|
let exaggeration = req.exaggeration.map(|x| x.clamp(0.25, 2.0));
|
||||||
|
let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0));
|
||||||
|
let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0));
|
||||||
|
|
||||||
|
// One synthesis at a time (see TTS_PERMIT) — fast-fail when busy.
|
||||||
|
let Ok(_permit) = TTS_PERMIT.try_acquire() else {
|
||||||
|
span.set_status(Status::error("tts busy"));
|
||||||
|
return HttpResponse::TooManyRequests().json(json!({
|
||||||
|
"error": "TTS is busy with another request — try again shortly"
|
||||||
|
}));
|
||||||
|
};
|
||||||
|
|
||||||
|
match client
|
||||||
|
.text_to_speech(&text, voice, format, exaggeration, cfg_weight, temperature)
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(bytes) => {
|
||||||
|
span.set_attribute(KeyValue::new("tts.audio_bytes", bytes.len() as i64));
|
||||||
|
span.set_status(Status::Ok);
|
||||||
|
let audio_base64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
|
||||||
|
HttpResponse::Ok().json(TtsSpeechResponse {
|
||||||
|
audio_base64,
|
||||||
|
format: format.to_string(),
|
||||||
|
})
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
span.set_status(Status::error("tts synthesis failed"));
|
||||||
|
log::error!("TTS synth failed: {:?}", e);
|
||||||
|
HttpResponse::BadGateway().json(json!({ "error": format!("TTS failed: {e}") }))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// GET /tts/voices — list the Chatterbox voice library (raw passthrough).
|
||||||
|
#[get("/tts/voices")]
|
||||||
|
pub async fn list_voices_handler(
|
||||||
|
http_request: HttpRequest,
|
||||||
|
_claims: Claims,
|
||||||
|
app_state: web::Data<AppState>,
|
||||||
|
) -> impl Responder {
|
||||||
|
let parent_context = extract_context_from_request(&http_request);
|
||||||
|
let mut span = global_tracer().start_with_context("http.tts.voices.list", &parent_context);
|
||||||
|
|
||||||
|
let Some(client) = app_state.llamacpp.as_ref() else {
|
||||||
|
span.set_status(Status::error("tts backend not configured"));
|
||||||
|
return HttpResponse::ServiceUnavailable()
|
||||||
|
.json(json!({ "error": "TTS backend not configured" }));
|
||||||
|
};
|
||||||
|
match client.list_voices().await {
|
||||||
|
Ok(v) => {
|
||||||
|
span.set_status(Status::Ok);
|
||||||
|
HttpResponse::Ok().json(v)
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
span.set_status(Status::error("list_voices failed"));
|
||||||
|
log::error!("list_voices failed: {:?}", e);
|
||||||
|
HttpResponse::BadGateway().json(json!({ "error": format!("{e}") }))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// POST /tts/voices/upload — register a cloned voice from an uploaded audio
|
||||||
|
/// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`).
|
||||||
|
#[post("/tts/voices/upload")]
|
||||||
|
pub async fn create_voice_upload_handler(
|
||||||
|
http_request: HttpRequest,
|
||||||
|
_claims: Claims,
|
||||||
|
mut payload: Multipart,
|
||||||
|
app_state: web::Data<AppState>,
|
||||||
|
) -> impl Responder {
|
||||||
|
let parent_context = extract_context_from_request(&http_request);
|
||||||
|
let mut span = global_tracer().start_with_context("http.tts.voices.upload", &parent_context);
|
||||||
|
|
||||||
|
let Some(client) = app_state.llamacpp.as_ref() else {
|
||||||
|
span.set_status(Status::error("tts backend not configured"));
|
||||||
|
return HttpResponse::ServiceUnavailable()
|
||||||
|
.json(json!({ "error": "TTS backend not configured" }));
|
||||||
|
};
|
||||||
|
|
||||||
|
let mut voice_name: Option<String> = None;
|
||||||
|
let mut file_bytes = BytesMut::new();
|
||||||
|
let mut filename = "voice.wav".to_string();
|
||||||
|
|
||||||
|
while let Some(Ok(mut part)) = payload.next().await {
|
||||||
|
// Capture disposition fields up front so the immutable borrow ends
|
||||||
|
// before we mutably stream the part body (mirrors handlers/image.rs).
|
||||||
|
let (fname_opt, name_opt) = {
|
||||||
|
let cd = part.content_disposition();
|
||||||
|
(
|
||||||
|
cd.and_then(|c| c.get_filename()).map(|s| s.to_string()),
|
||||||
|
cd.and_then(|c| c.get_name()).map(|s| s.to_string()),
|
||||||
|
)
|
||||||
|
};
|
||||||
|
|
||||||
|
if let Some(fname) = fname_opt {
|
||||||
|
filename = fname;
|
||||||
|
while let Some(Ok(data)) = part.next().await {
|
||||||
|
if file_bytes.len() + data.len() > MAX_VOICE_UPLOAD_BYTES {
|
||||||
|
span.set_status(Status::error("voice clip exceeds limit"));
|
||||||
|
return HttpResponse::PayloadTooLarge()
|
||||||
|
.json(json!({ "error": "voice clip exceeds 25 MB" }));
|
||||||
|
}
|
||||||
|
file_bytes.put(data);
|
||||||
|
}
|
||||||
|
} else if name_opt.as_deref() == Some("voice_name") {
|
||||||
|
let mut buf = BytesMut::new();
|
||||||
|
while let Some(Ok(data)) = part.next().await {
|
||||||
|
buf.put(data);
|
||||||
|
}
|
||||||
|
voice_name = Some(String::from_utf8_lossy(&buf).trim().to_string());
|
||||||
|
} else {
|
||||||
|
while let Some(Ok(_)) = part.next().await {}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
let Some(name) = voice_name.as_deref().and_then(sanitize_voice_name) else {
|
||||||
|
span.set_status(Status::error("voice_name is required"));
|
||||||
|
return HttpResponse::BadRequest()
|
||||||
|
.json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
|
||||||
|
};
|
||||||
|
if file_bytes.is_empty() {
|
||||||
|
span.set_status(Status::error("voice_file is required"));
|
||||||
|
return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" }));
|
||||||
|
}
|
||||||
|
span.set_attribute(KeyValue::new("tts.voice_name", name.clone()));
|
||||||
|
span.set_attribute(KeyValue::new("tts.upload_bytes", file_bytes.len() as i64));
|
||||||
|
|
||||||
|
// Normalize to WAV so any device format (e.g. .aac / .opus, which Chatterbox
|
||||||
|
// rejects by extension) is accepted.
|
||||||
|
let src_ext = Path::new(&filename).extension().and_then(|e| e.to_str());
|
||||||
|
let wav = match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext).await {
|
||||||
|
Ok(w) => w,
|
||||||
|
Err(e) => {
|
||||||
|
span.set_status(Status::error("audio decode failed"));
|
||||||
|
log::error!("voice upload transcode failed: {:?}", e);
|
||||||
|
return HttpResponse::BadRequest()
|
||||||
|
.json(json!({ "error": "couldn't decode that audio file" }));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
match client
|
||||||
|
.create_voice(&name, wav, "reference.wav", "audio/wav")
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(v) => {
|
||||||
|
span.set_status(Status::Ok);
|
||||||
|
HttpResponse::Ok().json(v)
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
span.set_status(Status::error("create_voice failed"));
|
||||||
|
log::error!("create_voice (upload) failed: {:?}", e);
|
||||||
|
HttpResponse::BadGateway().json(json!({ "error": format!("{e}") }))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Debug, Deserialize)]
|
||||||
|
pub struct CreateVoiceFromLibraryRequest {
|
||||||
|
pub voice_name: String,
|
||||||
|
/// Library-relative path to an audio or video file.
|
||||||
|
pub path: String,
|
||||||
|
#[serde(default)]
|
||||||
|
pub library: Option<String>,
|
||||||
|
}
|
||||||
|
|
||||||
|
/// POST /tts/voices/from-library — register a cloned voice from a file already
|
||||||
|
/// in a library. Audio and video alike are ffmpeg-normalized to a mono 24 kHz
|
||||||
|
/// WAV reference clip (length capped by LLAMA_SWAP_TTS_REF_SECONDS).
|
||||||
|
#[post("/tts/voices/from-library")]
|
||||||
|
pub async fn create_voice_from_library_handler(
|
||||||
|
http_request: HttpRequest,
|
||||||
|
_claims: Claims,
|
||||||
|
req: web::Json<CreateVoiceFromLibraryRequest>,
|
||||||
|
app_state: web::Data<AppState>,
|
||||||
|
) -> impl Responder {
|
||||||
|
let parent_context = extract_context_from_request(&http_request);
|
||||||
|
let mut span =
|
||||||
|
global_tracer().start_with_context("http.tts.voices.from_library", &parent_context);
|
||||||
|
|
||||||
|
let Some(client) = app_state.llamacpp.as_ref() else {
|
||||||
|
span.set_status(Status::error("tts backend not configured"));
|
||||||
|
return HttpResponse::ServiceUnavailable()
|
||||||
|
.json(json!({ "error": "TTS backend not configured" }));
|
||||||
|
};
|
||||||
|
let Some(voice_name) = sanitize_voice_name(&req.voice_name) else {
|
||||||
|
span.set_status(Status::error("voice_name is required"));
|
||||||
|
return HttpResponse::BadRequest()
|
||||||
|
.json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
|
||||||
|
};
|
||||||
|
|
||||||
|
let library = match libraries::resolve_library_param(&app_state, req.library.as_deref()) {
|
||||||
|
Ok(Some(l)) => l,
|
||||||
|
Ok(None) => app_state.primary_library(),
|
||||||
|
Err(msg) => {
|
||||||
|
span.set_status(Status::error("invalid library"));
|
||||||
|
return HttpResponse::BadRequest().json(json!({ "error": msg }));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// is_valid_full_path confines the path to the library root (no traversal).
|
||||||
|
let abs = match is_valid_full_path(&library.root_path, &req.path, false) {
|
||||||
|
Some(p) if p.exists() => p,
|
||||||
|
_ => {
|
||||||
|
span.set_status(Status::error("file not found"));
|
||||||
|
return HttpResponse::NotFound().json(json!({ "error": "file not found in library" }));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
// Only real audio/video sources are valid voice references — refuse to
|
||||||
|
// slurp arbitrary library files into memory / ffmpeg.
|
||||||
|
if !is_audio_file(&abs) && !is_video_file(&abs) {
|
||||||
|
span.set_status(Status::error("not an audio/video file"));
|
||||||
|
return HttpResponse::BadRequest()
|
||||||
|
.json(json!({ "error": "file is not an audio or video file" }));
|
||||||
|
}
|
||||||
|
span.set_attribute(KeyValue::new("tts.voice_name", voice_name.clone()));
|
||||||
|
|
||||||
|
let wav = match prepare_reference_audio(&abs).await {
|
||||||
|
Ok(b) => b,
|
||||||
|
Err(e) => {
|
||||||
|
span.set_status(Status::error("audio decode failed"));
|
||||||
|
log::error!("voice reference prep failed for {:?}: {:?}", abs, e);
|
||||||
|
return HttpResponse::BadRequest()
|
||||||
|
.json(json!({ "error": "couldn't decode that file's audio" }));
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
match client
|
||||||
|
.create_voice(&voice_name, wav, "reference.wav", "audio/wav")
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(v) => {
|
||||||
|
span.set_status(Status::Ok);
|
||||||
|
HttpResponse::Ok().json(v)
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
span.set_status(Status::error("create_voice failed"));
|
||||||
|
log::error!("create_voice (from-library) failed: {:?}", e);
|
||||||
|
HttpResponse::BadGateway().json(json!({ "error": format!("{e}") }))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Read a library file (audio or video) as a Chatterbox-ready reference: ffmpeg
|
||||||
|
/// decodes/extracts its audio to mono 24 kHz WAV. Reading straight from the
|
||||||
|
/// library path avoids slurping a (possibly large) video into memory.
|
||||||
|
async fn prepare_reference_audio(abs: &Path) -> anyhow::Result<Vec<u8>> {
|
||||||
|
run_ffmpeg_to_wav(&abs.to_string_lossy()).await
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Unit tests for the pure text helpers used by the TTS handlers:
/// `sanitize_voice_name` (voice-name normalization) and `clean_for_tts`
/// (markdown / emoji / URL stripping before synthesis).
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn sanitize_voice_name_keeps_safe_chars() {
        assert_eq!(sanitize_voice_name("m").as_deref(), Some("m"));
        assert_eq!(
            sanitize_voice_name(" Cameron ").as_deref(),
            Some("Cameron")
        );
        assert_eq!(
            sanitize_voice_name("voice_01-a").as_deref(),
            Some("voice_01-a")
        );
    }

    #[test]
    fn sanitize_voice_name_strips_unsafe_chars() {
        // Path separators / dots / spaces become '-' and are trimmed at edges.
        assert_eq!(sanitize_voice_name("a b.c").as_deref(), Some("a-b-c"));
        assert_eq!(
            sanitize_voice_name("../etc/passwd").as_deref(),
            Some("etc-passwd")
        );
    }

    #[test]
    fn sanitize_voice_name_rejects_empty_or_all_unsafe() {
        assert_eq!(sanitize_voice_name(""), None);
        assert_eq!(sanitize_voice_name(" "), None);
        assert_eq!(sanitize_voice_name("../../"), None);
        assert_eq!(sanitize_voice_name("...."), None);
    }

    #[test]
    fn sanitize_voice_name_bounds_length() {
        let long = "a".repeat(200);
        assert_eq!(sanitize_voice_name(&long).unwrap().len(), 64);
    }

    #[test]
    fn clean_for_tts_strips_markdown() {
        assert_eq!(
            clean_for_tts("**Bold** and _italic_ and `code`"),
            "Bold and italic and code"
        );
        assert_eq!(clean_for_tts("# Title\n\nbody"), "Title\nbody");
        assert_eq!(
            clean_for_tts("See [docs](http://x.com) now"),
            "See docs now"
        );
        assert_eq!(clean_for_tts("- one\n- two"), "one\ntwo");
    }

    #[test]
    fn clean_for_tts_strips_emoji_and_urls() {
        assert_eq!(clean_for_tts("Hello 😀 world 🎉"), "Hello world");
        assert_eq!(
            clean_for_tts("visit https://example.com today"),
            "visit today"
        );
        // ZWJ-glued emoji sequence (man + ZWJ + woman + ZWJ + girl) is fully
        // removed. The joiners are written as escapes because U+200D is
        // invisible and easily dropped when the literal is edited or copied.
        assert_eq!(
            clean_for_tts("family \u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467} photo"),
            "family photo"
        );
    }

    #[test]
    fn clean_for_tts_collapses_blank_lines_to_single_break() {
        // Chatterbox pauses (sometimes ~20s) per blank line, so paragraph
        // breaks must collapse to a single newline.
        assert_eq!(clean_for_tts("para one\n\npara two"), "para one\npara two");
        assert_eq!(clean_for_tts("a\n\n\n\nb"), "a\nb");
        // Whitespace-only "blank" lines collapse too.
        assert_eq!(clean_for_tts("a\n \t \nb"), "a\nb");
        // A single newline is left alone.
        assert_eq!(clean_for_tts("a\nb"), "a\nb");
    }

    #[test]
    fn clean_for_tts_preserves_bracket_tags() {
        // Non-turbo Chatterbox ignores these; a future Turbo uses them as
        // paralinguistic cues — so we must not strip them.
        assert_eq!(clean_for_tts("hello [laugh] there"), "hello [laugh] there");
    }
}
|
||||||
@@ -22,6 +22,10 @@ pub fn needs_ffmpeg_thumbnail(path: &Path) -> bool {
|
|||||||
/// Supported video file extensions
|
/// Supported video file extensions
|
||||||
pub const VIDEO_EXTENSIONS: &[&str] = &["mp4", "mov", "avi", "mkv"];
|
pub const VIDEO_EXTENSIONS: &[&str] = &["mp4", "mov", "avi", "mkv"];
|
||||||
|
|
||||||
|
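/// Audio file extensions accepted as voice-clone references (TTS): wav, mp3,
/// flac, m4a, aac, ogg/oga and opus. References are transcoded to WAV with
/// ffmpeg before they reach Chatterbox, so this list is not limited to the
/// formats Chatterbox accepts directly.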
|
||||||
|
pub const AUDIO_EXTENSIONS: &[&str] = &["wav", "mp3", "flac", "m4a", "aac", "ogg", "oga", "opus"];
|
||||||
|
|
||||||
/// Filenames that are filesystem metadata, not real media — exact
|
/// Filenames that are filesystem metadata, not real media — exact
|
||||||
/// basename match. Extend if a new platform sidecar appears (Windows
|
/// basename match. Extend if a new platform sidecar appears (Windows
|
||||||
/// Thumbs.db / desktop.ini live here too if those libraries land).
|
/// Thumbs.db / desktop.ini live here too if those libraries land).
|
||||||
@@ -75,6 +79,19 @@ pub fn is_video_file(path: &Path) -> bool {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Check if a path has an audio extension (voice-clone references)
|
||||||
|
pub fn is_audio_file(path: &Path) -> bool {
|
||||||
|
if is_filesystem_metadata(path) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
|
||||||
|
let ext_lower = ext.to_lowercase();
|
||||||
|
AUDIO_EXTENSIONS.contains(&ext_lower.as_str())
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Check if a path has a supported media extension (image or video)
|
/// Check if a path has a supported media extension (image or video)
|
||||||
pub fn is_media_file(path: &Path) -> bool {
|
pub fn is_media_file(path: &Path) -> bool {
|
||||||
is_image_file(path) || is_video_file(path)
|
is_image_file(path) || is_video_file(path)
|
||||||
|
|||||||
@@ -362,6 +362,10 @@ fn main() -> std::io::Result<()> {
|
|||||||
.service(ai::cancel_turn_handler)
|
.service(ai::cancel_turn_handler)
|
||||||
.service(ai::rate_insight_handler)
|
.service(ai::rate_insight_handler)
|
||||||
.service(ai::export_training_data_handler)
|
.service(ai::export_training_data_handler)
|
||||||
|
.service(ai::tts_speech_handler)
|
||||||
|
.service(ai::list_voices_handler)
|
||||||
|
.service(ai::create_voice_upload_handler)
|
||||||
|
.service(ai::create_voice_from_library_handler)
|
||||||
.service(libraries::list_libraries)
|
.service(libraries::list_libraries)
|
||||||
.service(libraries::patch_library)
|
.service(libraries::patch_library)
|
||||||
.add_feature(add_tag_services::<_, SqliteTagDao>)
|
.add_feature(add_tag_services::<_, SqliteTagDao>)
|
||||||
|
|||||||
@@ -391,6 +391,9 @@ fn build_llamacpp_from_env() -> Option<Arc<LlamaCppClient>> {
|
|||||||
if let Ok(model) = env::var("LLAMA_SWAP_VISION_MODEL") {
|
if let Ok(model) = env::var("LLAMA_SWAP_VISION_MODEL") {
|
||||||
client.set_vision_model(model);
|
client.set_vision_model(model);
|
||||||
}
|
}
|
||||||
|
if let Ok(model) = env::var("LLAMA_SWAP_TTS_MODEL") {
|
||||||
|
client.set_tts_model(model);
|
||||||
|
}
|
||||||
Some(Arc::new(client))
|
Some(Arc::new(client))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user