Feature/tts integration #103
@@ -138,11 +138,18 @@ impl LlamaCppClient {
|
|||||||
|
|
||||||
/// Synthesize speech for `input` in an optional named `voice`, returning
|
/// Synthesize speech for `input` in an optional named `voice`, returning
|
||||||
/// the raw audio bytes (format per `response_format`, e.g. `"mp3"`/`"wav"`).
|
/// the raw audio bytes (format per `response_format`, e.g. `"mp3"`/`"wav"`).
|
||||||
|
///
|
||||||
|
/// Chatterbox generation knobs are forwarded when set (caller is expected
|
||||||
|
/// to have range-clamped them): `exaggeration` (0.25–2.0, emotion),
|
||||||
|
/// `cfg_weight` (0.0–1.0, pace), `temperature` (0.05–5.0, randomness).
|
||||||
pub async fn text_to_speech(
|
pub async fn text_to_speech(
|
||||||
&self,
|
&self,
|
||||||
input: &str,
|
input: &str,
|
||||||
voice: Option<&str>,
|
voice: Option<&str>,
|
||||||
response_format: &str,
|
response_format: &str,
|
||||||
|
exaggeration: Option<f32>,
|
||||||
|
cfg_weight: Option<f32>,
|
||||||
|
temperature: Option<f32>,
|
||||||
) -> Result<Vec<u8>> {
|
) -> Result<Vec<u8>> {
|
||||||
let url = format!("{}/audio/speech", self.base_url);
|
let url = format!("{}/audio/speech", self.base_url);
|
||||||
let mut body = json!({
|
let mut body = json!({
|
||||||
@@ -153,6 +160,15 @@ impl LlamaCppClient {
|
|||||||
if let Some(v) = voice {
|
if let Some(v) = voice {
|
||||||
body["voice"] = Value::String(v.to_string());
|
body["voice"] = Value::String(v.to_string());
|
||||||
}
|
}
|
||||||
|
if let Some(x) = exaggeration {
|
||||||
|
body["exaggeration"] = json!(x);
|
||||||
|
}
|
||||||
|
if let Some(x) = cfg_weight {
|
||||||
|
body["cfg_weight"] = json!(x);
|
||||||
|
}
|
||||||
|
if let Some(x) = temperature {
|
||||||
|
body["temperature"] = json!(x);
|
||||||
|
}
|
||||||
|
|
||||||
let resp = self
|
let resp = self
|
||||||
.client
|
.client
|
||||||
|
|||||||
+102
-2
@@ -11,9 +11,11 @@ use anyhow::Context;
|
|||||||
use base64::Engine;
|
use base64::Engine;
|
||||||
use bytes::{BufMut, BytesMut};
|
use bytes::{BufMut, BytesMut};
|
||||||
use futures::StreamExt;
|
use futures::StreamExt;
|
||||||
|
use regex::Regex;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use serde_json::json;
|
use serde_json::json;
|
||||||
use std::path::Path;
|
use std::path::Path;
|
||||||
|
use std::sync::LazyLock;
|
||||||
|
|
||||||
use crate::data::Claims;
|
use crate::data::Claims;
|
||||||
use crate::file_types::{is_audio_file, is_video_file};
|
use crate::file_types::{is_audio_file, is_video_file};
|
||||||
@@ -59,6 +61,55 @@ fn default_voice() -> Option<String> {
|
|||||||
.filter(|s| !s.is_empty())
|
.filter(|s| !s.is_empty())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Markdown / formatting strippers, compiled once. Insight text is markdown,
|
||||||
|
// which TTS would otherwise read literally ("star star bold star star").
|
||||||
|
static MD_IMAGE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\([^)]*\)").unwrap());
|
||||||
|
static MD_LINK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\([^)]*\)").unwrap());
|
||||||
|
static MD_HEADING: LazyLock<Regex> =
|
||||||
|
LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}#{1,6}\s*").unwrap());
|
||||||
|
static MD_BLOCKQUOTE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}>\s?").unwrap());
|
||||||
|
static MD_LIST: LazyLock<Regex> =
|
||||||
|
LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}([-*+]|\d+\.)\s+").unwrap());
|
||||||
|
static MD_EMPHASIS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[*_`~]+").unwrap());
|
||||||
|
static URL_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"https?://\S+").unwrap());
|
||||||
|
static MULTISPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[ \t]{2,}").unwrap());
|
||||||
|
static MULTINEWLINE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n{3,}").unwrap());
|
||||||
|
|
||||||
|
/// True for emoji / pictographic symbols, which most TTS models either skip or
/// mispronounce. Covers the main emoji blocks plus dingbats, misc-technical,
/// and the invisible code points that only exist to build emoji sequences:
/// variation selectors, the ZWJ, the keycap combiner (`1️⃣` = `1` + U+FE0F +
/// U+20E3) and the tag characters used by subdivision flags. Without the last
/// two, stripping an emoji would leave stray combining / invisible characters
/// behind in the text. We do NOT strip `[bracketed]` tags — non-turbo
/// Chatterbox ignores them, and a future Turbo switch uses them as
/// paralinguistic cues.
fn is_emoji_like(c: char) -> bool {
    matches!(
        u32::from(c),
        0x1F000..=0x1FAFF       // emoji, pictographs, supplemental symbols, flags
            | 0x2300..=0x23FF   // misc technical (⌚ ⏰ ⏳ …)
            | 0x2600..=0x27BF   // misc symbols + dingbats (★ ☀ ✂ …)
            | 0x2B00..=0x2BFF   // misc symbols & arrows (⭐ ⬆ …)
            | 0xFE00..=0xFE0F   // variation selectors
            | 0x200D            // zero-width joiner
            | 0x20E3            // combining enclosing keycap
            | 0xE0020..=0xE007F // tag characters (subdivision-flag sequences)
    )
}
|
||||||
|
|
||||||
|
/// Normalize insight text for speech: unwrap markdown links/images to their
|
||||||
|
/// visible text, drop heading/list/blockquote/emphasis markers and URLs, strip
|
||||||
|
/// emoji, and collapse whitespace. Centralized here so every caller (app,
|
||||||
|
/// WebUI, curl) gets clean audio.
|
||||||
|
fn clean_for_tts(input: &str) -> String {
|
||||||
|
let s = MD_IMAGE.replace_all(input, "$1");
|
||||||
|
let s = MD_LINK.replace_all(&s, "$1");
|
||||||
|
let s = MD_HEADING.replace_all(&s, "");
|
||||||
|
let s = MD_BLOCKQUOTE.replace_all(&s, "");
|
||||||
|
let s = MD_LIST.replace_all(&s, "");
|
||||||
|
let s = MD_EMPHASIS.replace_all(&s, "");
|
||||||
|
let s = URL_RE.replace_all(&s, " ");
|
||||||
|
let s: String = s.chars().filter(|c| !is_emoji_like(*c)).collect();
|
||||||
|
let s = MULTISPACE.replace_all(&s, " ");
|
||||||
|
let s = MULTINEWLINE.replace_all(&s, "\n\n");
|
||||||
|
s.trim().to_string()
|
||||||
|
}
|
||||||
|
|
||||||
fn guess_audio_mime(path: &Path) -> String {
|
fn guess_audio_mime(path: &Path) -> String {
|
||||||
match path
|
match path
|
||||||
.extension()
|
.extension()
|
||||||
@@ -84,6 +135,15 @@ pub struct TtsSpeechRequest {
|
|||||||
/// Audio container, e.g. `"mp3"` (default) or `"wav"`.
|
/// Audio container, e.g. `"mp3"` (default) or `"wav"`.
|
||||||
#[serde(default)]
|
#[serde(default)]
|
||||||
pub format: Option<String>,
|
pub format: Option<String>,
|
||||||
|
/// Chatterbox knobs (clamped server-side). exaggeration 0.25–2.0 (emotion),
|
||||||
|
/// cfg_weight 0.0–1.0 (pace; ~0.3 for fast speakers, 0 to neutralize a
|
||||||
|
/// reference accent), temperature 0.05–5.0 (randomness).
|
||||||
|
#[serde(default)]
|
||||||
|
pub exaggeration: Option<f32>,
|
||||||
|
#[serde(default)]
|
||||||
|
pub cfg_weight: Option<f32>,
|
||||||
|
#[serde(default)]
|
||||||
|
pub temperature: Option<f32>,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, Serialize)]
|
#[derive(Debug, Serialize)]
|
||||||
@@ -100,7 +160,7 @@ pub async fn tts_speech_handler(
|
|||||||
req: web::Json<TtsSpeechRequest>,
|
req: web::Json<TtsSpeechRequest>,
|
||||||
app_state: web::Data<AppState>,
|
app_state: web::Data<AppState>,
|
||||||
) -> impl Responder {
|
) -> impl Responder {
|
||||||
let text = req.text.trim();
|
let text = clean_for_tts(&req.text);
|
||||||
if text.is_empty() {
|
if text.is_empty() {
|
||||||
return HttpResponse::BadRequest().json(json!({ "error": "text is required" }));
|
return HttpResponse::BadRequest().json(json!({ "error": "text is required" }));
|
||||||
}
|
}
|
||||||
@@ -121,7 +181,15 @@ pub async fn tts_speech_handler(
|
|||||||
.filter(|s| !s.is_empty())
|
.filter(|s| !s.is_empty())
|
||||||
.or(dv.as_deref());
|
.or(dv.as_deref());
|
||||||
|
|
||||||
match client.text_to_speech(text, voice, format).await {
|
// Clamp generation knobs to Chatterbox's documented ranges before forwarding.
|
||||||
|
let exaggeration = req.exaggeration.map(|x| x.clamp(0.25, 2.0));
|
||||||
|
let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0));
|
||||||
|
let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0));
|
||||||
|
|
||||||
|
match client
|
||||||
|
.text_to_speech(&text, voice, format, exaggeration, cfg_weight, temperature)
|
||||||
|
.await
|
||||||
|
{
|
||||||
Ok(bytes) => {
|
Ok(bytes) => {
|
||||||
let audio_base64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
|
let audio_base64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
|
||||||
HttpResponse::Ok().json(TtsSpeechResponse {
|
HttpResponse::Ok().json(TtsSpeechResponse {
|
||||||
@@ -390,4 +458,36 @@ mod tests {
|
|||||||
"application/octet-stream"
|
"application/octet-stream"
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn clean_for_tts_strips_markdown() {
    // (markdown input, expected speech text)
    let cases = [
        // Inline emphasis / code markers.
        ("**Bold** and _italic_ and `code`", "Bold and italic and code"),
        // Heading marker; the paragraph break after it is kept.
        ("# Title\n\nbody", "Title\n\nbody"),
        // Link collapses to its visible text.
        ("See [docs](http://x.com) now", "See docs now"),
        // Bullet markers.
        ("- one\n- two", "one\ntwo"),
    ];
    for (markdown, spoken) in cases {
        assert_eq!(clean_for_tts(markdown), spoken);
    }
}
|
||||||
|
|
||||||
|
#[test]
fn clean_for_tts_strips_emoji_and_urls() {
    // Standalone emoji vanish and the gaps they leave are collapsed.
    assert_eq!(clean_for_tts("Hello 😀 world 🎉"), "Hello world");
    // Bare URLs are dropped rather than spelled out.
    assert_eq!(
        clean_for_tts("visit https://example.com today"),
        "visit today"
    );
    // ZWJ-glued emoji sequence is fully removed. The joiners are written as
    // explicit escapes so the (otherwise invisible) U+200D characters cannot
    // be silently lost by an editor or formatter.
    assert_eq!(
        clean_for_tts("family 👨\u{200D}👩\u{200D}👧 photo"),
        "family photo"
    );
}
|
||||||
|
|
||||||
|
#[test]
fn clean_for_tts_preserves_bracket_tags() {
    // `[tag]` cues must pass through untouched: non-turbo Chatterbox ignores
    // them, and a future Turbo uses them as paralinguistic cues.
    let tagged = "hello [laugh] there";
    assert_eq!(clean_for_tts(tagged), tagged);
}
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user