2026-06-07 21:35:50 +00:00
2 changed files with 118 additions and 2 deletions
@@ -138,11 +138,18 @@ impl LlamaCppClient {
    /// Synthesize speech for `input` in an optional named `voice`, returning
    /// the raw audio bytes (format per `response_format`, e.g. `"mp3"`/`"wav"`).
    ///
    /// Chatterbox generation knobs are forwarded when set (caller is expected
    /// to have range-clamped them): `exaggeration` (0.25–2.0, emotion),
    /// `cfg_weight` (0.0–1.0, pace), `temperature` (0.05–5.0, randomness).
    pub async fn text_to_speech(
        &self,
        input: &str,
        voice: Option<&str>,
        response_format: &str,
        exaggeration: Option<f32>,
        cfg_weight: Option<f32>,
        temperature: Option<f32>,
    ) -> Result<Vec<u8>> {
        let url = format!("{}/audio/speech", self.base_url);
        let mut body = json!({
@@ -153,6 +160,15 @@ impl LlamaCppClient {
        if let Some(v) = voice {
            body["voice"] = Value::String(v.to_string());
        }
        if let Some(x) = exaggeration {
            body["exaggeration"] = json!(x);
        }
        if let Some(x) = cfg_weight {
            body["cfg_weight"] = json!(x);
        }
        if let Some(x) = temperature {
            body["temperature"] = json!(x);
        }
        let resp = self
            .client
@@ -11,9 +11,11 @@ use anyhow::Context;
 use base64::Engine;
 use bytes::{BufMut, BytesMut};
 use futures::StreamExt;
 use regex::Regex;
 use serde::{Deserialize, Serialize};
 use serde_json::json;
 use std::path::Path;
 use std::sync::LazyLock;
 use crate::data::Claims;
 use crate::file_types::{is_audio_file, is_video_file};
@@ -59,6 +61,55 @@ fn default_voice() -> Option<String> {
        .filter(|s| !s.is_empty())
 }
 // Markdown / formatting strippers, compiled once. Insight text is markdown,
 // which TTS would otherwise read literally ("star star bold star star").
 //
 // The line-anchored patterns (heading, blockquote, list) deliberately use
 // `[ \t]` rather than `\s`: `\s` also matches `\n`, so under `(?m)` a match
 // could begin on the blank line *before* a marker (or run past the end of the
 // marker's line) and silently swallow paragraph breaks.

 // `![alt](target)` -> capture group 1 is the alt text.
 static MD_IMAGE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\([^)]*\)").unwrap());
 // `[label](target)` -> capture group 1 is the visible label.
 static MD_LINK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\([^)]*\)").unwrap());
 // Leading `#`..`######` heading marker (up to 3 columns of indent).
 static MD_HEADING: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?m)^[ \t]{0,3}#{1,6}[ \t]*").unwrap());
 // Leading `>` blockquote marker plus at most one following blank.
 static MD_BLOCKQUOTE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?m)^[ \t]{0,3}>[ \t]?").unwrap());
 // Leading bullet (`-`, `*`, `+`) or ordered (`1.`) list marker. The marker
 // must be followed by blanks or sit alone on its line (empty item).
 static MD_LIST: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?m)^[ \t]{0,3}([-*+]|\d+\.)(?:[ \t]+|$)").unwrap());
 // Any run of inline emphasis / code / strikethrough marker characters.
 static MD_EMPHASIS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[*_`~]+").unwrap());
 // Bare http(s) URLs, up to the next whitespace.
 static URL_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"https?://\S+").unwrap());
 // Two or more consecutive spaces / tabs.
 static MULTISPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[ \t]{2,}").unwrap());
 // Three or more consecutive newlines.
 static MULTINEWLINE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n{3,}").unwrap());
 /// True for emoji / pictographic symbols, which most TTS models either skip or
 /// mispronounce, and for the invisible code points that only exist to build
 /// emoji sequences.
 ///
 /// Covers the main emoji blocks plus dingbats and misc-technical, and the
 /// sequence "glue": variation selectors, the zero-width joiner, the combining
 /// enclosing keycap (so `1️⃣` reduces to a plain `1` rather than `1` followed by
 /// a stray combining mark), and tag characters (used by subdivision flags).
 ///
 /// We do NOT strip `[bracketed]` tags — non-turbo Chatterbox ignores them, and
 /// a future Turbo switch uses them as paralinguistic cues.
 fn is_emoji_like(c: char) -> bool {
    matches!(
        c,
        '\u{1F000}'..='\u{1FAFF}'     // emoji, pictographs, supplemental symbols, flags
        | '\u{2300}'..='\u{23FF}'     // misc technical (⌚ ⏰ ⏳ …)
        | '\u{2600}'..='\u{27BF}'     // misc symbols + dingbats (★ ✂ …)
        | '\u{2B00}'..='\u{2BFF}'     // misc symbols & arrows (⭐ ⬆ …)
        | '\u{FE00}'..='\u{FE0F}'     // variation selectors
        | '\u{200D}'                  // zero-width joiner
        | '\u{20E3}'                  // combining enclosing keycap (keycap sequences)
        | '\u{E0020}'..='\u{E007F}'   // tag characters (emoji tag sequences)
    )
 }
 /// Normalize insight text for speech: unwrap markdown links/images to their
 /// visible text, drop heading/list/blockquote/emphasis markers and URLs, strip
 /// emoji, and collapse whitespace. Centralized here so every caller (app,
 /// WebUI, curl) gets clean audio.
 fn clean_for_tts(input: &str) -> String {
    let s = MD_IMAGE.replace_all(input, "$1");
    let s = MD_LINK.replace_all(&s, "$1");
    let s = MD_HEADING.replace_all(&s, "");
    let s = MD_BLOCKQUOTE.replace_all(&s, "");
    let s = MD_LIST.replace_all(&s, "");
    let s = MD_EMPHASIS.replace_all(&s, "");
    let s = URL_RE.replace_all(&s, " ");
    let s: String = s.chars().filter(|c| !is_emoji_like(*c)).collect();
    let s = MULTISPACE.replace_all(&s, " ");
    let s = MULTINEWLINE.replace_all(&s, "\n\n");
    s.trim().to_string()
 }
 fn guess_audio_mime(path: &Path) -> String {
    match path
        .extension()
@@ -84,6 +135,15 @@ pub struct TtsSpeechRequest {
    /// Audio container, e.g. `"mp3"` (default) or `"wav"`.
    #[serde(default)]
    pub format: Option<String>,
    /// Chatterbox knobs (clamped server-side). exaggeration 0.25–2.0 (emotion),
    /// cfg_weight 0.0–1.0 (pace; ~0.3 for fast speakers, 0 to neutralize a
    /// reference accent), temperature 0.05–5.0 (randomness).
    #[serde(default)]
    pub exaggeration: Option<f32>,
    #[serde(default)]
    pub cfg_weight: Option<f32>,
    #[serde(default)]
    pub temperature: Option<f32>,
 }
 #[derive(Debug, Serialize)]
@@ -100,7 +160,7 @@ pub async fn tts_speech_handler(
    req: web::Json<TtsSpeechRequest>,
    app_state: web::Data<AppState>,
 ) -> impl Responder {
-    let text = req.text.trim();
+    let text = clean_for_tts(&req.text);
    if text.is_empty() {
        return HttpResponse::BadRequest().json(json!({ "error": "text is required" }));
    }
@@ -121,7 +181,15 @@ pub async fn tts_speech_handler(
        .filter(|s| !s.is_empty())
        .or(dv.as_deref());
-    match client.text_to_speech(text, voice, format).await {
+    // Clamp generation knobs to Chatterbox's documented ranges before forwarding.
    let exaggeration = req.exaggeration.map(|x| x.clamp(0.25, 2.0));
    let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0));
    let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0));
    match client
        .text_to_speech(&text, voice, format, exaggeration, cfg_weight, temperature)
        .await
    {
        Ok(bytes) => {
            let audio_base64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
            HttpResponse::Ok().json(TtsSpeechResponse {
@@ -390,4 +458,36 @@ mod tests {
            "application/octet-stream"
        );
    }
    #[test]
    fn clean_for_tts_strips_markdown() {
        // (markdown input, text expected to reach the TTS backend)
        let cases = [
            ("**Bold** and _italic_ and `code`", "Bold and italic and code"),
            ("# Title\n\nbody", "Title\n\nbody"),
            ("See [docs](http://x.com) now", "See docs now"),
            ("- one\n- two", "one\ntwo"),
        ];
        for (markdown, spoken) in cases {
            assert_eq!(clean_for_tts(markdown), spoken, "input: {markdown:?}");
        }
    }
    #[test]
    fn clean_for_tts_strips_emoji_and_urls() {
        // (raw input, text expected to reach the TTS backend)
        let cases = [
            ("Hello 😀 world 🎉", "Hello world"),
            ("visit https://example.com today", "visit today"),
            // ZWJ-glued emoji sequence is fully removed.
            ("family 👨‍👩‍👧 photo", "family photo"),
        ];
        for (raw, spoken) in cases {
            assert_eq!(clean_for_tts(raw), spoken, "input: {raw:?}");
        }
    }
    #[test]
    fn clean_for_tts_preserves_bracket_tags() {
        // `[bracketed]` cues must pass through untouched: non-turbo Chatterbox
        // ignores them, and a future Turbo uses them as paralinguistic cues.
        let tagged = "hello [laugh] there";
        assert_eq!(clean_for_tts(tagged), tagged);
    }
 }