Clean insight text for TTS and pass through Chatterbox tuning knobs

/tts/speech now normalizes input before synthesis: unwraps markdown links/images to visible text, drops heading/list/blockquote/emphasis markers and URLs, strips emoji (which non-turbo Chatterbox mispronounces or skips), and collapses whitespace. Centralized in clean_for_tts so the app, WebUI, and curl all get clean audio. Bracketed tags are deliberately preserved for a future Turbo (paralinguistic) switch. Adds optional exaggeration / cfg_weight / temperature to the request, clamped to Chatterbox's documented ranges and forwarded on the speech body. Unit tests cover markdown/emoji/URL stripping and tag preservation. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-02 22:15:05 -04:00
parent 69268d03fe
commit 51be5df214
2 changed files with 118 additions and 2 deletions
@@ -11,9 +11,11 @@ use anyhow::Context;
 use base64::Engine;
 use bytes::{BufMut, BytesMut};
 use futures::StreamExt;
+use regex::Regex;
 use serde::{Deserialize, Serialize};
 use serde_json::json;
 use std::path::Path;
+use std::sync::LazyLock;

 use crate::data::Claims;
 use crate::file_types::{is_audio_file, is_video_file};
@@ -59,6 +61,55 @@ fn default_voice() -> Option<String> {
        .filter(|s| !s.is_empty())
 }

+// Markdown / formatting strippers, compiled once on first use. Insight text is
+// markdown, which TTS would otherwise read literally ("star star bold star
+// star"). Every pattern is a fixed literal, so `unwrap()` can only fail on a
+// programming error, never on request input.
+//
+// The line-anchored patterns (heading, blockquote, list) use `[ \t]` rather
+// than `\s`: `\s` also matches `\n`, which lets a match begin on the blank line
+// *before* a marker (or run past the end of an empty marker line) and swallow
+// the paragraph break along with the marker.
+
+// `![alt](src)`; capture 1 is the alt text.
+static MD_IMAGE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\([^)]*\)").unwrap());
+// `[text](target)`; capture 1 is the visible text.
+static MD_LINK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\([^)]*\)").unwrap());
+// Leading `#`..`######` heading marker, indented by at most three columns.
+static MD_HEADING: LazyLock<Regex> =
+    LazyLock::new(|| Regex::new(r"(?m)^[ \t]{0,3}#{1,6}[ \t]*").unwrap());
+// Leading `>` blockquote marker and the single space that may follow it.
+static MD_BLOCKQUOTE: LazyLock<Regex> =
+    LazyLock::new(|| Regex::new(r"(?m)^[ \t]{0,3}>[ \t]?").unwrap());
+// Leading bullet (`-`, `*`, `+`) or ordered (`1.`) list marker.
+static MD_LIST: LazyLock<Regex> =
+    LazyLock::new(|| Regex::new(r"(?m)^[ \t]{0,3}([-*+]|\d+\.)[ \t]+").unwrap());
+// Any run of emphasis / code / strikethrough marker characters.
+static MD_EMPHASIS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[*_`~]+").unwrap());
+// Bare http(s) URLs up to the next whitespace.
+static URL_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"https?://\S+").unwrap());
+// Runs of two or more spaces/tabs (newlines are handled separately).
+static MULTISPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[ \t]{2,}").unwrap());
+// Three or more consecutive newlines, i.e. more than one blank line.
+static MULTINEWLINE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n{3,}").unwrap());
+
+/// True for emoji / pictographic symbols, which most TTS models either skip or
+/// mispronounce.
+///
+/// Covers the main emoji blocks plus dingbats and misc-technical, together
+/// with the invisible code points that glue emoji sequences together
+/// (variation selectors, the ZWJ, the keycap combining mark, and tag
+/// characters), so stripping a sequence leaves no combining or zero-width
+/// residue behind.
+///
+/// This is a deliberately coarse, block-level filter: a few non-emoji symbols
+/// inside these ranges (e.g. `✓`, `⌘`) are reported as emoji-like too.
+///
+/// We do NOT strip `[bracketed]` tags — they are plain ASCII and never match
+/// here. Non-turbo Chatterbox ignores them, and a future Turbo switch uses
+/// them as paralinguistic cues.
+fn is_emoji_like(c: char) -> bool {
+    matches!(
+        c,
+        '\u{1F000}'..='\u{1FAFF}'     // emoji, pictographs, supplemental symbols, flags
+        | '\u{2300}'..='\u{23FF}'     // misc technical (⌚ ⏰ ⏳ …)
+        | '\u{2600}'..='\u{27BF}'     // misc symbols + dingbats (★ ☀ ✂ …)
+        | '\u{2B00}'..='\u{2BFF}'     // misc symbols & arrows (⭐ ⬆ …)
+        | '\u{FE00}'..='\u{FE0F}'     // variation selectors
+        | '\u{200D}'                  // zero-width joiner
+        | '\u{20E3}'                  // combining enclosing keycap (digit + U+FE0F + U+20E3)
+        | '\u{E0020}'..='\u{E007F}'   // tag characters used by subdivision-flag sequences
+    )
+}
+
+/// Prepare insight text for speech synthesis.
+///
+/// Markdown links and images are reduced to their visible text; heading,
+/// list, blockquote and emphasis markers are dropped; bare URLs and emoji are
+/// removed; runs of blanks and of blank lines are collapsed; and the result is
+/// trimmed. Doing this in one place means every caller (app, WebUI, curl)
+/// gets the same clean audio.
+fn clean_for_tts(input: &str) -> String {
+    // Ordered (pattern, replacement) passes. Images go before links because
+    // the link pattern would otherwise match the `[alt](src)` tail of an
+    // image; URLs go after link unwrapping, once link targets are gone.
+    let passes = [
+        (&*MD_IMAGE, "$1"),
+        (&*MD_LINK, "$1"),
+        (&*MD_HEADING, ""),
+        (&*MD_BLOCKQUOTE, ""),
+        (&*MD_LIST, ""),
+        (&*MD_EMPHASIS, ""),
+        (&*URL_RE, " "),
+    ];
+    let mut text = passes
+        .into_iter()
+        .fold(input.to_owned(), |acc, (pattern, replacement)| {
+            pattern.replace_all(&acc, replacement).into_owned()
+        });
+
+    // Emoji removal happens before whitespace collapsing so the gaps it
+    // leaves behind are squeezed too.
+    text.retain(|c| !is_emoji_like(c));
+
+    let text = MULTISPACE.replace_all(&text, " ");
+    let text = MULTINEWLINE.replace_all(&text, "\n\n");
+    text.trim().to_owned()
+}
+
 fn guess_audio_mime(path: &Path) -> String {
    match path
        .extension()
@@ -84,6 +135,15 @@ pub struct TtsSpeechRequest {
    /// Audio container, e.g. `"mp3"` (default) or `"wav"`.
    #[serde(default)]
    pub format: Option<String>,
+    /// Chatterbox knobs (clamped server-side). exaggeration 0.25–2.0 (emotion),
+    /// cfg_weight 0.0–1.0 (pace; ~0.3 for fast speakers, 0 to neutralize a
+    /// reference accent), temperature 0.05–5.0 (randomness).
+    #[serde(default)]
+    pub exaggeration: Option<f32>,
+    #[serde(default)]
+    pub cfg_weight: Option<f32>,
+    #[serde(default)]
+    pub temperature: Option<f32>,
 }

 #[derive(Debug, Serialize)]
@@ -100,7 +160,7 @@ pub async fn tts_speech_handler(
    req: web::Json<TtsSpeechRequest>,
    app_state: web::Data<AppState>,
 ) -> impl Responder {
-    let text = req.text.trim();
+    let text = clean_for_tts(&req.text);
    if text.is_empty() {
        return HttpResponse::BadRequest().json(json!({ "error": "text is required" }));
    }
@@ -121,7 +181,15 @@ pub async fn tts_speech_handler(
        .filter(|s| !s.is_empty())
        .or(dv.as_deref());

-    match client.text_to_speech(text, voice, format).await {
+    // Clamp generation knobs to Chatterbox's documented ranges before forwarding.
+    let exaggeration = req.exaggeration.map(|x| x.clamp(0.25, 2.0));
+    let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0));
+    let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0));
+
+    match client
+        .text_to_speech(&text, voice, format, exaggeration, cfg_weight, temperature)
+        .await
+    {
        Ok(bytes) => {
            let audio_base64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
            HttpResponse::Ok().json(TtsSpeechResponse {
@@ -390,4 +458,36 @@ mod tests {
            "application/octet-stream"
        );
    }
+
+    #[test]
+    fn clean_for_tts_strips_markdown() {
+        // (markdown input, expected spoken text)
+        let cases = [
+            // Emphasis and inline-code markers vanish, words stay.
+            (
+                "**Bold** and _italic_ and `code`",
+                "Bold and italic and code",
+            ),
+            // Heading marker is dropped; the paragraph break after it is kept.
+            ("# Title\n\nbody", "Title\n\nbody"),
+            // Links collapse to their visible text.
+            ("See [docs](http://x.com) now", "See docs now"),
+            // Bullet markers are removed line by line.
+            ("- one\n- two", "one\ntwo"),
+        ];
+        for (markdown, spoken) in cases {
+            assert_eq!(clean_for_tts(markdown), spoken);
+        }
+    }
+
+    #[test]
+    fn clean_for_tts_strips_emoji_and_urls() {
+        // (raw input, expected spoken text)
+        let cases = [
+            // Standalone emoji disappear and the gaps they leave collapse.
+            ("Hello 😀 world 🎉", "Hello world"),
+            // Bare URLs are dropped entirely.
+            ("visit https://example.com today", "visit today"),
+            // ZWJ-glued emoji sequence is fully removed.
+            ("family 👨‍👩‍👧 photo", "family photo"),
+        ];
+        for (raw, spoken) in cases {
+            assert_eq!(clean_for_tts(raw), spoken);
+        }
+    }
+
+    #[test]
+    fn clean_for_tts_preserves_bracket_tags() {
+        // Bracketed tags must pass through unchanged: non-turbo Chatterbox
+        // ignores them, and a future Turbo uses them as paralinguistic cues.
+        let tagged = "hello [laugh] there";
+        assert_eq!(clean_for_tts(tagged), tagged);
+    }
 }