From 51be5df2148f53014ef5f0161c5f305dc17e0a92 Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Tue, 2 Jun 2026 22:15:05 -0400 Subject: [PATCH] Clean insight text for TTS and pass through Chatterbox tuning knobs /tts/speech now normalizes input before synthesis: unwraps markdown links/images to visible text, drops heading/list/blockquote/emphasis markers and URLs, strips emoji (which non-turbo Chatterbox mispronounces or skips), and collapses whitespace. Centralized in clean_for_tts so the app, WebUI, and curl all get clean audio. Bracketed tags are deliberately preserved for a future Turbo (paralinguistic) switch. Adds optional exaggeration / cfg_weight / temperature to the request, clamped to Chatterbox's documented ranges and forwarded on the speech body. Unit tests cover markdown/emoji/URL stripping and tag preservation. Co-Authored-By: Claude Opus 4.8 (1M context) --- src/ai/llamacpp.rs | 16 +++++++ src/ai/tts.rs | 104 ++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 118 insertions(+), 2 deletions(-) diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs index afd7f1b..2946688 100644 --- a/src/ai/llamacpp.rs +++ b/src/ai/llamacpp.rs @@ -138,11 +138,18 @@ impl LlamaCppClient { /// Synthesize speech for `input` in an optional named `voice`, returning /// the raw audio bytes (format per `response_format`, e.g. `"mp3"`/`"wav"`). + /// + /// Chatterbox generation knobs are forwarded when set (caller is expected + /// to have range-clamped them): `exaggeration` (0.25–2.0, emotion), + /// `cfg_weight` (0.0–1.0, pace), `temperature` (0.05–5.0, randomness). 
pub async fn text_to_speech( &self, input: &str, voice: Option<&str>, response_format: &str, + exaggeration: Option, + cfg_weight: Option, + temperature: Option, ) -> Result> { let url = format!("{}/audio/speech", self.base_url); let mut body = json!({ @@ -153,6 +160,15 @@ impl LlamaCppClient { if let Some(v) = voice { body["voice"] = Value::String(v.to_string()); } + if let Some(x) = exaggeration { + body["exaggeration"] = json!(x); + } + if let Some(x) = cfg_weight { + body["cfg_weight"] = json!(x); + } + if let Some(x) = temperature { + body["temperature"] = json!(x); + } let resp = self .client diff --git a/src/ai/tts.rs b/src/ai/tts.rs index b2bd675..8078132 100644 --- a/src/ai/tts.rs +++ b/src/ai/tts.rs @@ -11,9 +11,11 @@ use anyhow::Context; use base64::Engine; use bytes::{BufMut, BytesMut}; use futures::StreamExt; +use regex::Regex; use serde::{Deserialize, Serialize}; use serde_json::json; use std::path::Path; +use std::sync::LazyLock; use crate::data::Claims; use crate::file_types::{is_audio_file, is_video_file}; @@ -59,6 +61,55 @@ fn default_voice() -> Option { .filter(|s| !s.is_empty()) } +// Markdown / formatting strippers, compiled once. Insight text is markdown, +// which TTS would otherwise read literally ("star star bold star star"). 
+static MD_IMAGE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\([^)]*\)").unwrap()); +static MD_LINK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\([^)]*\)").unwrap()); +static MD_HEADING: LazyLock<Regex> = + LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}#{1,6}\s*").unwrap()); +static MD_BLOCKQUOTE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}>\s?").unwrap()); +static MD_LIST: LazyLock<Regex> = + LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}([-*+]|\d+\.)\s+").unwrap()); +static MD_EMPHASIS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[*_`~]+").unwrap()); +static URL_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"https?://\S+").unwrap()); +static MULTISPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[ \t]{2,}").unwrap()); +static MULTINEWLINE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n{3,}").unwrap()); + +/// True for emoji / pictographic symbols, which most TTS models either skip or +/// mispronounce. Covers the main emoji blocks plus dingbats, misc-technical, +/// variation selectors, and the ZWJ used to glue emoji sequences. We do NOT +/// strip `[bracketed]` tags — non-turbo Chatterbox ignores them, and a future +/// Turbo switch uses them as paralinguistic cues. +fn is_emoji_like(c: char) -> bool { + let u = c as u32; + matches!(u, + 0x1F000..=0x1FAFF // emoji, pictographs, supplemental symbols, flags + | 0x2300..=0x23FF // misc technical (⌚ ⏰ ⏳ …) + | 0x2600..=0x27BF // misc symbols + dingbats + | 0x2B00..=0x2BFF // misc symbols & arrows (⭐ ⬆ …) + | 0xFE00..=0xFE0F // variation selectors + | 0x200D // zero-width joiner + ) +} + +/// Normalize insight text for speech: unwrap markdown links/images to their +/// visible text, drop heading/list/blockquote/emphasis markers and URLs, strip +/// emoji, and collapse whitespace. Centralized here so every caller (app, +/// WebUI, curl) gets clean audio.
+fn clean_for_tts(input: &str) -> String { + let s = MD_IMAGE.replace_all(input, "$1"); + let s = MD_LINK.replace_all(&s, "$1"); + let s = MD_HEADING.replace_all(&s, ""); + let s = MD_BLOCKQUOTE.replace_all(&s, ""); + let s = MD_LIST.replace_all(&s, ""); + let s = MD_EMPHASIS.replace_all(&s, ""); + let s = URL_RE.replace_all(&s, " "); + let s: String = s.chars().filter(|c| !is_emoji_like(*c)).collect(); + let s = MULTISPACE.replace_all(&s, " "); + let s = MULTINEWLINE.replace_all(&s, "\n\n"); + s.trim().to_string() +} + fn guess_audio_mime(path: &Path) -> String { match path .extension() @@ -84,6 +135,15 @@ pub struct TtsSpeechRequest { /// Audio container, e.g. `"mp3"` (default) or `"wav"`. #[serde(default)] pub format: Option, + /// Chatterbox knobs (clamped server-side). exaggeration 0.25–2.0 (emotion), + /// cfg_weight 0.0–1.0 (pace; ~0.3 for fast speakers, 0 to neutralize a + /// reference accent), temperature 0.05–5.0 (randomness). + #[serde(default)] + pub exaggeration: Option, + #[serde(default)] + pub cfg_weight: Option, + #[serde(default)] + pub temperature: Option, } #[derive(Debug, Serialize)] @@ -100,7 +160,7 @@ pub async fn tts_speech_handler( req: web::Json, app_state: web::Data, ) -> impl Responder { - let text = req.text.trim(); + let text = clean_for_tts(&req.text); if text.is_empty() { return HttpResponse::BadRequest().json(json!({ "error": "text is required" })); } @@ -121,7 +181,15 @@ pub async fn tts_speech_handler( .filter(|s| !s.is_empty()) .or(dv.as_deref()); - match client.text_to_speech(text, voice, format).await { + // Clamp generation knobs to Chatterbox's documented ranges before forwarding. 
+ let exaggeration = req.exaggeration.map(|x| x.clamp(0.25, 2.0)); + let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0)); + let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0)); + + match client + .text_to_speech(&text, voice, format, exaggeration, cfg_weight, temperature) + .await + { Ok(bytes) => { let audio_base64 = base64::engine::general_purpose::STANDARD.encode(&bytes); HttpResponse::Ok().json(TtsSpeechResponse { @@ -390,4 +458,36 @@ mod tests { "application/octet-stream" ); } + + #[test] + fn clean_for_tts_strips_markdown() { + assert_eq!( + clean_for_tts("**Bold** and _italic_ and `code`"), + "Bold and italic and code" + ); + assert_eq!(clean_for_tts("# Title\n\nbody"), "Title\n\nbody"); + assert_eq!( + clean_for_tts("See [docs](http://x.com) now"), + "See docs now" + ); + assert_eq!(clean_for_tts("- one\n- two"), "one\ntwo"); + } + + #[test] + fn clean_for_tts_strips_emoji_and_urls() { + assert_eq!(clean_for_tts("Hello 😀 world 🎉"), "Hello world"); + assert_eq!( + clean_for_tts("visit https://example.com today"), + "visit today" + ); + // ZWJ-glued emoji sequence is fully removed. + assert_eq!(clean_for_tts("family 👨‍👩‍👧 photo"), "family photo"); + } + + #[test] + fn clean_for_tts_preserves_bracket_tags() { + // Non-turbo Chatterbox ignores these; a future Turbo uses them as + // paralinguistic cues — so we must not strip them. + assert_eq!(clean_for_tts("hello [laugh] there"), "hello [laugh] there"); + } }