Clean insight text for TTS and pass through Chatterbox tuning knobs

/tts/speech now normalizes input before synthesis: unwraps markdown
links/images to visible text, drops heading/list/blockquote/emphasis
markers and URLs, strips emoji (which non-turbo Chatterbox mispronounces
or skips), and collapses whitespace. Centralized in clean_for_tts so the
app, WebUI, and curl all get clean audio. Bracketed tags are deliberately
preserved for a future Turbo (paralinguistic) switch.

Adds optional exaggeration / cfg_weight / temperature to the request,
clamped to Chatterbox's documented ranges and forwarded on the speech
body. Unit tests cover markdown/emoji/URL stripping and tag preservation.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
Cameron Cordes
2026-06-02 22:15:05 -04:00
parent 69268d03fe
commit 51be5df214
2 changed files with 118 additions and 2 deletions
+16
View File
@@ -138,11 +138,18 @@ impl LlamaCppClient {
/// Synthesize speech for `input` in an optional named `voice`, returning
/// the raw audio bytes (format per `response_format`, e.g. `"mp3"`/`"wav"`).
///
/// Chatterbox generation knobs are forwarded when set (caller is expected
/// to have range-clamped them): `exaggeration` (0.25–2.0, emotion),
/// `cfg_weight` (0.0–1.0, pace), `temperature` (0.05–5.0, randomness).
pub async fn text_to_speech(
&self,
input: &str,
voice: Option<&str>,
response_format: &str,
exaggeration: Option<f32>,
cfg_weight: Option<f32>,
temperature: Option<f32>,
) -> Result<Vec<u8>> {
let url = format!("{}/audio/speech", self.base_url);
let mut body = json!({
@@ -153,6 +160,15 @@ impl LlamaCppClient {
if let Some(v) = voice {
body["voice"] = Value::String(v.to_string());
}
if let Some(x) = exaggeration {
body["exaggeration"] = json!(x);
}
if let Some(x) = cfg_weight {
body["cfg_weight"] = json!(x);
}
if let Some(x) = temperature {
body["temperature"] = json!(x);
}
let resp = self
.client
+102 -2
View File
@@ -11,9 +11,11 @@ use anyhow::Context;
use base64::Engine;
use bytes::{BufMut, BytesMut};
use futures::StreamExt;
use regex::Regex;
use serde::{Deserialize, Serialize};
use serde_json::json;
use std::path::Path;
use std::sync::LazyLock;
use crate::data::Claims;
use crate::file_types::{is_audio_file, is_video_file};
@@ -59,6 +61,55 @@ fn default_voice() -> Option<String> {
.filter(|s| !s.is_empty())
}
// Markdown / formatting strippers, compiled once on first use. Insight text
// is markdown, which TTS would otherwise read literally ("star star bold star
// star").
//
// The line-anchored patterns (heading, blockquote, list) deliberately use
// `[ \t]` instead of `\s` for indentation and padding. `\s` also matches
// `\n`, so with `(?m)` a marker that follows a blank line would match starting
// on that blank line and delete it (e.g. "para\n\n# Title" -> "para\nTitle"),
// silently merging paragraphs. Restricting to spaces/tabs keeps every match on
// a single line.
//
// All patterns are literals, so `unwrap()` can only panic on a programming
// error in the pattern itself.

/// `![alt](src)` -> capture group 1 is the alt text.
static MD_IMAGE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\([^)]*\)").unwrap());
/// `[text](href)` -> capture group 1 is the visible link text.
static MD_LINK: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\([^)]*\)").unwrap());
/// Leading `#`..`######` heading marker (up to 3 columns of indentation).
static MD_HEADING: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?m)^[ \t]{0,3}#{1,6}[ \t]*").unwrap());
/// Leading `>` blockquote marker plus at most one following space/tab.
static MD_BLOCKQUOTE: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?m)^[ \t]{0,3}>[ \t]?").unwrap());
/// Leading bullet (`-`, `*`, `+`) or ordered (`1.`) list marker.
static MD_LIST: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?m)^[ \t]{0,3}([-*+]|\d+\.)[ \t]+").unwrap());
/// Runs of inline emphasis / code / strikethrough markers.
static MD_EMPHASIS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[*_`~]+").unwrap());
/// Bare http(s) URLs, up to the next whitespace.
static URL_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"https?://\S+").unwrap());
/// Two or more consecutive spaces/tabs.
static MULTISPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[ \t]{2,}").unwrap());
/// Three or more consecutive newlines (i.e. more than one blank line).
static MULTINEWLINE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n{3,}").unwrap());
/// Returns `true` for emoji / pictographic characters, which most TTS models
/// either skip or mispronounce.
///
/// Besides the main emoji blocks, dingbats and misc-technical symbols, this
/// also matches the invisible "glue" code points that make up multi-code-point
/// emoji sequences, so that stripping leaves no residue behind:
/// variation selectors, the zero-width joiner, the combining enclosing keycap
/// (so `1️⃣` reduces to a plain `1`), and the tag characters used by
/// subdivision flags.
///
/// `[bracketed]` tags are NOT matched — non-turbo Chatterbox ignores them, and
/// a future Turbo switch uses them as paralinguistic cues.
fn is_emoji_like(c: char) -> bool {
    matches!(
        c,
        '\u{1F000}'..='\u{1FAFF}' // emoji, pictographs, supplemental symbols, flags
            | '\u{2300}'..='\u{23FF}' // misc technical (⌚ ⏰ ⏳ …)
            | '\u{2600}'..='\u{27BF}' // misc symbols + dingbats (★ ☀ ✂ …)
            | '\u{2B00}'..='\u{2BFF}' // misc symbols & arrows (⭐ ⬆ …)
            | '\u{FE00}'..='\u{FE0F}' // variation selectors
            | '\u{200D}' // zero-width joiner
            | '\u{20E3}' // combining enclosing keycap (keycap sequences)
            | '\u{E0020}'..='\u{E007F}' // tag characters (subdivision flag sequences)
    )
}
/// Normalize insight text for speech: unwrap markdown links/images to their
/// visible text, drop heading/list/blockquote/emphasis markers and URLs, strip
/// emoji, and collapse whitespace. Centralized here so every caller (app,
/// WebUI, curl) gets clean audio.
fn clean_for_tts(input: &str) -> String {
let s = MD_IMAGE.replace_all(input, "$1");
let s = MD_LINK.replace_all(&s, "$1");
let s = MD_HEADING.replace_all(&s, "");
let s = MD_BLOCKQUOTE.replace_all(&s, "");
let s = MD_LIST.replace_all(&s, "");
let s = MD_EMPHASIS.replace_all(&s, "");
let s = URL_RE.replace_all(&s, " ");
let s: String = s.chars().filter(|c| !is_emoji_like(*c)).collect();
let s = MULTISPACE.replace_all(&s, " ");
let s = MULTINEWLINE.replace_all(&s, "\n\n");
s.trim().to_string()
}
fn guess_audio_mime(path: &Path) -> String {
match path
.extension()
@@ -84,6 +135,15 @@ pub struct TtsSpeechRequest {
/// Audio container, e.g. `"mp3"` (default) or `"wav"`.
#[serde(default)]
pub format: Option<String>,
/// Chatterbox knobs (clamped server-side). exaggeration 0.25–2.0 (emotion),
/// cfg_weight 0.0–1.0 (pace; ~0.3 for fast speakers, 0 to neutralize a
/// reference accent), temperature 0.05–5.0 (randomness).
#[serde(default)]
pub exaggeration: Option<f32>,
#[serde(default)]
pub cfg_weight: Option<f32>,
#[serde(default)]
pub temperature: Option<f32>,
}
#[derive(Debug, Serialize)]
@@ -100,7 +160,7 @@ pub async fn tts_speech_handler(
req: web::Json<TtsSpeechRequest>,
app_state: web::Data<AppState>,
) -> impl Responder {
let text = req.text.trim();
let text = clean_for_tts(&req.text);
if text.is_empty() {
return HttpResponse::BadRequest().json(json!({ "error": "text is required" }));
}
@@ -121,7 +181,15 @@ pub async fn tts_speech_handler(
.filter(|s| !s.is_empty())
.or(dv.as_deref());
match client.text_to_speech(text, voice, format).await {
// Clamp generation knobs to Chatterbox's documented ranges before forwarding.
let exaggeration = req.exaggeration.map(|x| x.clamp(0.25, 2.0));
let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0));
let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0));
match client
.text_to_speech(&text, voice, format, exaggeration, cfg_weight, temperature)
.await
{
Ok(bytes) => {
let audio_base64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
HttpResponse::Ok().json(TtsSpeechResponse {
@@ -390,4 +458,36 @@ mod tests {
"application/octet-stream"
);
}
/// Emphasis, heading, link and list markup are removed while the visible
/// text (and paragraph/line breaks) survive.
#[test]
fn clean_for_tts_strips_markdown() {
    // (markdown input, expected spoken text)
    let cases = [
        ("**Bold** and _italic_ and `code`", "Bold and italic and code"),
        ("# Title\n\nbody", "Title\n\nbody"),
        ("See [docs](http://x.com) now", "See docs now"),
        ("- one\n- two", "one\ntwo"),
    ];
    for (markdown, spoken) in cases {
        assert_eq!(clean_for_tts(markdown), spoken);
    }
}
/// Emoji and bare URLs are dropped and the gaps they leave are collapsed.
#[test]
fn clean_for_tts_strips_emoji_and_urls() {
    let cases = [
        ("Hello 😀 world 🎉", "Hello world"),
        ("visit https://example.com today", "visit today"),
        // ZWJ-glued emoji sequence (man + ZWJ + woman + ZWJ + girl) is fully
        // removed; the invisible joiners are spelled out as escapes.
        (
            "family \u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467} photo",
            "family photo",
        ),
    ];
    for (raw, spoken) in cases {
        assert_eq!(clean_for_tts(raw), spoken);
    }
}
/// Bracketed tags must pass through untouched: non-turbo Chatterbox ignores
/// them, and a future Turbo switch uses them as paralinguistic cues.
#[test]
fn clean_for_tts_preserves_bracket_tags() {
    let tagged = "hello [laugh] there";
    let cleaned = clean_for_tts(tagged);
    assert_eq!(cleaned, "hello [laugh] there");
}
}