Clean insight text for TTS and pass through Chatterbox tuning knobs
/tts/speech now normalizes input before synthesis: unwraps markdown links/images to visible text, drops heading/list/blockquote/emphasis markers and URLs, strips emoji (which non-turbo Chatterbox mispronounces or skips), and collapses whitespace. Centralized in clean_for_tts so the app, WebUI, and curl all get clean audio. Bracketed tags are deliberately preserved for a future Turbo (paralinguistic) switch. Adds optional exaggeration / cfg_weight / temperature to the request, clamped to Chatterbox's documented ranges and forwarded on the speech body. Unit tests cover markdown/emoji/URL stripping and tag preservation. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+102
-2
@@ -11,9 +11,11 @@ use anyhow::Context;
|
||||
use base64::Engine;
|
||||
use bytes::{BufMut, BytesMut};
|
||||
use futures::StreamExt;
|
||||
use regex::Regex;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_json::json;
|
||||
use std::path::Path;
|
||||
use std::sync::LazyLock;
|
||||
|
||||
use crate::data::Claims;
|
||||
use crate::file_types::{is_audio_file, is_video_file};
|
||||
@@ -59,6 +61,55 @@ fn default_voice() -> Option<String> {
|
||||
.filter(|s| !s.is_empty())
|
||||
}
|
||||
|
||||
// Markdown / formatting strippers, compiled once. Insight text is markdown,
|
||||
// which TTS would otherwise read literally ("star star bold star star").
|
||||
static MD_IMAGE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\([^)]*\)").unwrap());
|
||||
static MD_LINK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\([^)]*\)").unwrap());
|
||||
static MD_HEADING: LazyLock<Regex> =
|
||||
LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}#{1,6}\s*").unwrap());
|
||||
static MD_BLOCKQUOTE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}>\s?").unwrap());
|
||||
static MD_LIST: LazyLock<Regex> =
|
||||
LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}([-*+]|\d+\.)\s+").unwrap());
|
||||
static MD_EMPHASIS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[*_`~]+").unwrap());
|
||||
static URL_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"https?://\S+").unwrap());
|
||||
static MULTISPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[ \t]{2,}").unwrap());
|
||||
static MULTINEWLINE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n{3,}").unwrap());
|
||||
|
||||
/// Returns `true` for emoji / pictographic code points, which most TTS models
/// either skip or mispronounce.
///
/// Covers the main emoji blocks plus misc-technical, misc symbols, dingbats and
/// the arrows/stars block, together with the invisible or combining code points
/// that only exist to build emoji sequences: variation selectors, the
/// zero-width joiner, the combining enclosing keycap (so `1` + VS16 + keycap
/// collapses to a plain `1`), and the tag characters used by subdivision-flag
/// sequences. Without the last two, stripping the visible emoji would leave a
/// dangling combining mark or a run of invisible characters in the text.
///
/// ASCII is never matched, so `[bracketed]` tags pass through untouched —
/// non-turbo Chatterbox ignores them, and a future Turbo switch uses them as
/// paralinguistic cues.
fn is_emoji_like(c: char) -> bool {
    matches!(
        u32::from(c),
        0x1F000..=0x1FAFF       // emoji, pictographs, supplemental symbols, flags
            | 0x2300..=0x23FF   // misc technical (⌚ ⏰ ⏳ …)
            | 0x2600..=0x27BF   // misc symbols + dingbats (★ ☀ ✂ …)
            | 0x2B00..=0x2BFF   // misc symbols & arrows (⬆ ⭐ …)
            | 0xFE00..=0xFE0F   // variation selectors
            | 0x200D            // zero-width joiner
            | 0x20E3            // combining enclosing keycap
            | 0xE0020..=0xE007F // tag characters (subdivision flags) + cancel tag
    )
}
|
||||
|
||||
/// Normalize insight text for speech: unwrap markdown links/images to their
|
||||
/// visible text, drop heading/list/blockquote/emphasis markers and URLs, strip
|
||||
/// emoji, and collapse whitespace. Centralized here so every caller (app,
|
||||
/// WebUI, curl) gets clean audio.
|
||||
fn clean_for_tts(input: &str) -> String {
|
||||
let s = MD_IMAGE.replace_all(input, "$1");
|
||||
let s = MD_LINK.replace_all(&s, "$1");
|
||||
let s = MD_HEADING.replace_all(&s, "");
|
||||
let s = MD_BLOCKQUOTE.replace_all(&s, "");
|
||||
let s = MD_LIST.replace_all(&s, "");
|
||||
let s = MD_EMPHASIS.replace_all(&s, "");
|
||||
let s = URL_RE.replace_all(&s, " ");
|
||||
let s: String = s.chars().filter(|c| !is_emoji_like(*c)).collect();
|
||||
let s = MULTISPACE.replace_all(&s, " ");
|
||||
let s = MULTINEWLINE.replace_all(&s, "\n\n");
|
||||
s.trim().to_string()
|
||||
}
|
||||
|
||||
fn guess_audio_mime(path: &Path) -> String {
|
||||
match path
|
||||
.extension()
|
||||
@@ -84,6 +135,15 @@ pub struct TtsSpeechRequest {
|
||||
/// Audio container, e.g. `"mp3"` (default) or `"wav"`.
|
||||
#[serde(default)]
|
||||
pub format: Option<String>,
|
||||
/// Chatterbox knobs (clamped server-side). exaggeration 0.25–2.0 (emotion),
|
||||
/// cfg_weight 0.0–1.0 (pace; ~0.3 for fast speakers, 0 to neutralize a
|
||||
/// reference accent), temperature 0.05–5.0 (randomness).
|
||||
#[serde(default)]
|
||||
pub exaggeration: Option<f32>,
|
||||
#[serde(default)]
|
||||
pub cfg_weight: Option<f32>,
|
||||
#[serde(default)]
|
||||
pub temperature: Option<f32>,
|
||||
}
|
||||
|
||||
#[derive(Debug, Serialize)]
|
||||
@@ -100,7 +160,7 @@ pub async fn tts_speech_handler(
|
||||
req: web::Json<TtsSpeechRequest>,
|
||||
app_state: web::Data<AppState>,
|
||||
) -> impl Responder {
|
||||
let text = req.text.trim();
|
||||
let text = clean_for_tts(&req.text);
|
||||
if text.is_empty() {
|
||||
return HttpResponse::BadRequest().json(json!({ "error": "text is required" }));
|
||||
}
|
||||
@@ -121,7 +181,15 @@ pub async fn tts_speech_handler(
|
||||
.filter(|s| !s.is_empty())
|
||||
.or(dv.as_deref());
|
||||
|
||||
match client.text_to_speech(text, voice, format).await {
|
||||
// Clamp generation knobs to Chatterbox's documented ranges before forwarding.
|
||||
let exaggeration = req.exaggeration.map(|x| x.clamp(0.25, 2.0));
|
||||
let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0));
|
||||
let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0));
|
||||
|
||||
match client
|
||||
.text_to_speech(&text, voice, format, exaggeration, cfg_weight, temperature)
|
||||
.await
|
||||
{
|
||||
Ok(bytes) => {
|
||||
let audio_base64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
|
||||
HttpResponse::Ok().json(TtsSpeechResponse {
|
||||
@@ -390,4 +458,36 @@ mod tests {
|
||||
"application/octet-stream"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
fn clean_for_tts_strips_markdown() {
    // (markdown input, expected spoken text)
    let cases = [
        // Emphasis and inline-code markers disappear, words stay.
        ("**Bold** and _italic_ and `code`", "Bold and italic and code"),
        // Heading marker is removed; the paragraph break is kept.
        ("# Title\n\nbody", "Title\n\nbody"),
        // Links collapse to their visible text.
        ("See [docs](http://x.com) now", "See docs now"),
        // Bullet markers are removed line by line.
        ("- one\n- two", "one\ntwo"),
    ];

    for &(markdown, spoken) in cases.iter() {
        assert_eq!(clean_for_tts(markdown), spoken);
    }
}
|
||||
|
||||
#[test]
fn clean_for_tts_strips_emoji_and_urls() {
    // Stand-alone emoji vanish and the gaps they leave are collapsed.
    assert_eq!(clean_for_tts("Hello 😀 world 🎉"), "Hello world");

    // A bare URL is replaced by whitespace, which is then collapsed.
    assert_eq!(
        clean_for_tts("visit https://example.com today"),
        "visit today"
    );

    // ZWJ-glued emoji sequence is fully removed. The joiners are written as
    // explicit escapes so the (invisible) characters cannot be lost by an
    // editor or a copy/paste, which would silently weaken this test.
    assert_eq!(
        clean_for_tts("family \u{1F468}\u{200D}\u{1F469}\u{200D}\u{1F467} photo"),
        "family photo"
    );
}
|
||||
|
||||
#[test]
fn clean_for_tts_preserves_bracket_tags() {
    // Bracketed tags must round-trip unchanged: non-turbo Chatterbox ignores
    // them, and a future Turbo switch reads them as paralinguistic cues.
    let tagged = "hello [laugh] there";
    assert_eq!(clean_for_tts(tagged), tagged);
}
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user