From 2e0f78aa1b1a5782d4444675eceee0f7bb82683e Mon Sep 17 00:00:00 2001 From: Cameron Cordes Date: Thu, 11 Jun 2026 23:06:18 -0400 Subject: [PATCH] Add user-configurable TTS pronunciation overrides MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A JSON map (TTS_PRONUNCIATIONS_PATH, default tts_pronunciations.json) rewrites mispronounced words — place names, initialisms, dotted abbreviations — to phonetic spellings before synthesis, applied after markdown cleanup in both /tts/speech paths. Whole-word smartcase matching (lowercase keys match any casing, uppercase keys exact), longest key wins, hot-reloaded on mtime change with last-good fallback on parse errors. See tts_pronunciations.example.json. Co-Authored-By: Claude Fable 5 --- .gitignore | 2 + CLAUDE.md | 2 + README.md | 10 +- src/ai/mod.rs | 1 + src/ai/pronunciation.rs | 282 ++++++++++++++++++++++++++++++++ src/ai/tts.rs | 12 +- tts_pronunciations.example.json | 13 ++ 7 files changed, 319 insertions(+), 3 deletions(-) create mode 100644 src/ai/pronunciation.rs create mode 100644 tts_pronunciations.example.json diff --git a/.gitignore b/.gitignore index 2bd4d6e..5dceed2 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,8 @@ database/target *.db-shm *.db-wal .env +# Server-local TTS pronunciation overrides (tts_pronunciations.example.json is the template) +/tts_pronunciations.json /tmp /docs /specs diff --git a/CLAUDE.md b/CLAUDE.md index 816391b..4faec1c 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -676,6 +676,8 @@ LLAMA_SWAP_TTS_REF_SECONDS=30 # Max voice-clone reference clip # (Chatterbox is zero-shot; ~10-20s clean ref is ideal) LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS=600 # Per-request synth timeout (long chunked insights take # minutes); overrides the shared client timeout for /tts/speech +TTS_PRONUNCIATIONS_PATH=tts_pronunciations.json # JSON map of pronunciation overrides applied before synth + # (see tts_pronunciations.example.json); hot-reloaded on change # Insight 
Chat Continuation AGENTIC_CHAT_MAX_ITERATIONS=6 # Cap on tool-calling iterations per chat turn (default 6) diff --git a/README.md b/README.md index f355e32..8a6421b 100644 --- a/README.md +++ b/README.md @@ -153,7 +153,8 @@ behind the same llama-swap proxy. Only requires `LLAMA_SWAP_URL` (the TTS client is built whenever that's set — independent of `LLM_BACKEND`). Endpoints: - `POST /tts/speech` — body `{ text, voice?, format?, exaggeration?, cfg_weight?, temperature? }`; returns `{ audio_base64, format }`. Input is cleaned - server-side (markdown + emoji stripped) and the generation knobs are clamped + server-side (markdown + emoji stripped, then pronunciation overrides applied — + see below) and the generation knobs are clamped to Chatterbox's ranges. Synthesis is serialized (one at a time — the upstream has no GPU lock of its own); a concurrent request gets a fast `429`. - `POST /tts/speech/jobs` — durable variant for long syntheses: same body as @@ -177,7 +178,14 @@ is built whenever that's set — independent of `LLM_BACKEND`). Endpoints: Created voice names are tagged with the ref-clip cap in effect (e.g. `grandma-30s`) so the library shows which reference length produced each clone. +Words the model mispronounces (place names, initialisms) can be rewritten +before synthesis via a JSON map — copy `tts_pronunciations.example.json` to +`tts_pronunciations.json` and edit; changes apply without a restart. Full +matching rules are documented in `src/ai/pronunciation.rs`. 
+ Env: +- `TTS_PRONUNCIATIONS_PATH` - pronunciation-override JSON file + [default: `tts_pronunciations.json` in the working directory] - `LLAMA_SWAP_TTS_MODEL` - TTS model id in llama-swap's `config.yaml` [default: `chatterbox`] - `LLAMA_SWAP_TTS_VOICE` - default voice used when a `/tts/speech` request omits `voice` (optional) - `LLAMA_SWAP_TTS_REF_SECONDS` - max voice-clone reference clip length in seconds diff --git a/src/ai/mod.rs b/src/ai/mod.rs index d358f6c..c5302fb 100644 --- a/src/ai/mod.rs +++ b/src/ai/mod.rs @@ -12,6 +12,7 @@ pub mod llm_client; pub mod local_llm; pub mod ollama; pub mod openrouter; +pub mod pronunciation; pub mod sms_client; pub mod tts; pub mod turn_registry; diff --git a/src/ai/pronunciation.rs b/src/ai/pronunciation.rs new file mode 100644 index 0000000..b9d7f6e --- /dev/null +++ b/src/ai/pronunciation.rs @@ -0,0 +1,282 @@ +// User-configurable pronunciation overrides for TTS. Chatterbox mispronounces +// place names ("Worcester"), initialisms ("WSL"), and clipped abbreviations +// ("blvd"), so we rewrite them to phonetic spellings before synthesis. +// +// The map lives in a JSON file on the server — a flat object of +// `"written form": "spoken form"` pairs, e.g.: +// +// { +// "Worcester": "Wuster", +// "WSL": "W S L", +// "blvd": "boulevard", +// "Dr.": "Doctor" +// } +// +// Path comes from `TTS_PRONUNCIATIONS_PATH` (default `tts_pronunciations.json` +// in the working directory). A missing file simply disables the feature. The +// file is re-read whenever its mtime changes, so edits apply to the next +// synthesis without a restart; a malformed edit keeps the last good map and +// logs the parse error instead of silently dropping all overrides. +// +// Matching rules: +// - Whole words only — `cat` never rewrites `category`. (Boundaries are only +// asserted next to word characters, so keys like `Dr.` still work.) +// - Smartcase: an all-lowercase key matches case-insensitively; a key with +// any uppercase matches exactly. 
+//     That lets `worcester` catch every casing
+//     while `US` (the country) leaves the pronoun `us` alone.
+//   - Longer keys win over shorter ones (`New York Times` before `New York`).
+
+use regex::Regex;
+use std::collections::HashMap;
+use std::path::Path;
+use std::sync::{Arc, LazyLock, Mutex as StdMutex};
+use std::time::SystemTime;
+
+/// A compiled pronunciation map: one alternation regex over every key plus
+/// the lookup tables the replacement closure resolves matches against.
+#[derive(Default)]
+struct CompiledMap {
+    /// `None` when the map is empty — apply() is then a no-op.
+    regex: Option<Regex>,
+    /// Case-sensitive entries, keyed verbatim.
+    exact: HashMap<String, String>,
+    /// Case-insensitive entries, keyed lowercased.
+    folded: HashMap<String, String>,
+}
+
+impl CompiledMap {
+    /// Compile `entries` (written form → spoken form) into one alternation
+    /// regex plus the two lookup tables. Keys that are empty or
+    /// whitespace-only are skipped; if nothing is left the default (no-op)
+    /// map is returned.
+    fn from_entries(entries: &HashMap<String, String>) -> Self {
+        let mut keys: Vec<&str> = entries
+            .keys()
+            .map(|k| k.as_str())
+            .filter(|k| !k.trim().is_empty())
+            .collect();
+        if keys.is_empty() {
+            return Self::default();
+        }
+        // Longest key first so overlapping entries prefer the more specific
+        // one (regex alternation is first-match-wins, not longest-match).
+        keys.sort_by(|a, b| b.len().cmp(&a.len()).then(a.cmp(b)));
+
+        let mut exact = HashMap::new();
+        let mut folded = HashMap::new();
+        let alternatives: Vec<String> = keys
+            .iter()
+            .map(|key| {
+                let escaped = regex::escape(key);
+                // Only assert a word boundary where the key edge is a word
+                // character — `\b` adjacent to punctuation (e.g. the dot in
+                // `Dr.`) would otherwise never match.
+                let lead = if key
+                    .chars()
+                    .next()
+                    .is_some_and(|c| c.is_alphanumeric() || c == '_')
+                {
+                    r"\b"
+                } else {
+                    ""
+                };
+                let trail = if key
+                    .chars()
+                    .last()
+                    .is_some_and(|c| c.is_alphanumeric() || c == '_')
+                {
+                    r"\b"
+                } else {
+                    ""
+                };
+                // Smartcase: any uppercase character makes the key exact-match.
+                let case_sensitive = key.chars().any(|c| c.is_uppercase());
+                if case_sensitive {
+                    exact.insert(key.to_string(), entries[*key].clone());
+                    format!("{lead}{escaped}{trail}")
+                } else {
+                    folded.insert(key.to_lowercase(), entries[*key].clone());
+                    format!("{lead}(?i:{escaped}){trail}")
+                }
+            })
+            .collect();
+
+        // Escaped fixed strings can't produce an invalid pattern; if one ever
+        // does, treat the whole map as empty rather than panicking a handler.
+        let pattern = alternatives.join("|");
+        let regex = match Regex::new(&pattern) {
+            Ok(r) => Some(r),
+            Err(e) => {
+                log::error!("pronunciation map failed to compile: {e}");
+                None
+            }
+        };
+        Self {
+            regex,
+            exact,
+            folded,
+        }
+    }
+
+    /// Return `text` with every regex match replaced by its configured
+    /// spoken form. The verbatim (case-sensitive) table is consulted first,
+    /// then the lowercased one. With no regex the text is returned unchanged.
+    fn apply(&self, text: &str) -> String {
+        let Some(re) = &self.regex else {
+            return text.to_string();
+        };
+        re.replace_all(text, |caps: &regex::Captures| {
+            let m = &caps[0];
+            self.exact
+                .get(m)
+                .or_else(|| self.folded.get(&m.to_lowercase()))
+                .cloned()
+                // Unreachable in practice — every alternative came from one
+                // of the two maps — but never drop the user's text.
+                .unwrap_or_else(|| m.to_string())
+        })
+        .into_owned()
+    }
+}
+
+/// The most recently loaded map together with the file mtime it was loaded
+/// at (`None` when the file's metadata could not be read).
+struct CacheEntry {
+    mtime: Option<SystemTime>,
+    compiled: Arc<CompiledMap>,
+}
+
+/// Process-wide cache of the last load; `None` until the first lookup.
+static CACHE: LazyLock<StdMutex<Option<CacheEntry>>> = LazyLock::new(|| StdMutex::new(None));
+
+/// Path of the overrides file: the trimmed `TTS_PRONUNCIATIONS_PATH` when it
+/// is set and non-empty, otherwise `tts_pronunciations.json`.
+fn config_path() -> String {
+    std::env::var("TTS_PRONUNCIATIONS_PATH")
+        .ok()
+        .map(|s| s.trim().to_string())
+        .filter(|s| !s.is_empty())
+        .unwrap_or_else(|| "tts_pronunciations.json".to_string())
+}
+
+/// Load the compiled map, re-reading the file only when its mtime changed
+/// since the last call (or it appeared/disappeared). Synthesis is serialized
+/// on a single GPU permit, so a stat per call is noise.
+fn current_map() -> Arc<CompiledMap> {
+    let path_s = config_path();
+    let path = Path::new(&path_s);
+    // `None` covers both a missing file and any metadata/mtime read failure.
+    let mtime = std::fs::metadata(path).and_then(|m| m.modified()).ok();
+
+    let mut cache = CACHE.lock().unwrap();
+    if let Some(entry) = cache.as_ref()
+        && entry.mtime == mtime
+    {
+        return entry.compiled.clone();
+    }
+
+    let compiled = match mtime {
+        None => Arc::new(CompiledMap::default()), // no file → no overrides
+        Some(_) => match std::fs::read_to_string(path)
+            .map_err(anyhow::Error::from)
+            .and_then(|s| Ok(serde_json::from_str::<HashMap<String, String>>(&s)?))
+        {
+            Ok(entries) => {
+                log::info!(
+                    "loaded {} pronunciation override(s) from {path_s}",
+                    entries.len()
+                );
+                Arc::new(CompiledMap::from_entries(&entries))
+            }
+            Err(e) => {
+                log::error!("failed to load pronunciation map {path_s}: {e}");
+                // Keep serving the previous map rather than regressing to
+                // none mid-edit; still record the new mtime so the error
+                // logs once per bad save, not once per synthesis.
+                cache
+                    .as_ref()
+                    .map(|c| c.compiled.clone())
+                    .unwrap_or_default()
+            }
+        },
+    };
+    *cache = Some(CacheEntry {
+        mtime,
+        compiled: compiled.clone(),
+    });
+    compiled
+}
+
+/// Rewrite configured words/abbreviations to their phonetic spellings.
+/// Call on cleaned (post-markdown-strip) text, right before synthesis.
+pub fn apply_pronunciations(text: &str) -> String {
+    current_map().apply(text)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn compile(pairs: &[(&str, &str)]) -> CompiledMap {
+        let entries = pairs
+            .iter()
+            .map(|(k, v)| (k.to_string(), v.to_string()))
+            .collect();
+        CompiledMap::from_entries(&entries)
+    }
+
+    #[test]
+    fn empty_map_is_a_noop() {
+        let m = compile(&[]);
+        assert_eq!(m.apply("nothing changes"), "nothing changes");
+    }
+
+    #[test]
+    fn replaces_whole_words_only() {
+        let m = compile(&[("cat", "kitty")]);
+        assert_eq!(m.apply("the cat sat"), "the kitty sat");
+        // No substring rewrites.
+        assert_eq!(m.apply("the category"), "the category");
+        assert_eq!(m.apply("concatenate"), "concatenate");
+    }
+
+    #[test]
+    fn lowercase_keys_match_any_casing() {
+        let m = compile(&[("worcester", "Wuster")]);
+        assert_eq!(m.apply("Worcester is nice"), "Wuster is nice");
+        assert_eq!(m.apply("in WORCESTER today"), "in Wuster today");
+        assert_eq!(m.apply("worcester sauce"), "Wuster sauce");
+    }
+
+    #[test]
+    fn uppercase_keys_match_case_sensitively() {
+        let m = compile(&[("US", "U S")]);
+        assert_eq!(m.apply("the US economy"), "the U S economy");
+        // The pronoun survives.
+        assert_eq!(m.apply("join us today"), "join us today");
+    }
+
+    #[test]
+    fn keys_with_punctuation_work() {
+        // `\b` is only asserted next to word characters, so the trailing dot
+        // doesn't break matching.
+        let m = compile(&[("Dr.", "Doctor"), ("blvd", "boulevard")]);
+        assert_eq!(
+            m.apply("Dr. Smith on Sunset blvd"),
+            "Doctor Smith on Sunset boulevard"
+        );
+    }
+
+    #[test]
+    fn longer_keys_win_over_shorter() {
+        let m = compile(&[("new york", "Noo York"), ("new york times", "the Times")]);
+        assert_eq!(m.apply("read the new york times"), "read the the Times");
+        assert_eq!(m.apply("visit new york soon"), "visit Noo York soon");
+    }
+
+    #[test]
+    fn multiple_occurrences_all_rewrite() {
+        let m = compile(&[("wsl", "W S L")]);
+        assert_eq!(m.apply("WSL and wsl and Wsl"), "W S L and W S L and W S L");
+    }
+
+    #[test]
+    fn replacement_text_is_verbatim() {
+        // Replacements aren't re-scanned — a value containing another key
+        // doesn't cascade.
+        let m = compile(&[("a1", "b2"), ("b2", "c3")]);
+        assert_eq!(m.apply("a1"), "b2");
+    }
+
+    #[test]
+    fn blank_keys_are_ignored() {
+        let m = compile(&[("", "x"), (" ", "y"), ("ok", "fine")]);
+        assert_eq!(m.apply("ok then"), "fine then");
+    }
+}
diff --git a/src/ai/tts.rs b/src/ai/tts.rs
index f76810d..a0b9bd2 100644
--- a/src/ai/tts.rs
+++ b/src/ai/tts.rs
@@ -254,6 +254,14 @@ fn clean_for_tts(input: &str) -> String {
     s.trim().to_string()
 }
 
+/// Full text-preparation pipeline for synthesis: markdown/emoji cleanup, then
+/// the user's pronunciation overrides (see [`crate::ai::pronunciation`]) on
+/// the resulting plain text — after cleanup so word boundaries aren't
+/// obscured by `**WSL**`-style markup.
+fn prepare_for_tts(input: &str) -> String {
+    crate::ai::pronunciation::apply_pronunciations(&clean_for_tts(input))
+}
+
 /// Decode an audio/video file to mono 24 kHz WAV via ffmpeg, returning the WAV
 /// bytes. Chatterbox validates the reference clip by file *extension* and
 /// rejects several formats (e.g.
`.aac`, `.opus`), so we always normalize to @@ -337,7 +345,7 @@ pub async fn tts_speech_handler( let parent_context = extract_context_from_request(&http_request); let mut span = global_tracer().start_with_context("http.tts.speech", &parent_context); - let text = clean_for_tts(&req.text); + let text = prepare_for_tts(&req.text); if text.is_empty() { span.set_status(Status::error("text is required")); return HttpResponse::BadRequest().json(json!({ "error": "text is required" })); @@ -435,7 +443,7 @@ pub async fn create_speech_job_handler( let mut span = global_tracer().start_with_context("http.tts.speech_job.create", &parent_context); - let text = clean_for_tts(&req.text); + let text = prepare_for_tts(&req.text); if text.is_empty() { span.set_status(Status::error("text is required")); return HttpResponse::BadRequest().json(json!({ "error": "text is required" })); diff --git a/tts_pronunciations.example.json b/tts_pronunciations.example.json new file mode 100644 index 0000000..9bc9df9 --- /dev/null +++ b/tts_pronunciations.example.json @@ -0,0 +1,13 @@ +{ + "Worcester": "Wuster", + "Spokane": "Spo can", + "wsl": "W S L", + "sql": "sequel", + "api": "A P I", + "US": "U S", + "Dr.": "Doctor", + "St.": "Saint", + "blvd": "boulevard", + "vs.": "versus", + "etc.": "et cetera" +}