// User-configurable pronunciation overrides for TTS. Chatterbox mispronounces // place names ("Worcester"), initialisms ("WSL"), and clipped abbreviations // ("blvd"), so we rewrite them to phonetic spellings before synthesis. // // The map lives in a JSON file on the server — a flat object of // `"written form": "spoken form"` pairs, e.g.: // // { // "Worcester": "Wuster", // "WSL": "W S L", // "blvd": "boulevard", // "Dr.": "Doctor" // } // // Path comes from `TTS_PRONUNCIATIONS_PATH` (default `tts_pronunciations.json` // in the working directory). A missing file simply disables the feature. The // file is re-read whenever its mtime changes, so edits apply to the next // synthesis without a restart; a malformed edit keeps the last good map and // logs the parse error instead of silently dropping all overrides. // // Matching rules: // - Whole words only — `cat` never rewrites `category`. (Boundaries are only // asserted next to word characters, so keys like `Dr.` still work.) // - Smartcase: an all-lowercase key matches case-insensitively; a key with // any uppercase matches exactly. That lets `worcester` catch every casing // while `US` (the country) leaves the pronoun `us` alone. // - Longer keys win over shorter ones (`New York Times` before `New York`). use regex::Regex; use std::collections::HashMap; use std::path::Path; use std::sync::{Arc, LazyLock, Mutex as StdMutex}; use std::time::SystemTime; /// A compiled pronunciation map: one alternation regex over every key plus /// the lookup tables the replacement closure resolves matches against. #[derive(Default)] struct CompiledMap { /// `None` when the map is empty — apply() is then a no-op. regex: Option, /// Case-sensitive entries, keyed verbatim. exact: HashMap, /// Case-insensitive entries, keyed lowercased. folded: HashMap, } impl CompiledMap { fn from_entries(entries: &HashMap) -> Self { let mut keys: Vec<&str> = entries .keys() .map(|k| k.as_str()) .filter(|k| !k.trim().is_empty()) .collect(); if keys.is_empty() { return Self::default(); } // Longest key first so overlapping entries prefer the more specific // one (regex alternation is first-match-wins, not longest-match). keys.sort_by(|a, b| b.len().cmp(&a.len()).then(a.cmp(b))); let mut exact = HashMap::new(); let mut folded = HashMap::new(); let alternatives: Vec = keys .iter() .map(|key| { let escaped = regex::escape(key); // Only assert a word boundary where the key edge is a word // character — `\b` adjacent to punctuation (e.g. the dot in // `Dr.`) would otherwise never match. let lead = if key .chars() .next() .is_some_and(|c| c.is_alphanumeric() || c == '_') { r"\b" } else { "" }; let trail = if key .chars() .last() .is_some_and(|c| c.is_alphanumeric() || c == '_') { r"\b" } else { "" }; let case_sensitive = key.chars().any(|c| c.is_uppercase()); if case_sensitive { exact.insert(key.to_string(), entries[*key].clone()); format!("{lead}{escaped}{trail}") } else { folded.insert(key.to_lowercase(), entries[*key].clone()); format!("{lead}(?i:{escaped}){trail}") } }) .collect(); // Escaped fixed strings can't produce an invalid pattern; if one ever // does, treat the whole map as empty rather than panicking a handler. let pattern = alternatives.join("|"); let regex = match Regex::new(&pattern) { Ok(r) => Some(r), Err(e) => { log::error!("pronunciation map failed to compile: {e}"); None } }; Self { regex, exact, folded, } } fn apply(&self, text: &str) -> String { let Some(re) = &self.regex else { return text.to_string(); }; re.replace_all(text, |caps: ®ex::Captures| { let m = &caps[0]; self.exact .get(m) .or_else(|| self.folded.get(&m.to_lowercase())) .cloned() // Unreachable in practice — every alternative came from one // of the two maps — but never drop the user's text. .unwrap_or_else(|| m.to_string()) }) .into_owned() } } struct CacheEntry { mtime: Option, compiled: Arc, } static CACHE: LazyLock>> = LazyLock::new(|| StdMutex::new(None)); fn config_path() -> String { std::env::var("TTS_PRONUNCIATIONS_PATH") .ok() .map(|s| s.trim().to_string()) .filter(|s| !s.is_empty()) .unwrap_or_else(|| "tts_pronunciations.json".to_string()) } /// Load the compiled map, re-reading the file only when its mtime changed /// since the last call (or it appeared/disappeared). Synthesis is serialized /// on a single GPU permit, so a stat per call is noise. fn current_map() -> Arc { let path_s = config_path(); let path = Path::new(&path_s); let mtime = std::fs::metadata(path).and_then(|m| m.modified()).ok(); let mut cache = CACHE.lock().unwrap(); if let Some(entry) = cache.as_ref() && entry.mtime == mtime { return entry.compiled.clone(); } let compiled = match mtime { None => Arc::new(CompiledMap::default()), // no file → no overrides Some(_) => match std::fs::read_to_string(path) .map_err(anyhow::Error::from) .and_then(|s| Ok(serde_json::from_str::>(&s)?)) { Ok(entries) => { log::info!( "loaded {} pronunciation override(s) from {path_s}", entries.len() ); Arc::new(CompiledMap::from_entries(&entries)) } Err(e) => { log::error!("failed to load pronunciation map {path_s}: {e}"); // Keep serving the previous map rather than regressing to // none mid-edit; still record the new mtime so the error // logs once per bad save, not once per synthesis. cache .as_ref() .map(|c| c.compiled.clone()) .unwrap_or_default() } }, }; *cache = Some(CacheEntry { mtime, compiled: compiled.clone(), }); compiled } /// Rewrite configured words/abbreviations to their phonetic spellings. /// Call on cleaned (post-markdown-strip) text, right before synthesis. pub fn apply_pronunciations(text: &str) -> String { current_map().apply(text) } #[cfg(test)] mod tests { use super::*; fn compile(pairs: &[(&str, &str)]) -> CompiledMap { let entries = pairs .iter() .map(|(k, v)| (k.to_string(), v.to_string())) .collect(); CompiledMap::from_entries(&entries) } #[test] fn empty_map_is_a_noop() { let m = compile(&[]); assert_eq!(m.apply("nothing changes"), "nothing changes"); } #[test] fn replaces_whole_words_only() { let m = compile(&[("cat", "kitty")]); assert_eq!(m.apply("the cat sat"), "the kitty sat"); // No substring rewrites. assert_eq!(m.apply("the category"), "the category"); assert_eq!(m.apply("concatenate"), "concatenate"); } #[test] fn lowercase_keys_match_any_casing() { let m = compile(&[("worcester", "Wuster")]); assert_eq!(m.apply("Worcester is nice"), "Wuster is nice"); assert_eq!(m.apply("in WORCESTER today"), "in Wuster today"); assert_eq!(m.apply("worcester sauce"), "Wuster sauce"); } #[test] fn uppercase_keys_match_case_sensitively() { let m = compile(&[("US", "U S")]); assert_eq!(m.apply("the US economy"), "the U S economy"); // The pronoun survives. assert_eq!(m.apply("join us today"), "join us today"); } #[test] fn keys_with_punctuation_work() { // `\b` is only asserted next to word characters, so the trailing dot // doesn't break matching. let m = compile(&[("Dr.", "Doctor"), ("blvd", "boulevard")]); assert_eq!( m.apply("Dr. Smith on Sunset blvd"), "Doctor Smith on Sunset boulevard" ); } #[test] fn longer_keys_win_over_shorter() { let m = compile(&[("new york", "Noo York"), ("new york times", "the Times")]); assert_eq!(m.apply("read the new york times"), "read the the Times"); assert_eq!(m.apply("visit new york soon"), "visit Noo York soon"); } #[test] fn multiple_occurrences_all_rewrite() { let m = compile(&[("wsl", "W S L")]); assert_eq!(m.apply("WSL and wsl and Wsl"), "W S L and W S L and W S L"); } #[test] fn replacement_text_is_verbatim() { // Replacements aren't re-scanned — a value containing another key // doesn't cascade. let m = compile(&[("a1", "b2"), ("b2", "c3")]); assert_eq!(m.apply("a1"), "b2"); } #[test] fn blank_keys_are_ignored() { let m = compile(&[("", "x"), (" ", "y"), ("ok", "fine")]); assert_eq!(m.apply("ok then"), "fine then"); } }