Add user-configurable TTS pronunciation overrides

A JSON map (TTS_PRONUNCIATIONS_PATH, default tts_pronunciations.json) rewrites mispronounced words — place names, initialisms, dotted abbreviations — to phonetic spellings before synthesis, applied after markdown cleanup in both /tts/speech paths. Whole-word smartcase matching (lowercase keys match any casing, uppercase keys exact), longest key wins, hot-reloaded on mtime change with last-good fallback on parse errors. See tts_pronunciations.example.json. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-11 23:06:18 -04:00
parent 3fa4fa8501
commit 2e0f78aa1b
7 changed files with 319 additions and 3 deletions
@@ -5,6 +5,8 @@ database/target
 *.db-shm
 *.db-wal
 .env
 # Server-local TTS pronunciation overrides (tts_pronunciations.example.json is the template)
 /tts_pronunciations.json
 /tmp
 /docs
 /specs
@@ -676,6 +676,8 @@ LLAMA_SWAP_TTS_REF_SECONDS=30                  # Max voice-clone reference clip
                                               # (Chatterbox is zero-shot; ~10-20s clean ref is ideal)
 LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS=600     # Per-request synth timeout (long chunked insights take
                                               # minutes); overrides the shared client timeout for /tts/speech
 TTS_PRONUNCIATIONS_PATH=tts_pronunciations.json # JSON map of pronunciation overrides applied before synth
                                               # (see tts_pronunciations.example.json); hot-reloaded on change
 # Insight Chat Continuation
 AGENTIC_CHAT_MAX_ITERATIONS=6                  # Cap on tool-calling iterations per chat turn (default 6)
@@ -153,7 +153,8 @@ behind the same llama-swap proxy. Only requires `LLAMA_SWAP_URL` (the TTS client
 is built whenever that's set — independent of `LLM_BACKEND`). Endpoints:
 - `POST /tts/speech` — body `{ text, voice?, format?, exaggeration?, cfg_weight?,
  temperature? }`; returns `{ audio_base64, format }`. Input is cleaned
-  server-side (markdown + emoji stripped) and the generation knobs are clamped
+  server-side (markdown + emoji stripped, then pronunciation overrides applied —
  see below) and the generation knobs are clamped
  to Chatterbox's ranges. Synthesis is serialized (one at a time — the upstream
  has no GPU lock of its own); a concurrent request gets a fast `429`.
 - `POST /tts/speech/jobs` — durable variant for long syntheses: same body as
@@ -177,7 +178,14 @@ is built whenever that's set — independent of `LLM_BACKEND`). Endpoints:
 Created voice names are tagged with the ref-clip cap in effect (e.g.
 `grandma-30s`) so the library shows which reference length produced each clone.
 Words the model mispronounces (place names, initialisms) can be rewritten
 before synthesis via a JSON map — copy `tts_pronunciations.example.json` to
 `tts_pronunciations.json` and edit; changes apply without a restart. Full
 matching rules are documented in `src/ai/pronunciation.rs`.
 Env:
 - `TTS_PRONUNCIATIONS_PATH` - pronunciation-override JSON file
  [default: `tts_pronunciations.json` in the working directory]
 - `LLAMA_SWAP_TTS_MODEL` - TTS model id in llama-swap's `config.yaml` [default: `chatterbox`]
 - `LLAMA_SWAP_TTS_VOICE` - default voice used when a `/tts/speech` request omits `voice` (optional)
 - `LLAMA_SWAP_TTS_REF_SECONDS` - max voice-clone reference clip length in seconds
@@ -12,6 +12,7 @@ pub mod llm_client;
 pub mod local_llm;
 pub mod ollama;
 pub mod openrouter;
 pub mod pronunciation;
 pub mod sms_client;
 pub mod tts;
 pub mod turn_registry;
@@ -0,0 +1,282 @@
 // User-configurable pronunciation overrides for TTS. Chatterbox mispronounces
 // place names ("Worcester"), initialisms ("WSL"), and clipped abbreviations
 // ("blvd"), so we rewrite them to phonetic spellings before synthesis.
 //
 // The map lives in a JSON file on the server — a flat object of
 // `"written form": "spoken form"` pairs, e.g.:
 //
 //   {
 //     "Worcester": "Wuster",
 //     "WSL": "W S L",
 //     "blvd": "boulevard",
 //     "Dr.": "Doctor"
 //   }
 //
 // Path comes from `TTS_PRONUNCIATIONS_PATH` (default `tts_pronunciations.json`
 // in the working directory). A missing file simply disables the feature. The
 // file is re-read whenever its mtime changes, so edits apply to the next
 // synthesis without a restart; a malformed edit keeps the last good map and
 // logs the parse error instead of silently dropping all overrides.
 //
 // Matching rules:
 // - Whole words only — `cat` never rewrites `category`. (Boundaries are only
 //   asserted next to word characters, so keys like `Dr.` still work.)
 // - Smartcase: an all-lowercase key matches case-insensitively; a key with
 //   any uppercase matches exactly. That lets `worcester` catch every casing
 //   while `US` (the country) leaves the pronoun `us` alone.
 // - Longer keys win over shorter ones (`New York Times` before `New York`).
 use regex::Regex;
 use std::collections::HashMap;
 use std::path::Path;
 use std::sync::{Arc, LazyLock, Mutex as StdMutex};
 use std::time::SystemTime;
 /// A compiled pronunciation map: one alternation regex over every key plus
 /// the lookup tables the replacement closure resolves matches against.
 ///
 /// The derived `Default` is the empty map (no regex, no entries), which
 /// `apply()` treats as "leave the text untouched".
 #[derive(Default)]
 struct CompiledMap {
    /// `None` when the map is empty — apply() is then a no-op. Also `None`
    /// when the combined pattern failed to compile (the failure is logged).
    regex: Option<Regex>,
    /// Case-sensitive entries, keyed verbatim. Looked up first in apply().
    exact: HashMap<String, String>,
    /// Case-insensitive entries, keyed lowercased. apply() lowercases the
    /// matched text before looking it up here.
    folded: HashMap<String, String>,
 }
 impl CompiledMap {
    /// Compile raw `written form → spoken form` entries into a matcher.
    ///
    /// Keys that are empty or whitespace-only are skipped. A key containing
    /// any uppercase character is matched exactly; every other key is matched
    /// case-insensitively. Returns the empty (no-op) map when no usable key
    /// remains, and a map whose `regex` is `None` if the combined pattern
    /// fails to compile (the error is logged rather than propagated).
    fn from_entries(entries: &HashMap<String, String>) -> Self {
        let mut keys: Vec<&str> = entries
            .keys()
            .map(|k| k.as_str())
            .filter(|k| !k.trim().is_empty())
            .collect();
        if keys.is_empty() {
            return Self::default();
        }
        // Longest key first so overlapping entries prefer the more specific
        // one (regex alternation is first-match-wins, not longest-match).
        // Ties break alphabetically to keep the pattern deterministic.
        keys.sort_by(|a, b| b.len().cmp(&a.len()).then(a.cmp(b)));
        // `\b` is defined in terms of the regex engine's Unicode `\w` class,
        // which is NOT the same set as `char::is_alphanumeric`: `²` or `½`
        // are "alphanumeric" to std but not word characters to the engine, so
        // a `\b` placed after a key like `m²` could never match before a
        // space. Ask the engine itself which edge characters are word
        // characters so the emitted boundaries always agree with it.
        let word_char = Regex::new(r"^\w$").ok();
        let is_word_char = |c: char| match &word_char {
            Some(re) => {
                let mut buf = [0u8; 4];
                re.is_match(c.encode_utf8(&mut buf))
            }
            // Classifier failed to compile (not expected for a fixed
            // pattern): fall back to the std approximation, never panic.
            None => c.is_alphanumeric() || c == '_',
        };
        // Only assert a word boundary where the key edge is a word
        // character — `\b` adjacent to punctuation (e.g. the dot in
        // `Dr.`) would otherwise never match.
        let boundary = |edge: Option<char>| {
            if edge.is_some_and(|c| is_word_char(c)) {
                r"\b"
            } else {
                ""
            }
        };
        let mut exact = HashMap::new();
        let mut folded = HashMap::new();
        let alternatives: Vec<String> = keys
            .iter()
            .map(|key| {
                let escaped = regex::escape(key);
                let lead = boundary(key.chars().next());
                let trail = boundary(key.chars().next_back());
                // Smartcase: any uppercase character pins the key to an
                // exact match; otherwise the key matches every casing.
                let case_sensitive = key.chars().any(|c| c.is_uppercase());
                if case_sensitive {
                    exact.insert(key.to_string(), entries[*key].clone());
                    format!("{lead}{escaped}{trail}")
                } else {
                    folded.insert(key.to_lowercase(), entries[*key].clone());
                    format!("{lead}(?i:{escaped}){trail}")
                }
            })
            .collect();
        // Escaped fixed strings can't produce an invalid pattern; if one ever
        // does, treat the whole map as empty rather than panicking a handler.
        let pattern = alternatives.join("|");
        let regex = match Regex::new(&pattern) {
            Ok(r) => Some(r),
            Err(e) => {
                log::error!("pronunciation map failed to compile: {e}");
                None
            }
        };
        Self {
            regex,
            exact,
            folded,
        }
    }
    /// Return `text` with every match of a configured key replaced by its
    /// spoken form. Text is returned unchanged when no regex is compiled.
    ///
    /// Each match is resolved against the exact table first, then against
    /// the case-folded table. Replacement text is inserted verbatim and is
    /// not scanned again, so overrides never cascade.
    fn apply(&self, text: &str) -> String {
        let Some(re) = &self.regex else {
            return text.to_string();
        };
        re.replace_all(text, |caps: &regex::Captures| {
            let m = &caps[0];
            self.exact
                .get(m)
                .or_else(|| self.folded.get(&m.to_lowercase()))
                .cloned()
                // Unreachable in practice — every alternative came from one
                // of the two maps — but never drop the user's text.
                .unwrap_or_else(|| m.to_string())
        })
        .into_owned()
    }
 }
 /// One cached load of the pronunciation file: the compiled map together
 /// with the file modification time that was observed when it was stored.
 struct CacheEntry {
    /// Modification time seen at load; `None` when the file's metadata could
    /// not be read (e.g. the file does not exist).
    mtime: Option<SystemTime>,
    /// The map handed out for as long as the observed mtime stays the same.
    compiled: Arc<CompiledMap>,
 }
 /// Process-wide slot holding the most recently loaded map. Starts as `None`
 /// and is filled by the first `current_map()` call.
 static CACHE: LazyLock<StdMutex<Option<CacheEntry>>> = LazyLock::new(|| StdMutex::new(None));
 /// Resolve the pronunciation file path: the trimmed value of
 /// `TTS_PRONUNCIATIONS_PATH`, or `tts_pronunciations.json` when the variable
 /// is unset, unreadable, or blank.
 fn config_path() -> String {
    match std::env::var("TTS_PRONUNCIATIONS_PATH") {
        Ok(raw) if !raw.trim().is_empty() => raw.trim().to_string(),
        _ => "tts_pronunciations.json".to_string(),
    }
 }
 /// Load the compiled map, re-reading the file only when its mtime changed
 /// since the last call (or it appeared/disappeared). Synthesis is serialized
 /// on a single GPU permit, so a stat per call is noise.
 ///
 /// Outcomes when a reload is needed:
 /// - file metadata unavailable (e.g. missing file) → empty map, no overrides;
 /// - file reads and parses → freshly compiled map;
 /// - read or parse error → the previously cached map (or the empty map if
 ///   nothing was cached yet), with the error logged.
 fn current_map() -> Arc<CompiledMap> {
    let path_s = config_path();
    let path = Path::new(&path_s);
    let mtime = std::fs::metadata(path).and_then(|m| m.modified()).ok();
    // Recover the guard from a poisoned lock instead of unwrapping: the slot
    // is only ever replaced by one whole-value assignment at the end of this
    // function, so it can never be observed half-updated, and a single panic
    // under the lock must not make every later synthesis request panic too.
    let mut cache = CACHE
        .lock()
        .unwrap_or_else(|poisoned| poisoned.into_inner());
    if let Some(entry) = cache.as_ref()
        && entry.mtime == mtime
    {
        return entry.compiled.clone();
    }
    let compiled = match mtime {
        None => Arc::new(CompiledMap::default()), // no file → no overrides
        Some(_) => match std::fs::read_to_string(path)
            .map_err(anyhow::Error::from)
            .and_then(|s| Ok(serde_json::from_str::<HashMap<String, String>>(&s)?))
        {
            Ok(entries) => {
                log::info!(
                    "loaded {} pronunciation override(s) from {path_s}",
                    entries.len()
                );
                Arc::new(CompiledMap::from_entries(&entries))
            }
            Err(e) => {
                log::error!("failed to load pronunciation map {path_s}: {e}");
                // Keep serving the previous map rather than regressing to
                // none mid-edit; still record the new mtime so the error
                // logs once per bad save, not once per synthesis.
                cache
                    .as_ref()
                    .map(|c| c.compiled.clone())
                    .unwrap_or_default()
            }
        },
    };
    *cache = Some(CacheEntry {
        mtime,
        compiled: compiled.clone(),
    });
    compiled
 }
 /// Replace every configured word or abbreviation in `text` with its phonetic
 /// spelling, using the currently loaded pronunciation map.
 ///
 /// Intended for text that has already been through markdown cleanup, as the
 /// last step before synthesis.
 pub fn apply_pronunciations(text: &str) -> String {
    let map = current_map();
    map.apply(text)
 }
 #[cfg(test)]
 mod tests {
    use super::*;
    /// Build a `CompiledMap` straight from `(written, spoken)` literals,
    /// without going through the file loader or the cache.
    fn compile(pairs: &[(&str, &str)]) -> CompiledMap {
        let mut entries: HashMap<String, String> = HashMap::with_capacity(pairs.len());
        for &(written, spoken) in pairs {
            entries.insert(written.to_owned(), spoken.to_owned());
        }
        CompiledMap::from_entries(&entries)
    }
    /// Assert that `map` rewrites each `(input, expected)` case as stated.
    fn check(map: &CompiledMap, cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(map.apply(input), expected, "input: {input:?}");
        }
    }
    #[test]
    fn empty_map_is_a_noop() {
        check(&compile(&[]), &[("nothing changes", "nothing changes")]);
    }
    #[test]
    fn replaces_whole_words_only() {
        let map = compile(&[("cat", "kitty")]);
        check(
            &map,
            &[
                ("the cat sat", "the kitty sat"),
                // Substrings of longer words stay untouched.
                ("the category", "the category"),
                ("concatenate", "concatenate"),
            ],
        );
    }
    #[test]
    fn lowercase_keys_match_any_casing() {
        let map = compile(&[("worcester", "Wuster")]);
        check(
            &map,
            &[
                ("Worcester is nice", "Wuster is nice"),
                ("in WORCESTER today", "in Wuster today"),
                ("worcester sauce", "Wuster sauce"),
            ],
        );
    }
    #[test]
    fn uppercase_keys_match_case_sensitively() {
        let map = compile(&[("US", "U S")]);
        check(
            &map,
            &[
                ("the US economy", "the U S economy"),
                // The lowercase pronoun is a different word and survives.
                ("join us today", "join us today"),
            ],
        );
    }
    #[test]
    fn keys_with_punctuation_work() {
        // A key ending in a dot gets no trailing `\b`, so `Dr.` still
        // matches even though a dot is not a word character.
        let map = compile(&[("Dr.", "Doctor"), ("blvd", "boulevard")]);
        check(
            &map,
            &[(
                "Dr. Smith on Sunset blvd",
                "Doctor Smith on Sunset boulevard",
            )],
        );
    }
    #[test]
    fn longer_keys_win_over_shorter() {
        let map = compile(&[("new york", "Noo York"), ("new york times", "the Times")]);
        check(
            &map,
            &[
                ("read the new york times", "read the the Times"),
                ("visit new york soon", "visit Noo York soon"),
            ],
        );
    }
    #[test]
    fn multiple_occurrences_all_rewrite() {
        let map = compile(&[("wsl", "W S L")]);
        check(&map, &[("WSL and wsl and Wsl", "W S L and W S L and W S L")]);
    }
    #[test]
    fn replacement_text_is_verbatim() {
        // Output is never scanned again: `a1` becomes `b2` and stops there,
        // even though `b2` is itself a key.
        let map = compile(&[("a1", "b2"), ("b2", "c3")]);
        check(&map, &[("a1", "b2")]);
    }
    #[test]
    fn blank_keys_are_ignored() {
        let map = compile(&[("", "x"), ("  ", "y"), ("ok", "fine")]);
        check(&map, &[("ok then", "fine then")]);
    }
 }
@@ -254,6 +254,14 @@ fn clean_for_tts(input: &str) -> String {
    s.trim().to_string()
 }
 /// Prepare request text for synthesis in two ordered steps: strip markdown
 /// and emoji, then apply the user's pronunciation overrides (see
 /// [`crate::ai::pronunciation`]). Overrides run second so that markup such
 /// as `**WSL**` has already been removed and cannot hide a word boundary.
 fn prepare_for_tts(input: &str) -> String {
    let cleaned = clean_for_tts(input);
    crate::ai::pronunciation::apply_pronunciations(&cleaned)
 }
 /// Decode an audio/video file to mono 24 kHz WAV via ffmpeg, returning the WAV
 /// bytes. Chatterbox validates the reference clip by file *extension* and
 /// rejects several formats (e.g. `.aac`, `.opus`), so we always normalize to
@@ -337,7 +345,7 @@ pub async fn tts_speech_handler(
    let parent_context = extract_context_from_request(&http_request);
    let mut span = global_tracer().start_with_context("http.tts.speech", &parent_context);
-    let text = clean_for_tts(&req.text);
+    let text = prepare_for_tts(&req.text);
    if text.is_empty() {
        span.set_status(Status::error("text is required"));
        return HttpResponse::BadRequest().json(json!({ "error": "text is required" }));
@@ -435,7 +443,7 @@ pub async fn create_speech_job_handler(
    let mut span =
        global_tracer().start_with_context("http.tts.speech_job.create", &parent_context);
-    let text = clean_for_tts(&req.text);
+    let text = prepare_for_tts(&req.text);
    if text.is_empty() {
        span.set_status(Status::error("text is required"));
        return HttpResponse::BadRequest().json(json!({ "error": "text is required" }));
@@ -0,0 +1,13 @@
 {
  "Worcester": "Wuster",
  "Spokane": "Spo can",
  "wsl": "W S L",
  "sql": "sequel",
  "api": "A P I",
  "US": "U S",
  "Dr.": "Doctor",
  "St.": "Saint",
  "blvd": "boulevard",
  "vs.": "versus",
  "etc.": "et cetera"
 }