From 2e0f78aa1b1a5782d4444675eceee0f7bb82683e Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Thu, 11 Jun 2026 23:06:18 -0400
Subject: [PATCH] Add user-configurable TTS pronunciation overrides
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A JSON map (TTS_PRONUNCIATIONS_PATH, default tts_pronunciations.json)
rewrites mispronounced words — place names, initialisms, dotted
abbreviations — to phonetic spellings before synthesis, applied after
markdown cleanup in both /tts/speech paths. Whole-word smartcase
matching (all-lowercase keys match any casing; keys with any uppercase
match exactly), longest key wins, hot-reloaded on mtime change with
last-good fallback on parse errors. See tts_pronunciations.example.json.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .gitignore                      |   2 +
 CLAUDE.md                       |   2 +
 README.md                       |  10 +-
 src/ai/mod.rs                   |   1 +
 src/ai/pronunciation.rs         | 282 ++++++++++++++++++++++++++++++++
 src/ai/tts.rs                   |  12 +-
 tts_pronunciations.example.json |  13 ++
 7 files changed, 319 insertions(+), 3 deletions(-)
 create mode 100644 src/ai/pronunciation.rs
 create mode 100644 tts_pronunciations.example.json

diff --git a/.gitignore b/.gitignore
index 2bd4d6e..5dceed2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,8 @@ database/target
 *.db-shm
 *.db-wal
 .env
+# Server-local TTS pronunciation overrides (tts_pronunciations.example.json is the template)
+/tts_pronunciations.json
 /tmp
 /docs
 /specs
diff --git a/CLAUDE.md b/CLAUDE.md
index 816391b..4faec1c 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -676,6 +676,8 @@ LLAMA_SWAP_TTS_REF_SECONDS=30                  # Max voice-clone reference clip
                                                # (Chatterbox is zero-shot; ~10-20s clean ref is ideal)
 LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS=600     # Per-request synth timeout (long chunked insights take
                                                # minutes); overrides the shared client timeout for /tts/speech
+TTS_PRONUNCIATIONS_PATH=tts_pronunciations.json # JSON map of pronunciation overrides applied before synth
+                                               # (see tts_pronunciations.example.json); hot-reloaded on change
 
 # Insight Chat Continuation
 AGENTIC_CHAT_MAX_ITERATIONS=6                  # Cap on tool-calling iterations per chat turn (default 6)
diff --git a/README.md b/README.md
index f355e32..8a6421b 100644
--- a/README.md
+++ b/README.md
@@ -153,7 +153,8 @@ behind the same llama-swap proxy. Only requires `LLAMA_SWAP_URL` (the TTS client
 is built whenever that's set — independent of `LLM_BACKEND`). Endpoints:
 - `POST /tts/speech` — body `{ text, voice?, format?, exaggeration?, cfg_weight?,
   temperature? }`; returns `{ audio_base64, format }`. Input is cleaned
-  server-side (markdown + emoji stripped) and the generation knobs are clamped
+  server-side (markdown + emoji stripped, then pronunciation overrides applied —
+  see below) and the generation knobs are clamped
   to Chatterbox's ranges. Synthesis is serialized (one at a time — the upstream
   has no GPU lock of its own); a concurrent request gets a fast `429`.
 - `POST /tts/speech/jobs` — durable variant for long syntheses: same body as
@@ -177,7 +178,14 @@ is built whenever that's set — independent of `LLM_BACKEND`). Endpoints:
 Created voice names are tagged with the ref-clip cap in effect (e.g.
 `grandma-30s`) so the library shows which reference length produced each clone.
 
+Words the model mispronounces (place names, initialisms) can be rewritten
+before synthesis via a JSON map — copy `tts_pronunciations.example.json` to
+`tts_pronunciations.json` and edit; changes apply without a restart. Full
+matching rules are documented in `src/ai/pronunciation.rs`.
+
 Env:
+- `TTS_PRONUNCIATIONS_PATH` - pronunciation-override JSON file
+  [default: `tts_pronunciations.json` in the working directory]
 - `LLAMA_SWAP_TTS_MODEL` - TTS model id in llama-swap's `config.yaml` [default: `chatterbox`]
 - `LLAMA_SWAP_TTS_VOICE` - default voice used when a `/tts/speech` request omits `voice` (optional)
 - `LLAMA_SWAP_TTS_REF_SECONDS` - max voice-clone reference clip length in seconds
diff --git a/src/ai/mod.rs b/src/ai/mod.rs
index d358f6c..c5302fb 100644
--- a/src/ai/mod.rs
+++ b/src/ai/mod.rs
@@ -12,6 +12,7 @@ pub mod llm_client;
 pub mod local_llm;
 pub mod ollama;
 pub mod openrouter;
+pub mod pronunciation;
 pub mod sms_client;
 pub mod tts;
 pub mod turn_registry;
diff --git a/src/ai/pronunciation.rs b/src/ai/pronunciation.rs
new file mode 100644
index 0000000..b9d7f6e
--- /dev/null
+++ b/src/ai/pronunciation.rs
@@ -0,0 +1,282 @@
+// User-configurable pronunciation overrides for TTS. Chatterbox mispronounces
+// place names ("Worcester"), initialisms ("WSL"), and clipped abbreviations
+// ("blvd"), so we rewrite them to phonetic spellings before synthesis.
+//
+// The map lives in a JSON file on the server — a flat object of
+// `"written form": "spoken form"` pairs, e.g.:
+//
+//   {
+//     "Worcester": "Wuster",
+//     "WSL": "W S L",
+//     "blvd": "boulevard",
+//     "Dr.": "Doctor"
+//   }
+//
+// Path comes from `TTS_PRONUNCIATIONS_PATH` (default `tts_pronunciations.json`
+// in the working directory). A missing file simply disables the feature. The
+// file is re-read whenever its mtime changes, so edits apply to the next
+// synthesis without a restart; a malformed edit keeps the last good map and
+// logs the parse error instead of silently dropping all overrides.
+//
+// Matching rules:
+// - Whole words only — `cat` never rewrites `category`. (Boundaries are only
+//   asserted next to word characters, so keys like `Dr.` still work.)
+// - Smartcase: an all-lowercase key matches case-insensitively; a key with
+//   any uppercase matches exactly. That lets `worcester` catch every casing
+//   while `US` (the country) leaves the pronoun `us` alone.
+// - Longer keys win over shorter ones (`New York Times` before `New York`).
+
+use regex::Regex;
+use std::collections::HashMap;
+use std::path::Path;
+use std::sync::{Arc, LazyLock, Mutex as StdMutex};
+use std::time::SystemTime;
+
+/// A compiled pronunciation map: one alternation regex over every key plus
+/// the lookup tables the replacement closure resolves matches against.
+#[derive(Default)]
+struct CompiledMap {
+    /// `None` when the map is empty or its pattern failed to compile — apply() is a no-op.
+    regex: Option<Regex>,
+    /// Case-sensitive entries (keys containing any uppercase character), keyed verbatim.
+    exact: HashMap<String, String>,
+    /// Case-insensitive entries (keys with no uppercase character), keyed by `to_lowercase()`.
+    folded: HashMap<String, String>,
+}
+
+impl CompiledMap {
+    /// Compile `entries` (written form → spoken form) into one alternation
+    /// regex plus the two lookup tables. Blank keys are skipped.
+    fn from_entries(entries: &HashMap<String, String>) -> Self {
+        let mut keys: Vec<&str> = entries
+            .keys()
+            .map(|k| k.as_str())
+            .filter(|k| !k.trim().is_empty())
+            .collect();
+        if keys.is_empty() {
+            return Self::default();
+        }
+        // Longest key first so overlapping entries prefer the more specific
+        // one (regex alternation is first-match-wins, not longest-match).
+        keys.sort_by(|a, b| b.len().cmp(&a.len()).then(a.cmp(b)));
+
+        // `\b` is defined by the regex engine's `\w`, which is not the set
+        // `char::is_alphanumeric` accepts: `²` and `½` are numeric but not
+        // `\w` (a trailing `\b` after `m²` could never match before a space),
+        // while combining marks are `\w` but not alphanumeric. Ask the
+        // engine itself; fall back to the approximation if that fails.
+        let word_char = Regex::new(r"^\w$").ok();
+        let is_word = |c: char| match &word_char {
+            Some(re) => re.is_match(c.encode_utf8(&mut [0u8; 4])),
+            None => c.is_alphanumeric() || c == '_',
+        };
+        let boundary = |c: Option<char>| if c.is_some_and(&is_word) { r"\b" } else { "" };
+
+        let mut exact = HashMap::new();
+        let mut folded = HashMap::new();
+        let alternatives: Vec<String> = keys
+            .iter()
+            .map(|key| {
+                let escaped = regex::escape(key);
+                // Only assert a word boundary where the key edge is a word
+                // character — `\b` adjacent to punctuation (e.g. the dot in
+                // `Dr.`) would otherwise never match.
+                let lead = boundary(key.chars().next());
+                let trail = boundary(key.chars().last());
+                let case_sensitive = key.chars().any(|c| c.is_uppercase());
+                if case_sensitive {
+                    exact.insert(key.to_string(), entries[*key].clone());
+                    format!("{lead}{escaped}{trail}")
+                } else {
+                    folded.insert(key.to_lowercase(), entries[*key].clone());
+                    format!("{lead}(?i:{escaped}){trail}")
+                }
+            })
+            .collect();
+
+        // Escaped fixed strings can't produce an invalid pattern; if one ever
+        // does, treat the whole map as empty rather than panicking a handler.
+        let pattern = alternatives.join("|");
+        let regex = match Regex::new(&pattern) {
+            Ok(r) => Some(r),
+            Err(e) => {
+                log::error!("pronunciation map failed to compile: {e}");
+                None
+            }
+        };
+        Self {
+            regex,
+            exact,
+            folded,
+        }
+    }
+
+    /// Replace every match in `text` with its configured spoken form. Text
+    /// outside matches (all of it when there is no regex) is returned as-is.
+    fn apply(&self, text: &str) -> String {
+        let Some(re) = &self.regex else {
+            return text.to_string();
+        };
+        re.replace_all(text, |caps: &regex::Captures| {
+            let m = &caps[0];
+            self.exact
+                .get(m)
+                .or_else(|| self.folded.get(&m.to_lowercase()))
+                .cloned()
+                // Unreachable in practice — every alternative came from one
+                // of the two maps — but never drop the user's text.
+                .unwrap_or_else(|| m.to_string())
+        })
+        .into_owned()
+    }
+}
+
+struct CacheEntry {
+    mtime: Option<SystemTime>, // file mtime seen at load; `None` when it could not be read
+    compiled: Arc<CompiledMap>, // map served for as long as the mtime stays the same
+}
+
+static CACHE: LazyLock<StdMutex<Option<CacheEntry>>> = LazyLock::new(|| StdMutex::new(None));
+
+fn config_path() -> String {
+    // Unset, non-Unicode, or blank values all fall back to the default file.
+    match std::env::var("TTS_PRONUNCIATIONS_PATH") {
+        Ok(raw) if !raw.trim().is_empty() => raw.trim().to_string(),
+        _ => "tts_pronunciations.json".to_string(),
+    }
+}
+
+/// Load the compiled map, re-reading the file only when its mtime changed
+/// since the last call (or it appeared/disappeared). Synthesis is serialized
+/// on a single GPU permit, so a stat per call is noise.
+fn current_map() -> Arc<CompiledMap> {
+    let path_s = config_path();
+    let path = Path::new(&path_s);
+    let mtime = std::fs::metadata(path).and_then(|m| m.modified()).ok();
+    // Recover from poisoning: the entry is swapped whole, never half-written.
+    let mut cache = CACHE.lock().unwrap_or_else(|e| e.into_inner());
+    if let Some(entry) = cache.as_ref()
+        && entry.mtime == mtime
+    {
+        return entry.compiled.clone();
+    }
+
+    let compiled = match mtime {
+        None => Arc::new(CompiledMap::default()), // no file → no overrides
+        Some(_) => match std::fs::read_to_string(path)
+            .map_err(anyhow::Error::from)
+            .and_then(|s| Ok(serde_json::from_str::<HashMap<String, String>>(&s)?))
+        {
+            Ok(entries) => {
+                log::info!(
+                    "loaded {} pronunciation override(s) from {path_s}",
+                    entries.len()
+                );
+                Arc::new(CompiledMap::from_entries(&entries))
+            }
+            Err(e) => {
+                log::error!("failed to load pronunciation map {path_s}: {e}");
+                // Keep serving the previous map rather than regressing to
+                // none mid-edit; still record the new mtime so the error
+                // logs once per bad save, not once per synthesis.
+                cache
+                    .as_ref()
+                    .map(|c| c.compiled.clone())
+                    .unwrap_or_default()
+            }
+        },
+    };
+    *cache = Some(CacheEntry {
+        mtime,
+        compiled: compiled.clone(),
+    });
+    compiled
+}
+
+/// Rewrite configured words/abbreviations to their phonetic spellings using
+/// the current (hot-reloaded) map. Call on cleaned text, right before synthesis.
+pub fn apply_pronunciations(text: &str) -> String {
+    current_map().apply(text)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    fn compile(pairs: &[(&str, &str)]) -> CompiledMap {
+        let entries = pairs
+            .iter()
+            .map(|(k, v)| (k.to_string(), v.to_string()))
+            .collect();
+        CompiledMap::from_entries(&entries)
+    }
+
+    #[test]
+    fn empty_map_is_a_noop() {
+        let m = compile(&[]);
+        assert_eq!(m.apply("nothing changes"), "nothing changes");
+    }
+
+    #[test]
+    fn replaces_whole_words_only() {
+        let m = compile(&[("cat", "kitty")]);
+        assert_eq!(m.apply("the cat sat"), "the kitty sat");
+        // No substring rewrites — neither as a prefix nor mid-word.
+        assert_eq!(m.apply("the category"), "the category");
+        assert_eq!(m.apply("concatenate"), "concatenate");
+    }
+
+    #[test]
+    fn lowercase_keys_match_any_casing() {
+        let m = compile(&[("worcester", "Wuster")]);
+        assert_eq!(m.apply("Worcester is nice"), "Wuster is nice");
+        assert_eq!(m.apply("in WORCESTER today"), "in Wuster today");
+        assert_eq!(m.apply("worcester sauce"), "Wuster sauce");
+    }
+
+    #[test]
+    fn uppercase_keys_match_case_sensitively() {
+        let m = compile(&[("US", "U S")]);
+        assert_eq!(m.apply("the US economy"), "the U S economy");
+        // The lowercase pronoun is left alone.
+        assert_eq!(m.apply("join us today"), "join us today");
+    }
+
+    #[test]
+    fn keys_with_punctuation_work() {
+        // `\b` is only asserted next to word characters, so the trailing dot
+        // doesn't break matching.
+        let m = compile(&[("Dr.", "Doctor"), ("blvd", "boulevard")]);
+        assert_eq!(
+            m.apply("Dr. Smith on Sunset blvd"),
+            "Doctor Smith on Sunset boulevard"
+        );
+    }
+
+    #[test]
+    fn longer_keys_win_over_shorter() {
+        let m = compile(&[("new york", "Noo York"), ("new york times", "the Times")]);
+        assert_eq!(m.apply("read the new york times"), "read the the Times");
+        assert_eq!(m.apply("visit new york soon"), "visit Noo York soon");
+    }
+
+    #[test]
+    fn multiple_occurrences_all_rewrite() {
+        let m = compile(&[("wsl", "W S L")]);
+        assert_eq!(m.apply("WSL and wsl and Wsl"), "W S L and W S L and W S L");
+    }
+
+    #[test]
+    fn replacement_text_is_verbatim() {
+        // Replacements aren't re-scanned — a value containing another key
+        // doesn't cascade.
+        let m = compile(&[("a1", "b2"), ("b2", "c3")]);
+        assert_eq!(m.apply("a1"), "b2");
+    }
+
+    #[test]
+    fn blank_keys_are_ignored() {
+        let m = compile(&[("", "x"), ("  ", "y"), ("ok", "fine")]);
+        assert_eq!(m.apply("ok then"), "fine then");
+    }
+}
diff --git a/src/ai/tts.rs b/src/ai/tts.rs
index f76810d..a0b9bd2 100644
--- a/src/ai/tts.rs
+++ b/src/ai/tts.rs
@@ -254,6 +254,14 @@ fn clean_for_tts(input: &str) -> String {
     s.trim().to_string()
 }
 
+/// Prepare request text for synthesis. Markdown/emoji cleanup runs first so
+/// `**WSL**`-style markup can't hide word boundaries; the user's pronunciation
+/// overrides (see [`crate::ai::pronunciation`]) then rewrite the plain text.
+fn prepare_for_tts(input: &str) -> String {
+    let cleaned = clean_for_tts(input);
+    crate::ai::pronunciation::apply_pronunciations(&cleaned)
+}
+
 /// Decode an audio/video file to mono 24 kHz WAV via ffmpeg, returning the WAV
 /// bytes. Chatterbox validates the reference clip by file *extension* and
 /// rejects several formats (e.g. `.aac`, `.opus`), so we always normalize to
@@ -337,7 +345,7 @@ pub async fn tts_speech_handler(
     let parent_context = extract_context_from_request(&http_request);
     let mut span = global_tracer().start_with_context("http.tts.speech", &parent_context);
 
-    let text = clean_for_tts(&req.text);
+    let text = prepare_for_tts(&req.text);
     if text.is_empty() {
         span.set_status(Status::error("text is required"));
         return HttpResponse::BadRequest().json(json!({ "error": "text is required" }));
@@ -435,7 +443,7 @@ pub async fn create_speech_job_handler(
     let mut span =
         global_tracer().start_with_context("http.tts.speech_job.create", &parent_context);
 
-    let text = clean_for_tts(&req.text);
+    let text = prepare_for_tts(&req.text);
     if text.is_empty() {
         span.set_status(Status::error("text is required"));
         return HttpResponse::BadRequest().json(json!({ "error": "text is required" }));
diff --git a/tts_pronunciations.example.json b/tts_pronunciations.example.json
new file mode 100644
index 0000000..9bc9df9
--- /dev/null
+++ b/tts_pronunciations.example.json
@@ -0,0 +1,13 @@
+{
+  "Worcester": "Wuster",
+  "Spokane": "Spo can",
+  "wsl": "W S L",
+  "sql": "sequel",
+  "api": "A P I",
+  "US": "U S",
+  "Dr.": "Doctor",
+  "St.": "Saint",
+  "blvd": "boulevard",
+  "vs.": "versus",
+  "etc.": "et cetera"
+}