Add user-configurable TTS pronunciation overrides
A JSON map (TTS_PRONUNCIATIONS_PATH, default tts_pronunciations.json) rewrites mispronounced words — place names, initialisms, dotted abbreviations — to phonetic spellings before synthesis, applied after markdown cleanup in both /tts/speech paths. Whole-word smartcase matching (lowercase keys match any casing, uppercase keys exact), longest key wins, hot-reloaded on mtime change with last-good fallback on parse errors. See tts_pronunciations.example.json. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -12,6 +12,7 @@ pub mod llm_client;
|
||||
pub mod local_llm;
|
||||
pub mod ollama;
|
||||
pub mod openrouter;
|
||||
pub mod pronunciation;
|
||||
pub mod sms_client;
|
||||
pub mod tts;
|
||||
pub mod turn_registry;
|
||||
|
||||
@@ -0,0 +1,282 @@
|
||||
// User-configurable pronunciation overrides for TTS. Chatterbox mispronounces
|
||||
// place names ("Worcester"), initialisms ("WSL"), and clipped abbreviations
|
||||
// ("blvd"), so we rewrite them to phonetic spellings before synthesis.
|
||||
//
|
||||
// The map lives in a JSON file on the server — a flat object of
|
||||
// `"written form": "spoken form"` pairs, e.g.:
|
||||
//
|
||||
// {
|
||||
// "Worcester": "Wuster",
|
||||
// "WSL": "W S L",
|
||||
// "blvd": "boulevard",
|
||||
// "Dr.": "Doctor"
|
||||
// }
|
||||
//
|
||||
// Path comes from `TTS_PRONUNCIATIONS_PATH` (default `tts_pronunciations.json`
|
||||
// in the working directory). A missing file simply disables the feature. The
|
||||
// file is re-read whenever its mtime changes, so edits apply to the next
|
||||
// synthesis without a restart; a malformed edit keeps the last good map and
|
||||
// logs the parse error instead of silently dropping all overrides.
|
||||
//
|
||||
// Matching rules:
|
||||
// - Whole words only — `cat` never rewrites `category`. (Boundaries are only
|
||||
// asserted next to word characters, so keys like `Dr.` still work.)
|
||||
// - Smartcase: an all-lowercase key matches case-insensitively; a key with
|
||||
// any uppercase matches exactly. That lets `worcester` catch every casing
|
||||
// while `US` (the country) leaves the pronoun `us` alone.
|
||||
// - Longer keys win over shorter ones (`New York Times` before `New York`).
|
||||
|
||||
use regex::Regex;
|
||||
use std::collections::HashMap;
|
||||
use std::path::Path;
|
||||
use std::sync::{Arc, LazyLock, Mutex as StdMutex};
|
||||
use std::time::SystemTime;
|
||||
|
||||
/// A compiled pronunciation map: one alternation regex over every key plus
|
||||
/// the lookup tables the replacement closure resolves matches against.
|
||||
#[derive(Default)]
|
||||
struct CompiledMap {
|
||||
/// `None` when the map is empty — apply() is then a no-op.
|
||||
regex: Option<Regex>,
|
||||
/// Case-sensitive entries, keyed verbatim.
|
||||
exact: HashMap<String, String>,
|
||||
/// Case-insensitive entries, keyed lowercased.
|
||||
folded: HashMap<String, String>,
|
||||
}
|
||||
|
||||
impl CompiledMap {
|
||||
fn from_entries(entries: &HashMap<String, String>) -> Self {
|
||||
let mut keys: Vec<&str> = entries
|
||||
.keys()
|
||||
.map(|k| k.as_str())
|
||||
.filter(|k| !k.trim().is_empty())
|
||||
.collect();
|
||||
if keys.is_empty() {
|
||||
return Self::default();
|
||||
}
|
||||
// Longest key first so overlapping entries prefer the more specific
|
||||
// one (regex alternation is first-match-wins, not longest-match).
|
||||
keys.sort_by(|a, b| b.len().cmp(&a.len()).then(a.cmp(b)));
|
||||
|
||||
let mut exact = HashMap::new();
|
||||
let mut folded = HashMap::new();
|
||||
let alternatives: Vec<String> = keys
|
||||
.iter()
|
||||
.map(|key| {
|
||||
let escaped = regex::escape(key);
|
||||
// Only assert a word boundary where the key edge is a word
|
||||
// character — `\b` adjacent to punctuation (e.g. the dot in
|
||||
// `Dr.`) would otherwise never match.
|
||||
let lead = if key
|
||||
.chars()
|
||||
.next()
|
||||
.is_some_and(|c| c.is_alphanumeric() || c == '_')
|
||||
{
|
||||
r"\b"
|
||||
} else {
|
||||
""
|
||||
};
|
||||
let trail = if key
|
||||
.chars()
|
||||
.last()
|
||||
.is_some_and(|c| c.is_alphanumeric() || c == '_')
|
||||
{
|
||||
r"\b"
|
||||
} else {
|
||||
""
|
||||
};
|
||||
let case_sensitive = key.chars().any(|c| c.is_uppercase());
|
||||
if case_sensitive {
|
||||
exact.insert(key.to_string(), entries[*key].clone());
|
||||
format!("{lead}{escaped}{trail}")
|
||||
} else {
|
||||
folded.insert(key.to_lowercase(), entries[*key].clone());
|
||||
format!("{lead}(?i:{escaped}){trail}")
|
||||
}
|
||||
})
|
||||
.collect();
|
||||
|
||||
// Escaped fixed strings can't produce an invalid pattern; if one ever
|
||||
// does, treat the whole map as empty rather than panicking a handler.
|
||||
let pattern = alternatives.join("|");
|
||||
let regex = match Regex::new(&pattern) {
|
||||
Ok(r) => Some(r),
|
||||
Err(e) => {
|
||||
log::error!("pronunciation map failed to compile: {e}");
|
||||
None
|
||||
}
|
||||
};
|
||||
Self {
|
||||
regex,
|
||||
exact,
|
||||
folded,
|
||||
}
|
||||
}
|
||||
|
||||
fn apply(&self, text: &str) -> String {
|
||||
let Some(re) = &self.regex else {
|
||||
return text.to_string();
|
||||
};
|
||||
re.replace_all(text, |caps: ®ex::Captures| {
|
||||
let m = &caps[0];
|
||||
self.exact
|
||||
.get(m)
|
||||
.or_else(|| self.folded.get(&m.to_lowercase()))
|
||||
.cloned()
|
||||
// Unreachable in practice — every alternative came from one
|
||||
// of the two maps — but never drop the user's text.
|
||||
.unwrap_or_else(|| m.to_string())
|
||||
})
|
||||
.into_owned()
|
||||
}
|
||||
}
|
||||
|
||||
struct CacheEntry {
|
||||
mtime: Option<SystemTime>,
|
||||
compiled: Arc<CompiledMap>,
|
||||
}
|
||||
|
||||
static CACHE: LazyLock<StdMutex<Option<CacheEntry>>> = LazyLock::new(|| StdMutex::new(None));
|
||||
|
||||
/// Resolve the pronunciation file path: the trimmed value of
/// `TTS_PRONUNCIATIONS_PATH` when it is set and not blank, otherwise
/// `tts_pronunciations.json`.
fn config_path() -> String {
    match std::env::var("TTS_PRONUNCIATIONS_PATH") {
        Ok(raw) if !raw.trim().is_empty() => raw.trim().to_string(),
        // Unset, unreadable, or whitespace-only: fall back to the default.
        _ => "tts_pronunciations.json".to_string(),
    }
}
|
||||
|
||||
/// Load the compiled map, re-reading the file only when its mtime changed
|
||||
/// since the last call (or it appeared/disappeared). Synthesis is serialized
|
||||
/// on a single GPU permit, so a stat per call is noise.
|
||||
fn current_map() -> Arc<CompiledMap> {
|
||||
let path_s = config_path();
|
||||
let path = Path::new(&path_s);
|
||||
let mtime = std::fs::metadata(path).and_then(|m| m.modified()).ok();
|
||||
|
||||
let mut cache = CACHE.lock().unwrap();
|
||||
if let Some(entry) = cache.as_ref()
|
||||
&& entry.mtime == mtime
|
||||
{
|
||||
return entry.compiled.clone();
|
||||
}
|
||||
|
||||
let compiled = match mtime {
|
||||
None => Arc::new(CompiledMap::default()), // no file → no overrides
|
||||
Some(_) => match std::fs::read_to_string(path)
|
||||
.map_err(anyhow::Error::from)
|
||||
.and_then(|s| Ok(serde_json::from_str::<HashMap<String, String>>(&s)?))
|
||||
{
|
||||
Ok(entries) => {
|
||||
log::info!(
|
||||
"loaded {} pronunciation override(s) from {path_s}",
|
||||
entries.len()
|
||||
);
|
||||
Arc::new(CompiledMap::from_entries(&entries))
|
||||
}
|
||||
Err(e) => {
|
||||
log::error!("failed to load pronunciation map {path_s}: {e}");
|
||||
// Keep serving the previous map rather than regressing to
|
||||
// none mid-edit; still record the new mtime so the error
|
||||
// logs once per bad save, not once per synthesis.
|
||||
cache
|
||||
.as_ref()
|
||||
.map(|c| c.compiled.clone())
|
||||
.unwrap_or_default()
|
||||
}
|
||||
},
|
||||
};
|
||||
*cache = Some(CacheEntry {
|
||||
mtime,
|
||||
compiled: compiled.clone(),
|
||||
});
|
||||
compiled
|
||||
}
|
||||
|
||||
/// Rewrite configured words/abbreviations to their phonetic spellings.
|
||||
/// Call on cleaned (post-markdown-strip) text, right before synthesis.
|
||||
pub fn apply_pronunciations(text: &str) -> String {
|
||||
current_map().apply(text)
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a `CompiledMap` from literal `(written, spoken)` pairs.
    fn map_of(pairs: &[(&str, &str)]) -> CompiledMap {
        let mut entries = HashMap::new();
        for &(written, spoken) in pairs {
            entries.insert(written.to_string(), spoken.to_string());
        }
        CompiledMap::from_entries(&entries)
    }

    /// Assert that each `(input, expected)` case rewrites as stated.
    fn assert_rewrites(map: &CompiledMap, cases: &[(&str, &str)]) {
        for &(input, expected) in cases {
            assert_eq!(map.apply(input), expected);
        }
    }

    #[test]
    fn empty_map_is_a_noop() {
        let map = map_of(&[]);
        assert_rewrites(&map, &[("nothing changes", "nothing changes")]);
    }

    #[test]
    fn replaces_whole_words_only() {
        let map = map_of(&[("cat", "kitty")]);
        assert_rewrites(
            &map,
            &[
                ("the cat sat", "the kitty sat"),
                // Substrings of longer words are left alone.
                ("the category", "the category"),
                ("concatenate", "concatenate"),
            ],
        );
    }

    #[test]
    fn lowercase_keys_match_any_casing() {
        let map = map_of(&[("worcester", "Wuster")]);
        assert_rewrites(
            &map,
            &[
                ("Worcester is nice", "Wuster is nice"),
                ("in WORCESTER today", "in Wuster today"),
                ("worcester sauce", "Wuster sauce"),
            ],
        );
    }

    #[test]
    fn uppercase_keys_match_case_sensitively() {
        let map = map_of(&[("US", "U S")]);
        assert_rewrites(
            &map,
            &[
                ("the US economy", "the U S economy"),
                // The lowercase pronoun must not be rewritten.
                ("join us today", "join us today"),
            ],
        );
    }

    #[test]
    fn keys_with_punctuation_work() {
        // A key ending in a dot gets no trailing `\b`, so it still matches.
        let map = map_of(&[("Dr.", "Doctor"), ("blvd", "boulevard")]);
        assert_rewrites(
            &map,
            &[(
                "Dr. Smith on Sunset blvd",
                "Doctor Smith on Sunset boulevard",
            )],
        );
    }

    #[test]
    fn longer_keys_win_over_shorter() {
        let map = map_of(&[("new york", "Noo York"), ("new york times", "the Times")]);
        assert_rewrites(
            &map,
            &[
                ("read the new york times", "read the the Times"),
                ("visit new york soon", "visit Noo York soon"),
            ],
        );
    }

    #[test]
    fn multiple_occurrences_all_rewrite() {
        let map = map_of(&[("wsl", "W S L")]);
        assert_rewrites(&map, &[("WSL and wsl and Wsl", "W S L and W S L and W S L")]);
    }

    #[test]
    fn replacement_text_is_verbatim() {
        // Output is not re-scanned: a value that equals another key must not
        // trigger a second rewrite.
        let map = map_of(&[("a1", "b2"), ("b2", "c3")]);
        assert_rewrites(&map, &[("a1", "b2")]);
    }

    #[test]
    fn blank_keys_are_ignored() {
        let map = map_of(&[("", "x"), (" ", "y"), ("ok", "fine")]);
        assert_rewrites(&map, &[("ok then", "fine then")]);
    }
}
|
||||
+10
-2
@@ -254,6 +254,14 @@ fn clean_for_tts(input: &str) -> String {
|
||||
s.trim().to_string()
|
||||
}
|
||||
|
||||
/// Full text-preparation pipeline for synthesis: markdown/emoji cleanup, then
|
||||
/// the user's pronunciation overrides (see [`crate::ai::pronunciation`]) on
|
||||
/// the resulting plain text — after cleanup so word boundaries aren't
|
||||
/// obscured by `**WSL**`-style markup.
|
||||
fn prepare_for_tts(input: &str) -> String {
|
||||
crate::ai::pronunciation::apply_pronunciations(&clean_for_tts(input))
|
||||
}
|
||||
|
||||
/// Decode an audio/video file to mono 24 kHz WAV via ffmpeg, returning the WAV
|
||||
/// bytes. Chatterbox validates the reference clip by file *extension* and
|
||||
/// rejects several formats (e.g. `.aac`, `.opus`), so we always normalize to
|
||||
@@ -337,7 +345,7 @@ pub async fn tts_speech_handler(
|
||||
let parent_context = extract_context_from_request(&http_request);
|
||||
let mut span = global_tracer().start_with_context("http.tts.speech", &parent_context);
|
||||
|
||||
let text = clean_for_tts(&req.text);
|
||||
let text = prepare_for_tts(&req.text);
|
||||
if text.is_empty() {
|
||||
span.set_status(Status::error("text is required"));
|
||||
return HttpResponse::BadRequest().json(json!({ "error": "text is required" }));
|
||||
@@ -435,7 +443,7 @@ pub async fn create_speech_job_handler(
|
||||
let mut span =
|
||||
global_tracer().start_with_context("http.tts.speech_job.create", &parent_context);
|
||||
|
||||
let text = clean_for_tts(&req.text);
|
||||
let text = prepare_for_tts(&req.text);
|
||||
if text.is_empty() {
|
||||
span.set_status(Status::error("text is required"));
|
||||
return HttpResponse::BadRequest().json(json!({ "error": "text is required" }));
|
||||
|
||||
Reference in New Issue
Block a user