diff --git a/.env.example b/.env.example index 2b6cff0..a45fdd5 100644 --- a/.env.example +++ b/.env.example @@ -88,6 +88,7 @@ AGENTIC_CHAT_MAX_ITERATIONS=6 # LLAMA_SWAP_TTS_MODEL=chatterbox # TTS model id in config.yaml # LLAMA_SWAP_TTS_VOICE=m # default voice when a request omits one # LLAMA_SWAP_TTS_REF_SECONDS=30 # max voice-clone reference clip length (s) +# LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS=600 # synth timeout (long chunked text) # ── AI Insights — sibling services (optional) ─────────────────────────── # Apollo (places, face inference, CLIP encoders). Single-Apollo deploys diff --git a/CLAUDE.md b/CLAUDE.md index b5e1ee2..fba33e0 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -666,6 +666,8 @@ LLAMA_SWAP_TTS_MODEL=chatterbox # TTS model id in config.yaml (de LLAMA_SWAP_TTS_VOICE=m # Default voice when /tts/speech omits one (optional) LLAMA_SWAP_TTS_REF_SECONDS=30 # Max voice-clone reference clip length, seconds # (Chatterbox is zero-shot; ~10-20s clean ref is ideal) +LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS=600 # Per-request synth timeout (long chunked insights take + # minutes); overrides the shared client timeout for /tts/speech # Insight Chat Continuation AGENTIC_CHAT_MAX_ITERATIONS=6 # Cap on tool-calling iterations per chat turn (default 6) diff --git a/README.md b/README.md index 0b678df..58ddc81 100644 --- a/README.md +++ b/README.md @@ -169,6 +169,10 @@ Env: [default: `30`]. Reference audio is ffmpeg-normalized to mono 24 kHz WAV (so any source format works); Chatterbox is zero-shot, so a clean ~10–20s sample is the sweet spot — more rarely helps. +- `LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS` - per-request synthesis timeout in + seconds [default: `600`]. Long insights are chunked + synthesized server-side + and can take minutes; this is separate from (and overrides, for `/tts/speech`) + the shared `LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS`. 
#### Fallback Behavior - Primary server is tried first with 5-second connection timeout diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs index 2946688..d56b645 100644 --- a/src/ai/llamacpp.rs +++ b/src/ai/llamacpp.rs @@ -170,9 +170,19 @@ impl LlamaCppClient { body["temperature"] = json!(x); } + // TTS gets its own (longer) timeout: synthesizing a long, internally + // chunked insight can take minutes, well past the shared chat/embedding + // client timeout. Per-request `.timeout()` overrides the client default. + let tts_timeout = std::env::var("LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS") + .ok() + .and_then(|v| v.parse::<u64>().ok()) + .filter(|n| *n > 0) + .unwrap_or(600); + let resp = self .client .post(&url) + .timeout(Duration::from_secs(tts_timeout)) .json(&body) .send() .await