Give TTS synthesis its own (longer) request timeout

Long insights are chunked and synthesized server-side, so a single /tts/speech request can run past the shared 180s chat/embedding client timeout and fail with a spurious timeout. /tts/speech now sets a per-request timeout read from LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS (default 600 seconds; unset, unparsable, or zero values fall back to the default), which overrides the client default without affecting chat/embeddings.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-03 10:25:06 -04:00
parent 9978b28b52
commit d8dd260c6b
4 changed files with 17 additions and 0 deletions
@@ -170,9 +170,19 @@ impl LlamaCppClient {
            body["temperature"] = json!(x);
        }

+        // TTS gets its own (longer) timeout: synthesizing a long, internally
+        // chunked insight can take minutes, well past the shared chat/embedding
+        // client timeout. Per-request `.timeout()` overrides the client default.
+        let tts_timeout = std::env::var("LLAMA_SWAP_TTS_REQUEST_TIMEOUT_SECONDS")
+            .ok()
+            .and_then(|v| v.parse::<u64>().ok())
+            .filter(|n| *n > 0)
+            .unwrap_or(600);
+
        let resp = self
            .client
            .post(&url)
+            .timeout(Duration::from_secs(tts_timeout))
            .json(&body)
            .send()
            .await