diff --git a/CLAUDE.md b/CLAUDE.md index 7f1da76..b5e1ee2 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -477,6 +477,12 @@ GET /insights/models (local-backend models + capabilities; Ollam GET /insights/openrouter/models (curated OpenRouter allowlist) POST /insights/rate (thumbs up/down for training data) +// Text-to-Speech (Chatterbox via llama-swap; needs LLAMA_SWAP_URL) +POST /tts/speech (read-aloud: { text, voice?, ... } -> { audio_base64, format }) +GET /tts/voices (Chatterbox voice library) +POST /tts/voices/upload (clone a voice from an uploaded clip; multipart) +POST /tts/voices/from-library (clone a voice from a library audio/video file) + // Insight Chat Continuation POST /insights/chat (single-turn reply, non-streaming) POST /insights/chat/stream (SSE: text / tool_call / tool_result / truncated / done) @@ -652,6 +658,15 @@ LLAMA_SWAP_ALLOWED_MODELS=chat,coder # Curated allowlist surfaced by # Empty = picker shows only the configured primary model. LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180 # Per-request timeout; bump for slow CPU offload +# Text-to-speech (Chatterbox served behind llama-swap). Only needs +# LLAMA_SWAP_URL — independent of LLM_BACKEND. Powers /tts/speech (read-aloud) +# and /tts/voices* (voice cloning). Reference audio is ffmpeg-normalized to WAV +# server-side, so any source format ffmpeg can decode works. +LLAMA_SWAP_TTS_MODEL=chatterbox # TTS model id in config.yaml (default: chatterbox) +LLAMA_SWAP_TTS_VOICE=m # Default voice when /tts/speech omits one (optional) +LLAMA_SWAP_TTS_REF_SECONDS=30 # Max voice-clone reference clip length, in seconds + # (Chatterbox is zero-shot; ~10-20s clean ref is ideal) + # Insight Chat Continuation AGENTIC_CHAT_MAX_ITERATIONS=6 # Cap on tool-calling iterations per chat turn (default 6) ```