// ImageApi/src/ai/tts.rs

// TTS endpoints: proxy text-to-speech + voice-library management to the
// Chatterbox server that sits behind llama-swap (via LlamaCppClient). Speech
// synthesis returns audio as base64-in-JSON so the mobile app can play it as a
// `data:` URI without a binary-fetch path. Voice cloning registers a named
// voice from either an uploaded clip (device) or an existing library file; in
// both cases ffmpeg normalizes the source (an audio file, or a video's audio
// track) to a mono 24 kHz WAV reference clip before it is forwarded.

use actix_multipart::Multipart;
use actix_web::{HttpRequest, HttpResponse, Responder, get, post, web};
use anyhow::Context;
use base64::Engine;
use bytes::{BufMut, BytesMut};
use futures::StreamExt;
use opentelemetry::KeyValue;
use opentelemetry::trace::{Span, Status, Tracer};
use regex::Regex;
use serde::{Deserialize, Serialize};
use serde_json::json;
use std::path::Path;
use std::sync::LazyLock;
use tokio::sync::Semaphore;

use crate::data::Claims;
use crate::file_types::{is_audio_file, is_video_file};
use crate::files::is_valid_full_path;
use crate::libraries;
use crate::otel::{extract_context_from_request, global_tracer};
use crate::state::AppState;

/// Hard cap, in bytes, on an uploaded voice-reference clip. Chatterbox itself
/// caps the payload (~60s clip); this is a defensive ceiling so a
/// hostile/oversized upload can't balloon ImageApi memory before we ever
/// forward it. The upload handler compares it against the running total while
/// buffering file chunks and answers 413 as soon as the next chunk would
/// exceed it.
const MAX_VOICE_UPLOAD_BYTES: usize = 25 * 1024 * 1024; // 25 MiB (reported to clients as "25 MB")

/// Serialize speech synthesis: the Chatterbox server has no internal lock or
/// queue, so concurrent requests contend on the single GPU and cascade into
/// timeouts. One permit; when busy we fast-fail with 429 rather than queue —
/// the app surfaces "busy" immediately, and typical jobs clear in well under a
/// minute. (An abandoned upstream job can still occupy the GPU until it
/// finishes — that's a wrapper limitation; the chunked-queue plan fixes it.)
///
/// The speech handler takes the permit with `try_acquire` (never waits) and
/// holds it until the handler returns; the voice-library endpoints do not
/// touch it.
static TTS_PERMIT: LazyLock<Semaphore> = LazyLock::new(|| Semaphore::new(1));

/// Sanitize a user-supplied voice name.
///
/// The name is forwarded to Chatterbox where it becomes a filename in the
/// voice-library directory, so it is restricted to a safe charset: ASCII
/// alphanumerics, `-` and `_`. Every other character (path separators, dots,
/// whitespace, non-ASCII, …) is replaced by `-`, leading/trailing dashes are
/// dropped, and the result is bounded to 64 characters.
///
/// Returns `None` when nothing usable remains (empty or all-unsafe input).
/// A returned name is never empty and never starts or ends with `-`.
fn sanitize_voice_name(raw: &str) -> Option<String> {
    const MAX_LEN: usize = 64;

    // Whitespace is mapped to '-' like any other unsafe char and removed by
    // the edge trim below, so no separate `trim()` pass is needed.
    let mapped: String = raw
        .chars()
        .map(|c| {
            if c.is_ascii_alphanumeric() || c == '-' || c == '_' {
                c
            } else {
                '-'
            }
        })
        .collect();

    let trimmed = mapped.trim_matches('-');
    // `mapped` is pure ASCII at this point, so a byte range is a char range
    // and `get` can only miss when the name is already shorter than the cap.
    let bounded = trimmed.get(..MAX_LEN).unwrap_or(trimmed);
    // Cutting at the cap can expose an interior dash as the new last
    // character; trim once more so the edge guarantee survives truncation.
    let bounded = bounded.trim_end_matches('-');

    (!bounded.is_empty()).then(|| bounded.to_string())
}

/// Fallback voice used when a synthesis request does not name one.
///
/// Reads `LLAMA_SWAP_TTS_VOICE` (e.g. `LLAMA_SWAP_TTS_VOICE=m` to read
/// insights in a cloned voice by default). The value is whitespace-trimmed;
/// an unset, unreadable, or blank variable yields `None`.
fn default_voice() -> Option<String> {
    let configured = std::env::var("LLAMA_SWAP_TTS_VOICE").ok()?;
    match configured.trim() {
        "" => None,
        name => Some(name.to_owned()),
    }
}

// Markdown / formatting strippers, compiled once on first use. Insight text is
// markdown, which TTS would otherwise read literally ("star star bold star
// star"). `clean_for_tts` applies them in declaration order.

// `![alt](target)` — group 1 captures the (possibly empty) alt text.
static MD_IMAGE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"!\[([^\]]*)\]\([^)]*\)").unwrap());
// `[text](target)` — group 1 captures the non-empty link text.
static MD_LINK: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\[([^\]]+)\]\([^)]*\)").unwrap());
// Line-leading heading marker: up to three whitespace chars, 1–6 `#`, and any
// whitespace that follows.
static MD_HEADING: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}#{1,6}\s*").unwrap());
// Line-leading blockquote marker `>` plus at most one following whitespace char.
static MD_BLOCKQUOTE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}>\s?").unwrap());
// Line-leading list marker: a `-`, `*`, `+` bullet or a `1.`-style number,
// followed by at least one whitespace char.
static MD_LIST: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(?m)^\s{0,3}([-*+]|\d+\.)\s+").unwrap());
// Any run of `*`, `_`, backtick or `~`, anywhere in the text — this also
// removes underscores inside words such as `snake_case`.
static MD_EMPHASIS: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[*_`~]+").unwrap());
// A bare http(s) URL up to the next whitespace.
static URL_RE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"https?://\S+").unwrap());
// Two or more consecutive spaces/tabs (newlines are not included).
static MULTISPACE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"[ \t]{2,}").unwrap());
// Any run of 2+ newlines (incl. whitespace-only blank lines) collapses to ONE
// newline: Chatterbox inserts a long pause (sometimes ~20s of silence) per
// blank line, so paragraph breaks must reach it as a single line break at most.
// The pattern only recognizes `\n`; a `\r` between newlines is not matched.
static MULTINEWLINE: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\n(?:[ \t]*\n)+").unwrap());

/// Reports whether `c` is an emoji / pictographic symbol, which most TTS
/// models either skip or mispronounce.
///
/// Matches the zero-width joiner (used to glue emoji sequences) and every
/// code point inside the inclusive blocks listed in `STRIPPED_BLOCKS`.
/// `[bracketed]` tags are deliberately NOT covered — non-turbo Chatterbox
/// ignores them, and a future Turbo switch uses them as paralinguistic cues.
fn is_emoji_like(c: char) -> bool {
    const ZERO_WIDTH_JOINER: char = '\u{200D}';
    // (first, last) code point of each stripped block, both ends inclusive.
    const STRIPPED_BLOCKS: [(char, char); 5] = [
        ('\u{1F000}', '\u{1FAFF}'), // emoji, pictographs, supplemental symbols, flags
        ('\u{2300}', '\u{23FF}'),   // misc technical (⌚ ⏰ ⏳ …)
        ('\u{2600}', '\u{27BF}'),   // misc symbols + dingbats
        ('\u{2B00}', '\u{2BFF}'),   // misc symbols & arrows (⬆ ⭐ …)
        ('\u{FE00}', '\u{FE0F}'),   // variation selectors
    ];

    c == ZERO_WIDTH_JOINER
        || STRIPPED_BLOCKS
            .iter()
            .any(|&(first, last)| (first..=last).contains(&c))
}

/// Normalize insight text for speech.
///
/// Steps, in order: normalize line endings to `\n`, unwrap markdown
/// images/links to their visible text, drop heading / blockquote / list /
/// emphasis markers, replace bare URLs with a space, strip emoji, collapse
/// runs of spaces/tabs to one space and runs of blank lines to a single
/// newline, then trim the ends. Centralized here so every caller (app, WebUI,
/// curl) gets clean audio.
///
/// Returns an empty string when nothing speakable remains.
fn clean_for_tts(input: &str) -> String {
    // The blank-line collapse below only recognizes `\n`. Text that arrives
    // with Windows (`\r\n`) or old-Mac (`\r`) line endings would otherwise
    // keep its blank lines and trigger Chatterbox's long per-blank-line
    // pause, so bring every line ending to `\n` up front.
    let s = input.replace("\r\n", "\n").replace('\r', "\n");

    let s = MD_IMAGE.replace_all(&s, "$1");
    let s = MD_LINK.replace_all(&s, "$1");
    let s = MD_HEADING.replace_all(&s, "");
    let s = MD_BLOCKQUOTE.replace_all(&s, "");
    let s = MD_LIST.replace_all(&s, "");
    let s = MD_EMPHASIS.replace_all(&s, "");
    // A space (not "") so the words on either side of a URL stay separated;
    // the doubled space this can leave is collapsed by MULTISPACE below.
    let s = URL_RE.replace_all(&s, " ");
    let s: String = s.chars().filter(|c| !is_emoji_like(*c)).collect();
    let s = MULTISPACE.replace_all(&s, " ");
    let s = MULTINEWLINE.replace_all(&s, "\n");
    s.trim().to_string()
}

/// Decode an audio/video file to mono 24 kHz WAV via ffmpeg, returning the WAV
/// bytes.
///
/// Chatterbox validates the reference clip by file *extension* and rejects
/// several formats (e.g. `.aac`, `.opus`), so we always normalize to WAV
/// regardless of the source container. Only the first N seconds are kept,
/// where N comes from `LLAMA_SWAP_TTS_REF_SECONDS` (default 30) — references
/// only need a few seconds of clean speech.
///
/// # Errors
/// Fails when the temp output file cannot be created, ffmpeg cannot be
/// spawned, ffmpeg exits unsuccessfully (its stderr is included in the error),
/// or the transcoded file cannot be read back.
async fn run_ffmpeg_to_wav(input_path: &str) -> anyhow::Result<Vec<u8>> {
    // Deleted automatically when `out` is dropped, i.e. on every exit path.
    let out = tempfile::Builder::new()
        .suffix(".wav")
        .tempfile()
        .context("creating temp wav")?;

    // Cap the reference clip length. Chatterbox is zero-shot — a clean ~10–20s
    // sample is the sweet spot and more rarely helps — so we use the first N
    // seconds. Unset, unparsable, or zero values fall back to 30.
    let secs = std::env::var("LLAMA_SWAP_TTS_REF_SECONDS")
        .ok()
        .and_then(|s| s.trim().parse::<u32>().ok())
        .filter(|n| *n > 0)
        .unwrap_or(30)
        .to_string();

    let output = tokio::process::Command::new("ffmpeg")
        // -hide_banner keeps the build/config banner out of the stderr we log
        // on failure; -y lets ffmpeg overwrite the (already created) temp file.
        .args(["-hide_banner", "-y", "-i"])
        .arg(input_path)
        // -vn: drop any video stream; mono, 24 kHz, first `secs` seconds.
        .args(["-vn", "-ac", "1", "-ar", "24000", "-t"])
        .arg(&secs)
        .args(["-f", "wav"])
        // Hand the temp path over as a path rather than a lossily converted
        // string, so it reaches ffmpeg unmodified.
        .arg(out.path())
        // If the request is abandoned (client disconnect drops this future),
        // stop ffmpeg instead of leaving it running against a temp file that
        // is about to be deleted.
        .kill_on_drop(true)
        .output()
        .await
        .context("spawning ffmpeg")?;

    if !output.status.success() {
        anyhow::bail!("ffmpeg failed: {}", String::from_utf8_lossy(&output.stderr));
    }
    std::fs::read(out.path()).context("reading transcoded audio")
}

/// Transcode an in-memory upload to WAV.
///
/// The bytes are staged in a temp file whose suffix is `.{src_ext}` (kept as a
/// probe hint for ffmpeg) or `.bin` when no usable extension is given, and
/// that file is then handed to `run_ffmpeg_to_wav`. The staging file is
/// removed when this function returns.
async fn transcode_bytes_to_wav(input: &[u8], src_ext: Option<&str>) -> anyhow::Result<Vec<u8>> {
    let suffix = match src_ext {
        Some(ext) if !ext.is_empty() => format!(".{ext}"),
        _ => String::from(".bin"),
    };

    let staged = tempfile::Builder::new()
        .suffix(&suffix)
        .tempfile()
        .context("creating temp input")?;
    std::fs::write(staged.path(), input).context("writing temp input")?;

    let staged_path = staged.path().to_string_lossy();
    run_ffmpeg_to_wav(&staged_path).await
}

/// JSON body of `POST /tts/speech`. Only `text` is required; every other
/// field may be omitted.
#[derive(Debug, Deserialize)]
pub struct TtsSpeechRequest {
    /// Text to speak. The handler runs it through `clean_for_tts` first and
    /// rejects the request if nothing remains afterwards.
    pub text: String,
    /// Name of the voice to use. When missing or empty the handler falls back
    /// to the `LLAMA_SWAP_TTS_VOICE` default, if one is configured.
    #[serde(default)]
    pub voice: Option<String>,
    /// Audio container, e.g. `"mp3"` (default) or `"wav"`. Missing or empty
    /// means `"mp3"`.
    #[serde(default)]
    pub format: Option<String>,
    /// Chatterbox emotion knob; the handler clamps it to 0.25–2.0 before
    /// forwarding.
    #[serde(default)]
    pub exaggeration: Option<f32>,
    /// Chatterbox pace knob (~0.3 for fast speakers, 0 to neutralize a
    /// reference accent); the handler clamps it to 0.0–1.0.
    #[serde(default)]
    pub cfg_weight: Option<f32>,
    /// Chatterbox randomness knob; the handler clamps it to 0.05–5.0.
    #[serde(default)]
    pub temperature: Option<f32>,
}

/// JSON body returned by `POST /tts/speech` on success.
#[derive(Debug, Serialize)]
pub struct TtsSpeechResponse {
    /// The synthesized audio, encoded with the standard base64 alphabet.
    pub audio_base64: String,
    /// The container that was requested from the backend (echoes the
    /// effective request format, `"mp3"` when none was given).
    pub format: String,
}

/// POST /tts/speech — synthesize `text` (optionally in a named `voice`) and
/// return base64-encoded audio for `data:` URI playback on the client.
#[post("/tts/speech")]
pub async fn tts_speech_handler(
    http_request: HttpRequest,
    _claims: Claims,
    req: web::Json<TtsSpeechRequest>,
    app_state: web::Data<AppState>,
) -> impl Responder {
    let parent_context = extract_context_from_request(&http_request);
    let mut span = global_tracer().start_with_context("http.tts.speech", &parent_context);

    let text = clean_for_tts(&req.text);
    if text.is_empty() {
        span.set_status(Status::error("text is required"));
        return HttpResponse::BadRequest().json(json!({ "error": "text is required" }));
    }
    let Some(client) = app_state.llamacpp.as_ref() else {
        span.set_status(Status::error("tts backend not configured"));
        return HttpResponse::ServiceUnavailable()
            .json(json!({ "error": "TTS backend not configured (set LLAMA_SWAP_URL)" }));
    };

    let format = req
        .format
        .as_deref()
        .filter(|s| !s.is_empty())
        .unwrap_or("mp3");
    let dv = default_voice();
    let voice = req
        .voice
        .as_deref()
        .filter(|s| !s.is_empty())
        .or(dv.as_deref());

    span.set_attribute(KeyValue::new("tts.model", client.tts_model.clone()));
    span.set_attribute(KeyValue::new("tts.format", format.to_string()));
    span.set_attribute(KeyValue::new("tts.has_voice", voice.is_some()));
    span.set_attribute(KeyValue::new("tts.text_len", text.len() as i64));

    // Clamp generation knobs to Chatterbox's documented ranges before forwarding.
    let exaggeration = req.exaggeration.map(|x| x.clamp(0.25, 2.0));
    let cfg_weight = req.cfg_weight.map(|x| x.clamp(0.0, 1.0));
    let temperature = req.temperature.map(|x| x.clamp(0.05, 5.0));

    // One synthesis at a time (see TTS_PERMIT) — fast-fail when busy.
    let Ok(_permit) = TTS_PERMIT.try_acquire() else {
        span.set_status(Status::error("tts busy"));
        return HttpResponse::TooManyRequests().json(json!({
            "error": "TTS is busy with another request — try again shortly"
        }));
    };

    match client
        .text_to_speech(&text, voice, format, exaggeration, cfg_weight, temperature)
        .await
    {
        Ok(bytes) => {
            span.set_attribute(KeyValue::new("tts.audio_bytes", bytes.len() as i64));
            span.set_status(Status::Ok);
            let audio_base64 = base64::engine::general_purpose::STANDARD.encode(&bytes);
            HttpResponse::Ok().json(TtsSpeechResponse {
                audio_base64,
                format: format.to_string(),
            })
        }
        Err(e) => {
            span.set_status(Status::error("tts synthesis failed"));
            log::error!("TTS synth failed: {:?}", e);
            HttpResponse::BadGateway().json(json!({ "error": format!("TTS failed: {e}") }))
        }
    }
}

/// GET /tts/voices — list the Chatterbox voice library (raw passthrough).
#[get("/tts/voices")]
pub async fn list_voices_handler(
    http_request: HttpRequest,
    _claims: Claims,
    app_state: web::Data<AppState>,
) -> impl Responder {
    let parent_context = extract_context_from_request(&http_request);
    let mut span = global_tracer().start_with_context("http.tts.voices.list", &parent_context);

    let Some(client) = app_state.llamacpp.as_ref() else {
        span.set_status(Status::error("tts backend not configured"));
        return HttpResponse::ServiceUnavailable()
            .json(json!({ "error": "TTS backend not configured" }));
    };
    match client.list_voices().await {
        Ok(v) => {
            span.set_status(Status::Ok);
            HttpResponse::Ok().json(v)
        }
        Err(e) => {
            span.set_status(Status::error("list_voices failed"));
            log::error!("list_voices failed: {:?}", e);
            HttpResponse::BadGateway().json(json!({ "error": format!("{e}") }))
        }
    }
}

/// POST /tts/voices/upload — register a cloned voice from an uploaded audio
/// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`).
#[post("/tts/voices/upload")]
pub async fn create_voice_upload_handler(
    http_request: HttpRequest,
    _claims: Claims,
    mut payload: Multipart,
    app_state: web::Data<AppState>,
) -> impl Responder {
    let parent_context = extract_context_from_request(&http_request);
    let mut span = global_tracer().start_with_context("http.tts.voices.upload", &parent_context);

    let Some(client) = app_state.llamacpp.as_ref() else {
        span.set_status(Status::error("tts backend not configured"));
        return HttpResponse::ServiceUnavailable()
            .json(json!({ "error": "TTS backend not configured" }));
    };

    let mut voice_name: Option<String> = None;
    let mut file_bytes = BytesMut::new();
    let mut filename = "voice.wav".to_string();

    while let Some(Ok(mut part)) = payload.next().await {
        // Capture disposition fields up front so the immutable borrow ends
        // before we mutably stream the part body (mirrors handlers/image.rs).
        let (fname_opt, name_opt) = {
            let cd = part.content_disposition();
            (
                cd.and_then(|c| c.get_filename()).map(|s| s.to_string()),
                cd.and_then(|c| c.get_name()).map(|s| s.to_string()),
            )
        };

        if let Some(fname) = fname_opt {
            filename = fname;
            while let Some(Ok(data)) = part.next().await {
                if file_bytes.len() + data.len() > MAX_VOICE_UPLOAD_BYTES {
                    span.set_status(Status::error("voice clip exceeds limit"));
                    return HttpResponse::PayloadTooLarge()
                        .json(json!({ "error": "voice clip exceeds 25 MB" }));
                }
                file_bytes.put(data);
            }
        } else if name_opt.as_deref() == Some("voice_name") {
            let mut buf = BytesMut::new();
            while let Some(Ok(data)) = part.next().await {
                buf.put(data);
            }
            voice_name = Some(String::from_utf8_lossy(&buf).trim().to_string());
        } else {
            while let Some(Ok(_)) = part.next().await {}
        }
    }

    let Some(name) = voice_name.as_deref().and_then(sanitize_voice_name) else {
        span.set_status(Status::error("voice_name is required"));
        return HttpResponse::BadRequest()
            .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
    };
    if file_bytes.is_empty() {
        span.set_status(Status::error("voice_file is required"));
        return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" }));
    }
    span.set_attribute(KeyValue::new("tts.voice_name", name.clone()));
    span.set_attribute(KeyValue::new("tts.upload_bytes", file_bytes.len() as i64));

    // Normalize to WAV so any device format (e.g. .aac / .opus, which Chatterbox
    // rejects by extension) is accepted.
    let src_ext = Path::new(&filename).extension().and_then(|e| e.to_str());
    let wav = match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext).await {
        Ok(w) => w,
        Err(e) => {
            span.set_status(Status::error("audio decode failed"));
            log::error!("voice upload transcode failed: {:?}", e);
            return HttpResponse::BadRequest()
                .json(json!({ "error": "couldn't decode that audio file" }));
        }
    };

    match client
        .create_voice(&name, wav, "reference.wav", "audio/wav")
        .await
    {
        Ok(v) => {
            span.set_status(Status::Ok);
            HttpResponse::Ok().json(v)
        }
        Err(e) => {
            span.set_status(Status::error("create_voice failed"));
            log::error!("create_voice (upload) failed: {:?}", e);
            HttpResponse::BadGateway().json(json!({ "error": format!("{e}") }))
        }
    }
}

/// JSON body of `POST /tts/voices/from-library`.
#[derive(Debug, Deserialize)]
pub struct CreateVoiceFromLibraryRequest {
    /// Requested voice name; the handler sanitizes it and rejects the request
    /// when nothing usable remains.
    pub voice_name: String,
    /// Library-relative path to an audio or video file.
    pub path: String,
    /// Which library `path` belongs to. When omitted the handler may fall
    /// back to the primary library.
    #[serde(default)]
    pub library: Option<String>,
}

/// POST /tts/voices/from-library — register a cloned voice from a file already
/// in a library. Audio and video alike are ffmpeg-normalized to a mono 24 kHz
/// WAV reference clip (length capped by LLAMA_SWAP_TTS_REF_SECONDS).
#[post("/tts/voices/from-library")]
pub async fn create_voice_from_library_handler(
    http_request: HttpRequest,
    _claims: Claims,
    req: web::Json<CreateVoiceFromLibraryRequest>,
    app_state: web::Data<AppState>,
) -> impl Responder {
    let parent_context = extract_context_from_request(&http_request);
    let mut span =
        global_tracer().start_with_context("http.tts.voices.from_library", &parent_context);

    let Some(client) = app_state.llamacpp.as_ref() else {
        span.set_status(Status::error("tts backend not configured"));
        return HttpResponse::ServiceUnavailable()
            .json(json!({ "error": "TTS backend not configured" }));
    };
    let Some(voice_name) = sanitize_voice_name(&req.voice_name) else {
        span.set_status(Status::error("voice_name is required"));
        return HttpResponse::BadRequest()
            .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
    };

    let library = match libraries::resolve_library_param(&app_state, req.library.as_deref()) {
        Ok(Some(l)) => l,
        Ok(None) => app_state.primary_library(),
        Err(msg) => {
            span.set_status(Status::error("invalid library"));
            return HttpResponse::BadRequest().json(json!({ "error": msg }));
        }
    };

    // is_valid_full_path confines the path to the library root (no traversal).
    let abs = match is_valid_full_path(&library.root_path, &req.path, false) {
        Some(p) if p.exists() => p,
        _ => {
            span.set_status(Status::error("file not found"));
            return HttpResponse::NotFound().json(json!({ "error": "file not found in library" }));
        }
    };

    // Only real audio/video sources are valid voice references — refuse to
    // slurp arbitrary library files into memory / ffmpeg.
    if !is_audio_file(&abs) && !is_video_file(&abs) {
        span.set_status(Status::error("not an audio/video file"));
        return HttpResponse::BadRequest()
            .json(json!({ "error": "file is not an audio or video file" }));
    }
    span.set_attribute(KeyValue::new("tts.voice_name", voice_name.clone()));

    let wav = match prepare_reference_audio(&abs).await {
        Ok(b) => b,
        Err(e) => {
            span.set_status(Status::error("audio decode failed"));
            log::error!("voice reference prep failed for {:?}: {:?}", abs, e);
            return HttpResponse::BadRequest()
                .json(json!({ "error": "couldn't decode that file's audio" }));
        }
    };

    match client
        .create_voice(&voice_name, wav, "reference.wav", "audio/wav")
        .await
    {
        Ok(v) => {
            span.set_status(Status::Ok);
            HttpResponse::Ok().json(v)
        }
        Err(e) => {
            span.set_status(Status::error("create_voice failed"));
            log::error!("create_voice (from-library) failed: {:?}", e);
            HttpResponse::BadGateway().json(json!({ "error": format!("{e}") }))
        }
    }
}

/// Turn a library file (audio or video) into a Chatterbox-ready reference by
/// letting ffmpeg decode/extract its audio to mono 24 kHz WAV. ffmpeg is
/// pointed at the library path itself, so a (possibly large) video is never
/// slurped into memory here.
async fn prepare_reference_audio(abs: &Path) -> anyhow::Result<Vec<u8>> {
    let source = abs.to_string_lossy();
    run_ffmpeg_to_wav(&source).await
}

// Unit tests for the pure text helpers (`sanitize_voice_name`,
// `clean_for_tts`). The HTTP handlers and the ffmpeg path are not exercised
// here.
#[cfg(test)]
mod tests {
    use super::*;

    // Names already inside the safe charset pass through; only surrounding
    // whitespace is removed.
    #[test]
    fn sanitize_voice_name_keeps_safe_chars() {
        assert_eq!(sanitize_voice_name("m").as_deref(), Some("m"));
        assert_eq!(
            sanitize_voice_name("  Cameron ").as_deref(),
            Some("Cameron")
        );
        assert_eq!(
            sanitize_voice_name("voice_01-a").as_deref(),
            Some("voice_01-a")
        );
    }

    #[test]
    fn sanitize_voice_name_strips_unsafe_chars() {
        // Path separators / dots / spaces become '-' and are trimmed at edges.
        assert_eq!(sanitize_voice_name("a b.c").as_deref(), Some("a-b-c"));
        assert_eq!(
            sanitize_voice_name("../etc/passwd").as_deref(),
            Some("etc-passwd")
        );
    }

    // Input that is empty, or consists only of characters that get replaced
    // and trimmed, must be rejected rather than yield an empty name.
    #[test]
    fn sanitize_voice_name_rejects_empty_or_all_unsafe() {
        assert_eq!(sanitize_voice_name(""), None);
        assert_eq!(sanitize_voice_name("   "), None);
        assert_eq!(sanitize_voice_name("../../"), None);
        assert_eq!(sanitize_voice_name("...."), None);
    }

    // Over-long names are cut to the 64-character bound.
    #[test]
    fn sanitize_voice_name_bounds_length() {
        let long = "a".repeat(200);
        assert_eq!(sanitize_voice_name(&long).unwrap().len(), 64);
    }

    // Emphasis, heading, link and list markup is removed; the visible text
    // stays.
    #[test]
    fn clean_for_tts_strips_markdown() {
        assert_eq!(
            clean_for_tts("**Bold** and _italic_ and `code`"),
            "Bold and italic and code"
        );
        assert_eq!(clean_for_tts("# Title\n\nbody"), "Title\nbody");
        assert_eq!(
            clean_for_tts("See [docs](http://x.com) now"),
            "See docs now"
        );
        assert_eq!(clean_for_tts("- one\n- two"), "one\ntwo");
    }

    // Removing an emoji or URL must not leave a doubled space behind.
    #[test]
    fn clean_for_tts_strips_emoji_and_urls() {
        assert_eq!(clean_for_tts("Hello 😀 world 🎉"), "Hello world");
        assert_eq!(
            clean_for_tts("visit https://example.com today"),
            "visit today"
        );
        // ZWJ-glued emoji sequence is fully removed.
        assert_eq!(clean_for_tts("family 👨‍👩‍👧 photo"), "family photo");
    }

    #[test]
    fn clean_for_tts_collapses_blank_lines_to_single_break() {
        // Chatterbox pauses (sometimes ~20s) per blank line, so paragraph
        // breaks must collapse to a single newline.
        assert_eq!(clean_for_tts("para one\n\npara two"), "para one\npara two");
        assert_eq!(clean_for_tts("a\n\n\n\nb"), "a\nb");
        // Whitespace-only "blank" lines collapse too.
        assert_eq!(clean_for_tts("a\n  \t \nb"), "a\nb");
        // A single newline is left alone.
        assert_eq!(clean_for_tts("a\nb"), "a\nb");
    }

    #[test]
    fn clean_for_tts_preserves_bracket_tags() {
        // Non-turbo Chatterbox ignores these; a future Turbo uses them as
        // paralinguistic cues — so we must not strip them.
        assert_eq!(clean_for_tts("hello [laugh] there"), "hello [laugh] there");
    }
}