Add start/duration window selection for voice-clone reference clips

Both voice-creation endpoints (upload and from-library) now accept optional start_seconds/duration_seconds, which are threaded through to ffmpeg as -ss/-t, so the reference window can target clean speech anywhere in a long recording instead of always the first N seconds. Duration is clamped to the LLAMA_SWAP_TTS_REF_SECONDS cap, and the voice-name tag reflects the actual window length.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-12 16:09:03 -04:00
parent 2e0f78aa1b
commit 1dec34540d
1 changed files with 147 additions and 34 deletions
@@ -193,6 +193,27 @@ fn append_ref_seconds(name: &str, secs: u32) -> String {
    format!("{base}{suffix}")
 }
 /// Turn an optional, caller-supplied reference window into the concrete
 /// `(start, duration)` pair, in seconds, that is passed on to ffmpeg.
 ///
 /// A missing start means 0. A missing duration means the `tts_ref_seconds`
 /// cap, and a duration above that cap is quietly bounded to it rather than
 /// rejected.
 ///
 /// # Errors
 /// Returns a message naming the offending field when the start is negative
 /// or non-finite, or when the duration is non-positive or non-finite.
 fn resolve_ref_window(
    start_seconds: Option<f64>,
    duration_seconds: Option<f64>,
 ) -> Result<(f64, f64), String> {
    let max_len = f64::from(tts_ref_seconds());

    // Validate the offset first so its error wins when both fields are bad.
    let offset = match start_seconds.unwrap_or(0.0) {
        s if s.is_finite() && s >= 0.0 => s,
        _ => return Err("start_seconds must be a non-negative number".to_string()),
    };

    // The default is checked like any explicit value, then bounded by the cap.
    let length = match duration_seconds.unwrap_or(max_len) {
        d if d.is_finite() && d > 0.0 => d.min(max_len),
        _ => return Err("duration_seconds must be a positive number".to_string()),
    };

    Ok((offset, length))
 }
 /// Optional default voice for synthesis when the request doesn't name one.
 /// Set `LLAMA_SWAP_TTS_VOICE=m` to read insights in a cloned voice by default.
 fn default_voice() -> Option<String> {
@@ -265,24 +286,31 @@ fn prepare_for_tts(input: &str) -> String {
 /// Decode an audio/video file to mono 24 kHz WAV via ffmpeg, returning the WAV
 /// bytes. Chatterbox validates the reference clip by file *extension* and
 /// rejects several formats (e.g. `.aac`, `.opus`), so we always normalize to
-/// WAV regardless of the source container. Capped at 30s — references only need
+/// WAV regardless of the source container. Extracts `duration` seconds starting
-/// a few seconds of clean speech.
+/// at `start` (see resolve_ref_window) — references only need a few seconds of
-async fn run_ffmpeg_to_wav(input_path: &str) -> anyhow::Result<Vec<u8>> {
+/// clean speech, which may sit anywhere in a long recording.
 async fn run_ffmpeg_to_wav(input_path: &str, start: f64, duration: f64) -> anyhow::Result<Vec<u8>> {
    let out = tempfile::Builder::new()
        .suffix(".wav")
        .tempfile()
        .context("creating temp wav")?;
    let out_s = out.path().to_string_lossy().to_string();
-    // Cap the reference clip length — we use the first N seconds (see
+    let start_s = format!("{start}");
-    // tts_ref_seconds).
+    let secs = format!("{duration}");
-    let secs = tts_ref_seconds().to_string();
+
    // -ss before -i is input seeking: fast, and frame accuracy doesn't matter
    // for picking a speech window.
    let mut args: Vec<&str> = vec!["-y"];
    if start > 0.0 {
        args.extend(["-ss", &start_s]);
    }
    args.extend([
        "-i", input_path, "-vn", "-ac", "1", "-ar", "24000", "-t", &secs, "-f", "wav", &out_s,
    ]);
    let output = tokio::process::Command::new("ffmpeg")
-        .args([
+        .args(&args)
            "-y", "-i", input_path, "-vn", "-ac", "1", "-ar", "24000", "-t", &secs, "-f", "wav",
            &out_s,
        ])
        .output()
        .await
        .context("spawning ffmpeg")?;
@@ -295,7 +323,12 @@ async fn run_ffmpeg_to_wav(input_path: &str) -> anyhow::Result<Vec<u8>> {
 /// Normalize in-memory upload bytes to WAV: write to a temp file (keeping the
 /// source extension as an ffmpeg probe hint) then transcode.
-async fn transcode_bytes_to_wav(input: &[u8], src_ext: Option<&str>) -> anyhow::Result<Vec<u8>> {
+async fn transcode_bytes_to_wav(
    input: &[u8],
    src_ext: Option<&str>,
    start: f64,
    duration: f64,
 ) -> anyhow::Result<Vec<u8>> {
    let suffix = src_ext
        .filter(|e| !e.is_empty())
        .map(|e| format!(".{e}"))
@@ -305,7 +338,7 @@ async fn transcode_bytes_to_wav(input: &[u8], src_ext: Option<&str>) -> anyhow::
        .tempfile()
        .context("creating temp input")?;
    std::fs::write(in_tmp.path(), input).context("writing temp input")?;
-    run_ffmpeg_to_wav(&in_tmp.path().to_string_lossy()).await
+    run_ffmpeg_to_wav(&in_tmp.path().to_string_lossy(), start, duration).await
 }
 #[derive(Debug, Deserialize)]
@@ -751,7 +784,9 @@ pub async fn delete_voice_handler(
 }
 /// POST /tts/voices/upload — register a cloned voice from an uploaded audio
-/// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`).
+/// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`),
 /// plus optional `start_seconds` / `duration_seconds` (text) selecting which
 /// window of a longer recording becomes the reference clip.
 #[post("/tts/voices/upload")]
 pub async fn create_voice_upload_handler(
    http_request: HttpRequest,
@@ -769,6 +804,8 @@ pub async fn create_voice_upload_handler(
    };
    let mut voice_name: Option<String> = None;
    let mut start_field: Option<String> = None;
    let mut duration_field: Option<String> = None;
    let mut file_bytes = BytesMut::new();
    let mut filename = "voice.wav".to_string();
@@ -793,25 +830,57 @@ pub async fn create_voice_upload_handler(
                }
                file_bytes.put(data);
            }
-        } else if name_opt.as_deref() == Some("voice_name") {
+        } else if matches!(
            name_opt.as_deref(),
            Some("voice_name" | "start_seconds" | "duration_seconds")
        ) {
            let field = name_opt.as_deref().unwrap().to_string();
            let mut buf = BytesMut::new();
            while let Some(Ok(data)) = part.next().await {
                buf.put(data);
            }
-            voice_name = Some(String::from_utf8_lossy(&buf).trim().to_string());
+            let text = String::from_utf8_lossy(&buf).trim().to_string();
            match field.as_str() {
                "voice_name" => voice_name = Some(text),
                "start_seconds" => start_field = Some(text),
                _ => duration_field = Some(text),
            }
        } else {
            while let Some(Ok(_)) = part.next().await {}
        }
    }
    // Empty text parts are treated as absent; anything else must parse, so a
    // client bug ("abc") fails loudly instead of silently cloning from 0s.
    let parse_secs = |field: Option<&String>, name: &str| -> Result<Option<f64>, String> {
        match field.map(|s| s.as_str()).filter(|s| !s.is_empty()) {
            None => Ok(None),
            Some(s) => s
                .parse::<f64>()
                .map(Some)
                .map_err(|_| format!("{name} must be a number of seconds")),
        }
    };
    let window = parse_secs(start_field.as_ref(), "start_seconds").and_then(|start| {
        parse_secs(duration_field.as_ref(), "duration_seconds")
            .and_then(|dur| resolve_ref_window(start, dur))
    });
    let (ref_start, ref_duration) = match window {
        Ok(w) => w,
        Err(msg) => {
            span.set_status(Status::error("invalid reference window"));
            return HttpResponse::BadRequest().json(json!({ "error": msg }));
        }
    };
    let Some(name) = voice_name.as_deref().and_then(sanitize_voice_name) else {
        span.set_status(Status::error("voice_name is required"));
        return HttpResponse::BadRequest()
            .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
    };
-    // Tag the name with the ref-clip cap (e.g. `grandma-30s`) so the library
+    // Tag the name with the ref-clip length (e.g. `grandma-30s`) so the
-    // shows which reference length produced each clone.
+    // library shows which reference length produced each clone.
-    let name = append_ref_seconds(&name, tts_ref_seconds());
+    let name = append_ref_seconds(&name, ref_duration.round().max(1.0) as u32);
    if file_bytes.is_empty() {
        span.set_status(Status::error("voice_file is required"));
        return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" }));
@@ -822,15 +891,16 @@ pub async fn create_voice_upload_handler(
    // Normalize to WAV so any device format (e.g. .aac / .opus, which Chatterbox
    // rejects by extension) is accepted.
    let src_ext = Path::new(&filename).extension().and_then(|e| e.to_str());
-    let wav = match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext).await {
+    let wav =
-        Ok(w) => w,
+        match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext, ref_start, ref_duration).await {
-        Err(e) => {
+            Ok(w) => w,
-            span.set_status(Status::error("audio decode failed"));
+            Err(e) => {
-            log::error!("voice upload transcode failed: {:?}", e);
+                span.set_status(Status::error("audio decode failed"));
-            return HttpResponse::BadRequest()
+                log::error!("voice upload transcode failed: {:?}", e);
-                .json(json!({ "error": "couldn't decode that audio file" }));
+                return HttpResponse::BadRequest()
-        }
+                    .json(json!({ "error": "couldn't decode that audio file" }));
-    };
+            }
        };
    match client
        .create_voice(&name, wav, "reference.wav", "audio/wav")
@@ -856,11 +926,19 @@ pub struct CreateVoiceFromLibraryRequest {
    pub path: String,
    #[serde(default)]
    pub library: Option<String>,
    /// Offset into the source where the reference window begins (default 0) —
    /// lets the client pick the clean-speech section of a long recording.
    #[serde(default)]
    pub start_seconds: Option<f64>,
    /// Reference window length; clamped to LLAMA_SWAP_TTS_REF_SECONDS.
    #[serde(default)]
    pub duration_seconds: Option<f64>,
 }
 /// POST /tts/voices/from-library — register a cloned voice from a file already
 /// in a library. Audio and video alike are ffmpeg-normalized to a mono 24 kHz
-/// WAV reference clip (length capped by LLAMA_SWAP_TTS_REF_SECONDS).
+/// WAV reference clip (window selected by start/duration_seconds, length
 /// capped by LLAMA_SWAP_TTS_REF_SECONDS).
 #[post("/tts/voices/from-library")]
 pub async fn create_voice_from_library_handler(
    http_request: HttpRequest,
@@ -882,9 +960,17 @@ pub async fn create_voice_from_library_handler(
        return HttpResponse::BadRequest()
            .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
    };
-    // Tag the name with the ref-clip cap (e.g. `grandma-30s`) so the library
+    let (ref_start, ref_duration) =
-    // shows which reference length produced each clone.
+        match resolve_ref_window(req.start_seconds, req.duration_seconds) {
-    let voice_name = append_ref_seconds(&voice_name, tts_ref_seconds());
+            Ok(w) => w,
            Err(msg) => {
                span.set_status(Status::error("invalid reference window"));
                return HttpResponse::BadRequest().json(json!({ "error": msg }));
            }
        };
    // Tag the name with the ref-clip length (e.g. `grandma-30s`) so the
    // library shows which reference length produced each clone.
    let voice_name = append_ref_seconds(&voice_name, ref_duration.round().max(1.0) as u32);
    let library = match libraries::resolve_library_param(&app_state, req.library.as_deref()) {
        Ok(Some(l)) => l,
@@ -913,7 +999,7 @@ pub async fn create_voice_from_library_handler(
    }
    span.set_attribute(KeyValue::new("tts.voice_name", voice_name.clone()));
-    let wav = match prepare_reference_audio(&abs).await {
+    let wav = match prepare_reference_audio(&abs, ref_start, ref_duration).await {
        Ok(b) => b,
        Err(e) => {
            span.set_status(Status::error("audio decode failed"));
@@ -943,8 +1029,8 @@ pub async fn create_voice_from_library_handler(
 /// Read a library file (audio or video) as a Chatterbox-ready reference: ffmpeg
 /// decodes/extracts its audio to mono 24 kHz WAV. Reading straight from the
 /// library path avoids slurping a (possibly large) video into memory.
-async fn prepare_reference_audio(abs: &Path) -> anyhow::Result<Vec<u8>> {
+async fn prepare_reference_audio(abs: &Path, start: f64, duration: f64) -> anyhow::Result<Vec<u8>> {
-    run_ffmpeg_to_wav(&abs.to_string_lossy()).await
+    run_ffmpeg_to_wav(&abs.to_string_lossy(), start, duration).await
 }
 #[cfg(test)]
@@ -1009,6 +1095,33 @@ mod tests {
        assert!(tagged.ends_with("-30s"));
    }
    #[test]
    fn resolve_ref_window_defaults_to_start_of_clip_at_cap_length() {
        // The expectation is derived from the live cap instead of setting
        // LLAMA_SWAP_TTS_REF_SECONDS: env mutation flakes under the parallel
        // suite (see env_dispatch).
        let expected: Result<(f64, f64), String> = Ok((0.0, f64::from(tts_ref_seconds())));
        // With neither field supplied the window is the first cap-length slice.
        let resolved = resolve_ref_window(None, None);
        assert_eq!(resolved, expected);
    }
    #[test]
    fn resolve_ref_window_accepts_offset_and_clamps_duration() {
        let cap = f64::from(tts_ref_seconds());
        // An offset on its own keeps the default, cap-length window.
        assert_eq!(resolve_ref_window(Some(92.5), None), Ok((92.5, cap)));
        // An in-range duration passes through untouched. The request is derived
        // from the live cap (assumed positive) so the assertion still holds
        // when LLAMA_SWAP_TTS_REF_SECONDS is configured below a fixed literal.
        let within = cap / 2.0;
        assert_eq!(
            resolve_ref_window(Some(10.0), Some(within)),
            Ok((10.0, within))
        );
        // Longer-than-cap windows are bounded, not rejected.
        assert_eq!(resolve_ref_window(None, Some(cap + 100.0)), Ok((0.0, cap)));
    }
    #[test]
    fn resolve_ref_window_rejects_garbage() {
        // Start must be finite and non-negative.
        for bad_start in [-1.0, f64::NAN, f64::INFINITY, f64::NEG_INFINITY] {
            assert!(
                resolve_ref_window(Some(bad_start), None).is_err(),
                "start_seconds={bad_start} should be rejected"
            );
        }
        // Duration must be finite and strictly positive. The infinite cases
        // matter because an unchecked +inf would otherwise clamp to the cap.
        for bad_duration in [0.0, -5.0, f64::NAN, f64::INFINITY, f64::NEG_INFINITY] {
            assert!(
                resolve_ref_window(None, Some(bad_duration)).is_err(),
                "duration_seconds={bad_duration} should be rejected"
            );
        }
    }
    #[test]
    fn sweep_drops_expired_results_and_keeps_live_jobs() {
        let now = Instant::now();