From 1dec34540d89265def1908d1de48a6fe0f9af053 Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Fri, 12 Jun 2026 16:09:03 -0400
Subject: [PATCH] Add start/duration window selection for voice-clone reference
 clips

Both voice creation endpoints (upload + from-library) now accept optional
start_seconds/duration_seconds, threaded to ffmpeg as -ss/-t, so the
reference window can target clean speech anywhere in a long recording
instead of always the first N seconds. Duration is clamped to the
LLAMA_SWAP_TTS_REF_SECONDS cap and the voice-name tag reflects the
actual window length.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/ai/tts.rs | 181 ++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 147 insertions(+), 34 deletions(-)
diff --git a/src/ai/tts.rs b/src/ai/tts.rs
index a0b9bd2..415dcbf 100644
--- a/src/ai/tts.rs
+++ b/src/ai/tts.rs
@@ -193,6 +193,27 @@ fn append_ref_seconds(name: &str, secs: u32) -> String {
     format!("{base}{suffix}")
 }
 
+/// Turn the optional, caller-supplied reference window into the concrete
+/// `(start, duration)` pair (in seconds) used for ffmpeg. A missing start
+/// means 0; a missing duration means the `tts_ref_seconds` cap. Durations above
+/// the cap are quietly clamped down to it instead of being rejected. Returns
+/// `Err` for a non-finite or negative start, or a non-finite or <= 0 duration.
+fn resolve_ref_window(
+    start_seconds: Option<f64>,
+    duration_seconds: Option<f64>,
+) -> Result<(f64, f64), String> {
+    let limit = f64::from(tts_ref_seconds());
+    let offset = match start_seconds.unwrap_or(0.0) {
+        s if s.is_finite() && s >= 0.0 => s,
+        _ => return Err(String::from("start_seconds must be a non-negative number")),
+    };
+    let length = match duration_seconds.unwrap_or(limit) {
+        d if d.is_finite() && d > 0.0 => d,
+        _ => return Err(String::from("duration_seconds must be a positive number")),
+    };
+    Ok((offset, length.min(limit)))
+}
+
 /// Optional default voice for synthesis when the request doesn't name one.
 /// Set `LLAMA_SWAP_TTS_VOICE=m` to read insights in a cloned voice by default.
 fn default_voice() -> Option<String> {
@@ -265,24 +286,31 @@ fn prepare_for_tts(input: &str) -> String {
 /// Decode an audio/video file to mono 24 kHz WAV via ffmpeg, returning the WAV
 /// bytes. Chatterbox validates the reference clip by file *extension* and
 /// rejects several formats (e.g. `.aac`, `.opus`), so we always normalize to
-/// WAV regardless of the source container. Capped at 30s — references only need
-/// a few seconds of clean speech.
-async fn run_ffmpeg_to_wav(input_path: &str) -> anyhow::Result<Vec<u8>> {
+/// WAV regardless of the source container. Extracts `duration` seconds starting
+/// at `start` (see resolve_ref_window) — references only need a few seconds of
+/// clean speech, which may sit anywhere in a long recording.
+async fn run_ffmpeg_to_wav(input_path: &str, start: f64, duration: f64) -> anyhow::Result<Vec<u8>> {
     let out = tempfile::Builder::new()
         .suffix(".wav")
         .tempfile()
         .context("creating temp wav")?;
     let out_s = out.path().to_string_lossy().to_string();
 
-    // Cap the reference clip length — we use the first N seconds (see
-    // tts_ref_seconds).
-    let secs = tts_ref_seconds().to_string();
+    let start_s = format!("{start}");
+    let secs = format!("{duration}");
+
+    // -ss before -i is input seeking: fast, and frame accuracy doesn't matter
+    // for picking a speech window.
+    let mut args: Vec<&str> = vec!["-y"];
+    if start > 0.0 {
+        args.extend(["-ss", &start_s]);
+    }
+    args.extend([
+        "-i", input_path, "-vn", "-ac", "1", "-ar", "24000", "-t", &secs, "-f", "wav", &out_s,
+    ]);
 
     let output = tokio::process::Command::new("ffmpeg")
-        .args([
-            "-y", "-i", input_path, "-vn", "-ac", "1", "-ar", "24000", "-t", &secs, "-f", "wav",
-            &out_s,
-        ])
+        .args(&args)
         .output()
         .await
         .context("spawning ffmpeg")?;
@@ -295,7 +323,12 @@ async fn run_ffmpeg_to_wav(input_path: &str) -> anyhow::Result<Vec<u8>> {
 
 /// Normalize in-memory upload bytes to WAV: write to a temp file (keeping the
 /// source extension as an ffmpeg probe hint) then transcode.
-async fn transcode_bytes_to_wav(input: &[u8], src_ext: Option<&str>) -> anyhow::Result<Vec<u8>> {
+async fn transcode_bytes_to_wav(
+    input: &[u8],
+    src_ext: Option<&str>,
+    start: f64,
+    duration: f64,
+) -> anyhow::Result<Vec<u8>> {
     let suffix = src_ext
         .filter(|e| !e.is_empty())
         .map(|e| format!(".{e}"))
@@ -305,7 +338,7 @@ async fn transcode_bytes_to_wav(input: &[u8], src_ext: Option<&str>) -> anyhow::
         .tempfile()
         .context("creating temp input")?;
     std::fs::write(in_tmp.path(), input).context("writing temp input")?;
-    run_ffmpeg_to_wav(&in_tmp.path().to_string_lossy()).await
+    run_ffmpeg_to_wav(&in_tmp.path().to_string_lossy(), start, duration).await
 }
 
 #[derive(Debug, Deserialize)]
@@ -751,7 +784,9 @@ pub async fn delete_voice_handler(
 }
 
 /// POST /tts/voices/upload — register a cloned voice from an uploaded audio
-/// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`).
+/// clip. Multipart fields: `voice_name` (text) + a file part (`voice_file`),
+/// plus optional `start_seconds` / `duration_seconds` (text) selecting which
+/// window of a longer recording becomes the reference clip.
 #[post("/tts/voices/upload")]
 pub async fn create_voice_upload_handler(
     http_request: HttpRequest,
@@ -769,6 +804,8 @@ pub async fn create_voice_upload_handler(
     };
 
     let mut voice_name: Option<String> = None;
+    let mut start_field: Option<String> = None;
+    let mut duration_field: Option<String> = None;
     let mut file_bytes = BytesMut::new();
     let mut filename = "voice.wav".to_string();
 
@@ -793,25 +830,57 @@ pub async fn create_voice_upload_handler(
                 }
                 file_bytes.put(data);
             }
-        } else if name_opt.as_deref() == Some("voice_name") {
+        } else if matches!(
+            name_opt.as_deref(),
+            Some("voice_name" | "start_seconds" | "duration_seconds")
+        ) {
+            let field = name_opt.as_deref().unwrap().to_string();
             let mut buf = BytesMut::new();
             while let Some(Ok(data)) = part.next().await {
                 buf.put(data);
             }
-            voice_name = Some(String::from_utf8_lossy(&buf).trim().to_string());
+            let text = String::from_utf8_lossy(&buf).trim().to_string();
+            match field.as_str() {
+                "voice_name" => voice_name = Some(text),
+                "start_seconds" => start_field = Some(text),
+                _ => duration_field = Some(text),
+            }
         } else {
             while let Some(Ok(_)) = part.next().await {}
         }
     }
 
+    // Empty text parts are treated as absent; anything else must parse, so a
+    // client bug ("abc") fails loudly instead of silently cloning from 0s.
+    let parse_secs = |field: Option<&String>, name: &str| -> Result<Option<f64>, String> {
+        match field.map(|s| s.as_str()).filter(|s| !s.is_empty()) {
+            None => Ok(None),
+            Some(s) => s
+                .parse::<f64>()
+                .map(Some)
+                .map_err(|_| format!("{name} must be a number of seconds")),
+        }
+    };
+    let window = parse_secs(start_field.as_ref(), "start_seconds").and_then(|start| {
+        parse_secs(duration_field.as_ref(), "duration_seconds")
+            .and_then(|dur| resolve_ref_window(start, dur))
+    });
+    let (ref_start, ref_duration) = match window {
+        Ok(w) => w,
+        Err(msg) => {
+            span.set_status(Status::error("invalid reference window"));
+            return HttpResponse::BadRequest().json(json!({ "error": msg }));
+        }
+    };
+
     let Some(name) = voice_name.as_deref().and_then(sanitize_voice_name) else {
         span.set_status(Status::error("voice_name is required"));
         return HttpResponse::BadRequest()
             .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
     };
-    // Tag the name with the ref-clip cap (e.g. `grandma-30s`) so the library
-    // shows which reference length produced each clone.
-    let name = append_ref_seconds(&name, tts_ref_seconds());
+    // Tag the name with the ref-clip length (e.g. `grandma-30s`) so the
+    // library shows which reference length produced each clone.
+    let name = append_ref_seconds(&name, ref_duration.round().max(1.0) as u32);
     if file_bytes.is_empty() {
         span.set_status(Status::error("voice_file is required"));
         return HttpResponse::BadRequest().json(json!({ "error": "voice_file is required" }));
@@ -822,15 +891,16 @@ pub async fn create_voice_upload_handler(
     // Normalize to WAV so any device format (e.g. .aac / .opus, which Chatterbox
     // rejects by extension) is accepted.
     let src_ext = Path::new(&filename).extension().and_then(|e| e.to_str());
-    let wav = match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext).await {
-        Ok(w) => w,
-        Err(e) => {
-            span.set_status(Status::error("audio decode failed"));
-            log::error!("voice upload transcode failed: {:?}", e);
-            return HttpResponse::BadRequest()
-                .json(json!({ "error": "couldn't decode that audio file" }));
-        }
-    };
+    let wav =
+        match transcode_bytes_to_wav(file_bytes.as_ref(), src_ext, ref_start, ref_duration).await {
+            Ok(w) => w,
+            Err(e) => {
+                span.set_status(Status::error("audio decode failed"));
+                log::error!("voice upload transcode failed: {:?}", e);
+                return HttpResponse::BadRequest()
+                    .json(json!({ "error": "couldn't decode that audio file" }));
+            }
+        };
 
     match client
         .create_voice(&name, wav, "reference.wav", "audio/wav")
@@ -856,11 +926,19 @@ pub struct CreateVoiceFromLibraryRequest {
     pub path: String,
     #[serde(default)]
     pub library: Option<String>,
+    /// Offset into the source where the reference window begins (default 0) —
+    /// lets the client pick the clean-speech section of a long recording.
+    #[serde(default)]
+    pub start_seconds: Option<f64>,
+    /// Reference window length; clamped to LLAMA_SWAP_TTS_REF_SECONDS.
+    #[serde(default)]
+    pub duration_seconds: Option<f64>,
 }
 
 /// POST /tts/voices/from-library — register a cloned voice from a file already
 /// in a library. Audio and video alike are ffmpeg-normalized to a mono 24 kHz
-/// WAV reference clip (length capped by LLAMA_SWAP_TTS_REF_SECONDS).
+/// WAV reference clip (window selected by start/duration_seconds, length
+/// capped by LLAMA_SWAP_TTS_REF_SECONDS).
 #[post("/tts/voices/from-library")]
 pub async fn create_voice_from_library_handler(
     http_request: HttpRequest,
@@ -882,9 +960,17 @@ pub async fn create_voice_from_library_handler(
         return HttpResponse::BadRequest()
             .json(json!({ "error": "voice_name is required (alphanumerics, - and _ only)" }));
     };
-    // Tag the name with the ref-clip cap (e.g. `grandma-30s`) so the library
-    // shows which reference length produced each clone.
-    let voice_name = append_ref_seconds(&voice_name, tts_ref_seconds());
+    let (ref_start, ref_duration) =
+        match resolve_ref_window(req.start_seconds, req.duration_seconds) {
+            Ok(w) => w,
+            Err(msg) => {
+                span.set_status(Status::error("invalid reference window"));
+                return HttpResponse::BadRequest().json(json!({ "error": msg }));
+            }
+        };
+    // Tag the name with the ref-clip length (e.g. `grandma-30s`) so the
+    // library shows which reference length produced each clone.
+    let voice_name = append_ref_seconds(&voice_name, ref_duration.round().max(1.0) as u32);
 
     let library = match libraries::resolve_library_param(&app_state, req.library.as_deref()) {
         Ok(Some(l)) => l,
@@ -913,7 +999,7 @@ pub async fn create_voice_from_library_handler(
     }
     span.set_attribute(KeyValue::new("tts.voice_name", voice_name.clone()));
 
-    let wav = match prepare_reference_audio(&abs).await {
+    let wav = match prepare_reference_audio(&abs, ref_start, ref_duration).await {
         Ok(b) => b,
         Err(e) => {
             span.set_status(Status::error("audio decode failed"));
@@ -943,8 +1029,8 @@ pub async fn create_voice_from_library_handler(
 /// Read a library file (audio or video) as a Chatterbox-ready reference: ffmpeg
 /// decodes/extracts its audio to mono 24 kHz WAV. Reading straight from the
 /// library path avoids slurping a (possibly large) video into memory.
-async fn prepare_reference_audio(abs: &Path) -> anyhow::Result<Vec<u8>> {
-    run_ffmpeg_to_wav(&abs.to_string_lossy()).await
+async fn prepare_reference_audio(abs: &Path, start: f64, duration: f64) -> anyhow::Result<Vec<u8>> {
+    run_ffmpeg_to_wav(&abs.to_string_lossy(), start, duration).await
 }
 
 #[cfg(test)]
@@ -1009,6 +1095,33 @@ mod tests {
         assert!(tagged.ends_with("-30s"));
     }
 
+    #[test]
+    fn resolve_ref_window_defaults_to_start_of_clip_at_cap_length() {
+        // Expected duration comes from the live cap; the test never sets
+        // LLAMA_SWAP_TTS_REF_SECONDS because env mutation flakes under the
+        // parallel suite (see env_dispatch).
+        let expected = (0.0, f64::from(tts_ref_seconds()));
+        assert_eq!(resolve_ref_window(None, None), Ok(expected));
+
+    #[test]
+    fn resolve_ref_window_accepts_offset_and_clamps_duration() {
+        let cap = f64::from(tts_ref_seconds()); // live cap; may be configured below 12s
+        assert_eq!(resolve_ref_window(Some(92.5), None), Ok((92.5, cap)));
+        assert_eq!(resolve_ref_window(Some(10.0), Some(12.0)), Ok((10.0, 12.0_f64.min(cap))));
+        // Longer-than-cap windows are bounded, not rejected.
+        assert_eq!(resolve_ref_window(None, Some(cap + 100.0)), Ok((0.0, cap)));
+    }
+
+    #[test]
+    fn resolve_ref_window_rejects_garbage() {
+        for bad_start in [-1.0, f64::NAN, f64::INFINITY] {
+            assert!(resolve_ref_window(Some(bad_start), None).is_err());
+        }
+        for bad_duration in [0.0, -5.0, f64::NAN] {
+            assert!(resolve_ref_window(None, Some(bad_duration)).is_err());
+        }
+    }
+
     #[test]
     fn sweep_drops_expired_results_and_keeps_live_jobs() {
         let now = Instant::now();