From 0accc4ef2fcb3f94d5a8c81ab6a15b48c29292d4 Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Thu, 11 Jun 2026 18:20:06 -0400
Subject: [PATCH] Add GPU lease coordinating LLM and TTS requests through
 llama-swap

llama-swap runs chat/vision/Chatterbox as a mutually-exclusive set on
one GPU and HOLDS a request for a non-resident model until the resident
model drains, then swaps. That hold burned the holder's reqwest timeout
(measured: a queued TTS lost 77s behind one LLM turn; an LLM request
behind a synthesis waited the entire remaining synth), so concurrent
insight + read-aloud timed out instead of queueing.

ai::gpu adds a fair RwLock lease acquired before each request is sent,
so cross-model waits happen before the HTTP timeout starts: chat/vision
share the read lease, TTS synthesis and voice-library ops (which spin
Chatterbox up) take the write lease, and embeddings take none (the
embed slot is in llama-swap's always-resident group). Speech jobs now
flip queued->running only after acquiring the GPU, letting the client
anchor its poll deadline to that transition.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/ai/gpu.rs      | 88 ++++++++++++++++++++++++++++++++++++++++++++++
 src/ai/llamacpp.rs | 25 +++++++++++++
 src/ai/mod.rs      |  1 +
 src/ai/tts.rs      | 12 ++++++-
 4 files changed, 125 insertions(+), 1 deletion(-)
 create mode 100644 src/ai/gpu.rs

diff --git a/src/ai/gpu.rs b/src/ai/gpu.rs
new file mode 100644
index 0000000..728a144
--- /dev/null
+++ b/src/ai/gpu.rs
@@ -0,0 +1,88 @@
+// GPU lease — in-process coordination for llama-swap model contention.
+//
+// llama-swap runs the heavyweight models (chat / vision / Chatterbox TTS) as
+// a mutually-exclusive set on one GPU (matrix DSL `(q27 | … | tts) & e`): a
+// request for a non-resident model is HELD by llama-swap until the resident
+// model's in-flight requests drain, then the models swap. That hold counts
+// against the *holder's* reqwest timeout — measured live: a queued TTS burned
+// 77s of its budget behind a single LLM turn, and an LLM request behind a
+// running synthesis waited the entire remaining synth. Uncoordinated
+// cross-model traffic therefore times out instead of queueing.
+//
+// The lease moves that wait into this process, BEFORE the HTTP request is
+// sent and before its timeout starts:
+// - chat/vision requests (the LLM-side slots) share the READ lease;
+// - TTS synthesis and voice-library ops (anything that spins Chatterbox up
+//   and evicts the LLM) take the WRITE lease;
+// - embeddings take NO lease: the `embed` slot is in llama-swap's
+//   always-resident group (the `& e` term) and never participates in a swap,
+//   so leasing it would only stall searches behind a queued synthesis.
+//
+// tokio's RwLock is fair (FIFO, write-preferring): a queued TTS gets the GPU
+// right after the current LLM request drains, and later LLM requests queue
+// behind it — bounded waits in both directions, no starvation, no timeout
+// budget burned while waiting.
+//
+// RULES: hold a lease for exactly one HTTP request (for streaming, the
+// stream's lifetime) and NEVER acquire one while already holding one — once a
+// writer is queued, new read acquisitions block, so nested acquisition can
+// deadlock.
+
+use std::sync::LazyLock;
+use std::time::Instant;
+use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard};
+
+static GPU_LEASE: LazyLock<RwLock<()>> = LazyLock::new(|| RwLock::new(()));
+
+/// Waits longer than this are logged — they mean a cross-model swap was
+/// avoided and quantify what the request *would* have burned of its timeout.
+const SLOW_WAIT_LOG_SECS: f64 = 2.0;
+
+/// Shared lease for LLM-side requests (chat / vision slots).
+pub async fn llm_lease() -> RwLockReadGuard<'static, ()> {
+    let started = Instant::now();
+    let guard = GPU_LEASE.read().await;
+    log_slow_wait("llm", started);
+    guard
+}
+
+/// Exclusive lease for TTS-side requests (speech synthesis + voice-library
+/// ops that spin up Chatterbox).
+pub async fn tts_lease() -> RwLockWriteGuard<'static, ()> {
+    let started = Instant::now();
+    let guard = GPU_LEASE.write().await;
+    log_slow_wait("tts", started);
+    guard
+}
+
+fn log_slow_wait(kind: &str, started: Instant) {
+    let waited = started.elapsed().as_secs_f64();
+    if waited > SLOW_WAIT_LOG_SECS {
+        log::info!("GPU lease ({kind}): waited {waited:.1}s for the other model class to drain");
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    // One sequential test, not several: the lease is a single global, so
+    // parallel tests interleaving reads and writes on it can hit the very
+    // nested-acquisition deadlock the module comment warns about.
+    #[tokio::test]
+    async fn write_lease_excludes_readers_then_reads_share() {
+        let w = tts_lease().await;
+        // A reader must not acquire while the writer is held.
+        let pending = tokio::spawn(async { drop(llm_lease().await) });
+        tokio::task::yield_now().await;
+        assert!(!pending.is_finished());
+        drop(w);
+        pending.await.expect("reader acquires after writer drops");
+
+        // With no writer queued, read leases are shared.
+        let a = llm_lease().await;
+        let b = llm_lease().await;
+        drop(a);
+        drop(b);
+    }
+}
diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs
index 820f5f8..8a7c898 100644
--- a/src/ai/llamacpp.rs
+++ b/src/ai/llamacpp.rs
@@ -142,6 +142,11 @@ impl LlamaCppClient {
     /// Chatterbox generation knobs are forwarded when set (caller is expected
     /// to have range-clamped them): `exaggeration` (0.25–2.0, emotion),
     /// `cfg_weight` (0.0–1.0, pace), `temperature` (0.05–5.0, randomness).
+    ///
+    /// Callers must hold the GPU write lease (`ai::gpu::tts_lease`) across
+    /// this call. It is taken at the call sites in `ai::tts` rather than here
+    /// so the speech-job path can flip its job to `running` between acquiring
+    /// the GPU and sending the request.
     pub async fn text_to_speech(
         &self,
         input: &str,
@@ -204,6 +209,9 @@ impl LlamaCppClient {
     /// List voices in the Chatterbox voice library (raw JSON passthrough).
     pub async fn list_voices(&self) -> Result<Value> {
         let url = format!("{}/upstream/{}/voices", self.swap_root(), self.tts_model);
+        // The /upstream passthrough spins Chatterbox up (evicting the LLM),
+        // so it takes the exclusive GPU lease like synthesis does.
+        let _gpu = crate::ai::gpu::tts_lease().await;
         let resp = self
             .client
             .get(&url)
@@ -237,6 +245,9 @@ impl LlamaCppClient {
             .text("voice_name", voice_name.to_string())
             .part("voice_file", part);
 
+        // The /upstream passthrough spins Chatterbox up (evicting the LLM),
+        // so it takes the exclusive GPU lease like synthesis does.
+        let _gpu = crate::ai::gpu::tts_lease().await;
         let resp = self
             .client
             .post(&url)
@@ -262,6 +273,9 @@ impl LlamaCppClient {
             self.tts_model,
             voice_name
         );
+        // The /upstream passthrough spins Chatterbox up (evicting the LLM),
+        // so it takes the exclusive GPU lease like synthesis does.
+        let _gpu = crate::ai::gpu::tts_lease().await;
         let resp = self
             .client
             .delete(&url)
@@ -481,6 +495,9 @@ impl LlamaCppClient {
             body.insert(k.into(), v);
         }
 
+        // Wait for any TTS synthesis to release the GPU before the request
+        // timeout starts (see ai::gpu).
+        let _gpu = crate::ai::gpu::llm_lease().await;
         let resp = self
             .client
             .post(&url)
@@ -599,6 +616,10 @@ impl LlmClient for LlamaCppClient {
             body.insert(k.into(), v);
         }
 
+        // Wait for any TTS synthesis to release the GPU before the request
+        // timeout starts (see ai::gpu). The guard is moved into the stream
+        // below so the lease spans the whole generation, not just the send.
+        let gpu = crate::ai::gpu::llm_lease().await;
         let resp = self
             .client
             .post(&url)
@@ -615,6 +636,7 @@ impl LlmClient for LlamaCppClient {
 
         let byte_stream = resp.bytes_stream();
         let stream = async_stream::stream! {
+            let _gpu = gpu;
             let mut byte_stream = byte_stream;
             let mut buf: Vec<u8> = Vec::new();
             let mut accumulated_content = String::new();
@@ -730,6 +752,9 @@ impl LlmClient for LlamaCppClient {
     }
 
     async fn generate_embeddings(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>> {
+        // Deliberately NO GPU lease: the embed slot sits in llama-swap's
+        // always-resident group and never participates in a model swap, so
+        // leasing here would only stall searches behind a queued synthesis.
         let url = format!("{}/embeddings", self.base_url);
         let body = json!({
             "model": self.embedding_model,
diff --git a/src/ai/mod.rs b/src/ai/mod.rs
index e083e1d..a4f5e14 100644
--- a/src/ai/mod.rs
+++ b/src/ai/mod.rs
@@ -3,6 +3,7 @@ pub mod backend;
 pub mod clip_client;
 pub mod daily_summary_job;
 pub mod face_client;
+pub mod gpu;
 pub mod handlers;
 pub mod insight_chat;
 pub mod insight_generator;
diff --git a/src/ai/tts.rs b/src/ai/tts.rs
index 02cfc88..f76810d 100644
--- a/src/ai/tts.rs
+++ b/src/ai/tts.rs
@@ -378,6 +378,10 @@ pub async fn tts_speech_handler(
         }));
     };
 
+    // Wait for the LLM side to release the GPU before sending — the synthesis
+    // timeout starts at send, not here (see ai::gpu).
+    let _gpu = crate::ai::gpu::tts_lease().await;
+
     match client
         .text_to_speech(&text, voice, format, exaggeration, cfg_weight, temperature)
         .await
@@ -495,7 +499,13 @@ pub async fn create_speech_job_handler(
                 return;
             }
         };
-        // Cancelled while queued — release the permit without synthesizing.
+        // Wait for the LLM side to release the GPU too (see ai::gpu) — only
+        // then does the job count as running. The synthesis timeout starts at
+        // the HTTP send below, so neither wait burns it, and the client can
+        // anchor its own deadline to the queued→running transition.
+        let _gpu = crate::ai::gpu::tts_lease().await;
+
+        // Cancelled while queued — release the permits without synthesizing.
         let cancelled = with_job(job_id, |job| {
             if job.status == TtsJobStatus::Queued {
                 job.status = TtsJobStatus::Running;