Add memory-reel backend: on-demand narrated photo slideshow

New POST /reels + GET /reels/{id} (+ /video) build an MP4 slideshow of a memory span (day/week/month), narrated by the LLM in a cloned voice. Pipeline (src/reels/): a selector resolves which photos + reel metadata, the scripter writes one narration line per photo via a single LLM call (reusing each photo's cached insight as context — no fresh vision calls, so reel generation stays off the GPU's vision slot), each line is synthesized to speech, and the renderer assembles stills + narration via ffmpeg. Jobs run in the background (mirroring the TTS speech-job registry) since a reel takes minutes; the finished MP4 is cached on disk keyed by the selection so a repeat request is instant. The segment model is media-typed (Photo today) so a video-clip segment (phase 2) and a nightly pre-render (phase 3) slot in without reworking the pipeline. Ken Burns motion is implemented but defaulted off pending a visual check on the GPU box. Supporting changes: - memories: extract gather_memory_items() so the reel selector reuses the exact window/exclusion/tz/sort logic behind /memories. - ai::tts: add synthesize_serialized() so reel narration honors the same single-GPU permit + write lease as user TTS requests. - video::ffmpeg: make get_duration_seconds() pub for narration timing. - AppState: reels_path (REELS_DIRECTORY, defaults beside preview clips). Pure logic (cache key, script parsing, ffmpeg arg/filter construction, even sampling, segment timing) is unit-tested (26 tests). The runtime path (ffmpeg render, TTS, LLM) needs a real run on the GPU host to verify end-to-end — not exercisable in CI. Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-12 22:31:08 -04:00
parent 98274c3301
commit e3f731b3b2
9 changed files with 1615 additions and 30 deletions
@@ -0,0 +1,625 @@
+//! Memory reels: render an MP4 slideshow of a selection of photos with an
+//! LLM-written, voice-cloned narration over it.
+//!
+//! Pipeline: a [`selector`] resolves *which* photos (and the reel metadata),
+//! the [`script`] module writes per-photo narration via the LLM, each line is
+//! synthesized to speech, and [`render`] assembles the stills + narration into
+//! one MP4. Jobs run in the background (mirroring the TTS speech-job registry)
+//! because a reel takes minutes; the finished MP4 is cached on disk keyed by
+//! the selection so a repeat request is instant.
+//!
+//! Phase 1 is on-demand and photos-only. The segment model is media-typed so a
+//! video-clip segment (phase 2) and a nightly pre-render (phase 3) slot in
+//! without reworking the pipeline.
+
+pub mod render;
+pub mod script;
+pub mod selector;
+
+use std::collections::HashMap;
+use std::path::{Path, PathBuf};
+use std::sync::{LazyLock, Mutex as StdMutex};
+use std::time::{Duration, Instant};
+
+use actix_files::NamedFile;
+use actix_web::{HttpRequest, HttpResponse, Responder, get, post, web};
+use chrono::DateTime;
+use serde::{Deserialize, Serialize};
+use serde_json::json;
+use std::sync::Mutex;
+use uuid::Uuid;
+
+use crate::data::Claims;
+use crate::database::{ExifDao, InsightDao};
+use crate::memories::MemoriesSpan;
+use crate::otel::extract_context_from_request;
+use crate::state::AppState;
+use selector::ReelSelector;
+
+/// The media behind one reel segment. Photos-only for now; a `Clip` variant
+/// (a section of a source video) is the phase-2 extension point.
+#[derive(Debug, Clone)]
+pub enum SegmentMedia {
+    /// A still photo: `rel_path` is relative to the root of the library
+    /// identified by `library_id` (see `resolve_image_path`, which joins and
+    /// validates the two at render time).
+    Photo { rel_path: String, library_id: i32 },
+}
+
+/// A segment before narration: which photo, when it was taken, and any cached
+/// insight to feed the scripter.
+#[derive(Debug, Clone)]
+pub struct PlannedSegment {
+    /// What will be shown for this segment.
+    pub media: SegmentMedia,
+    /// Capture time as Unix seconds (it is fed to `DateTime::from_timestamp`
+    /// in `date_label`); `None` when the photo is undated.
+    pub date: Option<i64>,
+    /// Title of the photo's cached insight, when one is available.
+    pub insight_title: Option<String>,
+    /// Summary of the photo's cached insight, when one is available.
+    pub insight_summary: Option<String>,
+}
+
+impl PlannedSegment {
+    /// Prompt-friendly rendering of the segment's date, e.g. "June 12, 2019".
+    ///
+    /// The stored value is read as Unix seconds and formatted in UTC. Yields
+    /// `None` when the segment is undated or the timestamp cannot be
+    /// represented by chrono.
+    pub fn date_label(&self) -> Option<String> {
+        self.date
+            .and_then(|secs| DateTime::from_timestamp(secs, 0))
+            .map(|utc| utc.format("%B %-d, %Y").to_string())
+    }
+}
+
+/// Reel-wide metadata the scripter uses for framing.
+#[derive(Debug, Clone)]
+pub struct ReelMeta {
+    /// The memory window (day / week / month) the reel was built for.
+    pub span: MemoriesSpan,
+    /// NOTE(review): presumably the calendar years the selected photos come
+    /// from — populated by the selector, which is not visible here; confirm.
+    pub years: Vec<i32>,
+}
+
+impl ReelMeta {
+    /// Spoken-language wording for the reel's span, suitable for dropping into
+    /// the narration prompt (for example "on this day" for a day span).
+    pub fn span_phrase(&self) -> &'static str {
+        use MemoriesSpan::{Day, Month, Week};
+
+        let phrase = match self.span {
+            Day => "on this day",
+            Week => "this week",
+            Month => "this month",
+        };
+        phrase
+    }
+}
+
+// --- Job registry ------------------------------------------------------------
+//
+// In-memory, same shape as the TTS speech-job registry: a reel takes minutes,
+// too long to hold one HTTP request from a phone. POST /reels returns a job id;
+// the client polls GET /reels/{id} until the video URL appears. The heavy
+// artifact (the MP4) lives on disk, not in this map — jobs only carry status +
+// the output path. State is intentionally not durable across restarts; the
+// on-disk cache is what makes a repeat request cheap, not the registry.
+
+/// Lifecycle state of a reel job, serialized in snake_case for the client.
+#[derive(Clone, Copy, PartialEq, Eq, Debug, Serialize)]
+#[serde(rename_all = "snake_case")]
+pub enum ReelJobStatus {
+    /// Registered, but the pipeline has not reported a stage yet.
+    Queued,
+    /// The pipeline is working; set by the first `set_stage` call.
+    Running,
+    /// Terminal: the MP4 is available at the job's output path.
+    Done,
+    /// Terminal: the pipeline failed; the job carries an error message.
+    Error,
+}
+
+impl ReelJobStatus {
+    /// Whether the job has reached a final state (`Done` or `Error`) and must
+    /// no longer be mutated by stage or finish updates.
+    fn is_terminal(self) -> bool {
+        match self {
+            Self::Done | Self::Error => true,
+            Self::Queued | Self::Running => false,
+        }
+    }
+}
+
+/// One entry of the in-memory job registry: status plus a pointer to the
+/// output file, never the video bytes themselves.
+struct ReelJob {
+    /// Current lifecycle state.
+    status: ReelJobStatus,
+    /// Coarse progress label for the client ("scripting", "narrating", …).
+    stage: &'static str,
+    /// Reel title; filled in when the job finishes successfully or is served
+    /// from cache with a readable sidecar.
+    title: Option<String>,
+    /// Location of the finished MP4; `None` until the job is `Done`.
+    output_path: Option<PathBuf>,
+    /// Failure message for the client; set only for `Error` jobs.
+    error: Option<String>,
+    /// When the job was registered (drives the max-age sweep).
+    created_at: Instant,
+    /// When the job reached a terminal state (drives the result-TTL sweep).
+    finished_at: Option<Instant>,
+    /// Handle used to abort the background task if the job is swept while
+    /// still running; cleared once the job finishes.
+    abort: Option<tokio::task::AbortHandle>,
+}
+
+/// How long a finished job stays in the registry after reaching a terminal
+/// state, so a client that lost connectivity can still collect the result.
+const REEL_JOB_RESULT_TTL: Duration = Duration::from_secs(30 * 60);
+/// Hard cap on a job's age since creation, finished or not: anything older is
+/// dropped (aborted first if somehow still running). Both limits are enforced
+/// lazily by `sweep_stale_jobs`, which runs on each create.
+const REEL_JOB_MAX_AGE: Duration = Duration::from_secs(60 * 60);
+
+/// Process-wide job registry keyed by job id. Guarded by a std (blocking)
+/// mutex, so every access should be short and never held across an `.await`.
+static REEL_JOBS: LazyLock<StdMutex<HashMap<Uuid, ReelJob>>> =
+    LazyLock::new(|| StdMutex::new(HashMap::new()));
+
+/// Drop registry entries that have outlived their usefulness as of `now`.
+///
+/// A job is removed when it is older than `REEL_JOB_MAX_AGE` (its background
+/// task is aborted first if a handle is still attached), or when it finished
+/// at least `REEL_JOB_RESULT_TTL` ago.
+fn sweep_stale_jobs(jobs: &mut HashMap<Uuid, ReelJob>, now: Instant) {
+    jobs.retain(|_, job| {
+        // Age cap applies regardless of state; a still-running task is stopped.
+        if now.duration_since(job.created_at) >= REEL_JOB_MAX_AGE {
+            if let Some(handle) = job.abort.take() {
+                handle.abort();
+            }
+            return false;
+        }
+        // Otherwise keep the job unless its finished result has expired.
+        match job.finished_at {
+            Some(done_at) => now.duration_since(done_at) < REEL_JOB_RESULT_TTL,
+            None => true,
+        }
+    });
+}
+
+/// Run `f` against the job registered under `id` while holding the registry
+/// lock. Returns `f`'s result, or `None` when no such job exists.
+fn with_job<R>(id: Uuid, f: impl FnOnce(&mut ReelJob) -> R) -> Option<R> {
+    let mut registry = REEL_JOBS.lock().unwrap();
+    let job = registry.get_mut(&id)?;
+    Some(f(job))
+}
+
+/// Mark the job as running and record its current pipeline stage. Jobs that
+/// already reached a terminal state (or no longer exist) are left untouched.
+fn set_stage(id: Uuid, stage: &'static str) {
+    with_job(id, |job| {
+        if job.status.is_terminal() {
+            return;
+        }
+        job.stage = stage;
+        job.status = ReelJobStatus::Running;
+    });
+}
+
+/// Record a job's final outcome. Only the first terminal write takes effect:
+/// a job that is already `Done`/`Error` (or missing from the registry) is not
+/// modified. The abort handle is released because the task is finishing.
+fn finish_job(
+    id: Uuid,
+    status: ReelJobStatus,
+    title: Option<String>,
+    output_path: Option<PathBuf>,
+    error: Option<String>,
+) {
+    with_job(id, |job| {
+        if !job.status.is_terminal() {
+            let stage_label = if status == ReelJobStatus::Done {
+                "done"
+            } else {
+                "error"
+            };
+            job.status = status;
+            job.stage = stage_label;
+            job.title = title;
+            job.output_path = output_path;
+            job.error = error;
+            job.finished_at = Some(Instant::now());
+            job.abort = None;
+        }
+    });
+}
+
+// --- On-disk cache -----------------------------------------------------------
+
+/// Render version: bump to invalidate every cached reel after a rendering /
+/// scripting change that should produce a fresh result. It is the leading
+/// `v{N}` component of the string hashed by `cache_key`, so a bump changes
+/// every key (old files are simply no longer matched, not deleted).
+const RENDER_VERSION: u32 = 1;
+
+/// Derive the on-disk cache key for a reel.
+///
+/// The key covers the render version, the selector's descriptor, the voice
+/// ("default" when none is given) and the ordered list of media — everything
+/// that decides *which* media is shown and *how* it is voiced, but not the
+/// (non-deterministic) narration text. The result is the lowercase hex blake3
+/// digest of that description, which is safe to use as a file name.
+fn cache_key(selector: &ReelSelector, media: &[SegmentMedia], voice: Option<&str>) -> String {
+    // One `library:path|` entry per segment, in reel order.
+    let media_part: String = media
+        .iter()
+        .map(|m| match m {
+            SegmentMedia::Photo {
+                rel_path,
+                library_id,
+            } => format!("{library_id}:{rel_path}|"),
+        })
+        .collect();
+    let description = format!(
+        "v{}|{}|voice={}|{}",
+        RENDER_VERSION,
+        selector.descriptor(),
+        voice.unwrap_or("default"),
+        media_part
+    );
+    blake3::hash(description.as_bytes()).to_hex().to_string()
+}
+
+/// Location of the cached MP4 for `key` inside the configured reels directory.
+fn reel_mp4_path(app_state: &AppState, key: &str) -> PathBuf {
+    let mut mp4 = Path::new(&app_state.reels_path).to_path_buf();
+    mp4.push(format!("{key}.mp4"));
+    mp4
+}
+
+/// Location of the JSON sidecar (reel title) that sits next to the cached MP4
+/// for `key`.
+fn reel_sidecar_path(app_state: &AppState, key: &str) -> PathBuf {
+    let mut sidecar = Path::new(&app_state.reels_path).to_path_buf();
+    sidecar.push(format!("{key}.json"));
+    sidecar
+}
+
+/// Metadata persisted as `{key}.json` beside a cached reel so a later cache
+/// hit can report the title without re-running the pipeline.
+#[derive(Serialize, Deserialize)]
+struct ReelSidecar {
+    /// Title produced by the scripter for this reel.
+    title: String,
+}
+
+// --- HTTP types --------------------------------------------------------------
+
+/// JSON body of POST /reels. Every field is optional.
+#[derive(Debug, Deserialize)]
+pub struct CreateReelRequest {
+    /// Memory window to build the reel from; the handler defaults to `Day`.
+    #[serde(default)]
+    pub span: Option<MemoriesSpan>,
+    /// Client timezone offset in minutes; the handler defaults to 0.
+    #[serde(default)]
+    pub timezone_offset_minutes: Option<i32>,
+    /// Optional library filter, passed through to the selector unchanged.
+    #[serde(default)]
+    pub library: Option<String>,
+    /// Cloned TTS voice for the narration; server default when omitted (an
+    /// empty string is treated as omitted).
+    #[serde(default)]
+    pub voice: Option<String>,
+    /// Cap on photos in the reel; defaults to `selector::DEFAULT_MAX_SEGMENTS`.
+    /// NOTE(review): no clamping happens in this module — presumably the
+    /// selector bounds the value; confirm.
+    #[serde(default)]
+    pub max_segments: Option<usize>,
+}
+
+/// Body of the 202 response to POST /reels.
+#[derive(Debug, Serialize)]
+pub struct ReelJobCreatedResponse {
+    /// Id to poll via GET /reels/{id}.
+    pub job_id: String,
+    /// `Done` when served from cache, `Queued` when a background job started.
+    pub status: ReelJobStatus,
+}
+
+/// Body of GET /reels/{id}. Optional fields are omitted from the JSON when
+/// they are `None`.
+#[derive(Debug, Serialize)]
+pub struct ReelStatusResponse {
+    /// The job id exactly as the client supplied it in the URL.
+    pub job_id: String,
+    /// Current lifecycle state.
+    pub status: ReelJobStatus,
+    /// Coarse progress label (e.g. "scripting", "narrating", "rendering").
+    pub stage: String,
+    /// Reel title, once known.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub title: Option<String>,
+    /// Relative URL of the finished MP4; present only for `Done` jobs.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub video_url: Option<String>,
+    /// Failure message; present only for `Error` jobs.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub error: Option<String>,
+}
+
+// --- Handlers ----------------------------------------------------------------
+
+/// POST /reels — start (or instantly serve from cache) a memory reel for the
+/// requested span. Returns 202 + a job id; the client polls GET /reels/{id}.
+///
+/// Responses:
+/// - 503 when no LLM/TTS backend is configured,
+/// - 400 when the selector rejects the request,
+/// - 422 when the span contains no photos,
+/// - 202 otherwise, with status `done` on a cache hit or `queued` when a
+///   background job was started.
+///
+/// Every error body is JSON of the form `{"error": "..."}`.
+#[post("/reels")]
+pub async fn create_reel_handler(
+    http_request: HttpRequest,
+    _claims: Claims,
+    req: web::Json<CreateReelRequest>,
+    app_state: web::Data<AppState>,
+    exif_dao: web::Data<Mutex<Box<dyn ExifDao>>>,
+    insight_dao: web::Data<Mutex<Box<dyn InsightDao>>>,
+) -> impl Responder {
+    let span_context = extract_context_from_request(&http_request);
+
+    if app_state.llamacpp.is_none() {
+        return HttpResponse::ServiceUnavailable().json(json!({
+            "error": "Reel narration needs the LLM/TTS backend (set LLAMA_SWAP_URL)"
+        }));
+    }
+
+    let span = req.span.unwrap_or(MemoriesSpan::Day);
+    let max_segments = req.max_segments.unwrap_or(selector::DEFAULT_MAX_SEGMENTS);
+    let selector = ReelSelector::Memories {
+        span,
+        tz_offset_minutes: req.timezone_offset_minutes.unwrap_or(0),
+        library: req.library.clone(),
+        max_segments,
+    };
+
+    // Cheap pass: resolve the media set for the cache key and the emptiness
+    // check. Insight enrichment + scripting happen in the background job.
+    let (planned, meta) = match selector::resolve(&app_state, &exif_dao, &span_context, &selector) {
+        Ok(r) => r,
+        // Same JSON error shape as every other failure in this module, so the
+        // client needs a single error parser.
+        Err(msg) => return HttpResponse::BadRequest().json(json!({ "error": msg })),
+    };
+    if planned.is_empty() {
+        return HttpResponse::UnprocessableEntity().json(json!({
+            "error": "No photo memories found for this span"
+        }));
+    }
+
+    let media: Vec<SegmentMedia> = planned.iter().map(|p| p.media.clone()).collect();
+    // An empty voice string means "use the server default".
+    let voice = req.voice.clone().filter(|s| !s.is_empty());
+    let key = cache_key(&selector, &media, voice.as_deref());
+
+    let job_id = Uuid::new_v4();
+
+    // Cache hit: register an already-Done job pointing at the existing MP4 so
+    // the client's first poll returns the video URL immediately.
+    let mp4 = reel_mp4_path(&app_state, &key);
+    if mp4.exists() {
+        // The sidecar is optional: a missing or unreadable one only costs the
+        // title, never the cache hit.
+        let title = std::fs::read(reel_sidecar_path(&app_state, &key))
+            .ok()
+            .and_then(|b| serde_json::from_slice::<ReelSidecar>(&b).ok())
+            .map(|s| s.title);
+        let now = Instant::now();
+        let mut jobs = REEL_JOBS.lock().unwrap();
+        sweep_stale_jobs(&mut jobs, now);
+        jobs.insert(
+            job_id,
+            ReelJob {
+                status: ReelJobStatus::Done,
+                stage: "done",
+                title,
+                output_path: Some(mp4),
+                error: None,
+                created_at: now,
+                finished_at: Some(now),
+                abort: None,
+            },
+        );
+        return HttpResponse::Accepted().json(ReelJobCreatedResponse {
+            job_id: job_id.to_string(),
+            status: ReelJobStatus::Done,
+        });
+    }
+
+    // Register the job *before* spawning so the task's first stage update
+    // always finds its registry entry.
+    {
+        let now = Instant::now();
+        let mut jobs = REEL_JOBS.lock().unwrap();
+        sweep_stale_jobs(&mut jobs, now);
+        jobs.insert(
+            job_id,
+            ReelJob {
+                status: ReelJobStatus::Queued,
+                stage: "queued",
+                title: None,
+                output_path: None,
+                error: None,
+                created_at: now,
+                finished_at: None,
+                abort: None,
+            },
+        );
+    }
+
+    let state = app_state.clone();
+    let insight_dao = insight_dao.clone();
+    let handle = tokio::spawn(async move {
+        match run_reel_job(&state, &insight_dao, job_id, planned, meta, voice, &key).await {
+            Ok((title, path)) => {
+                finish_job(job_id, ReelJobStatus::Done, Some(title), Some(path), None)
+            }
+            Err(e) => {
+                log::error!("reel job {job_id} failed: {e:?}");
+                finish_job(
+                    job_id,
+                    ReelJobStatus::Error,
+                    None,
+                    None,
+                    Some(format!("{e}")),
+                )
+            }
+        }
+    });
+    // Attach the abort handle only while the job is still live: if the task
+    // already finished, `finish_job` cleared the handle and it must stay so.
+    with_job(job_id, |job| {
+        if !job.status.is_terminal() {
+            job.abort = Some(handle.abort_handle());
+        }
+    });
+
+    HttpResponse::Accepted().json(ReelJobCreatedResponse {
+        job_id: job_id.to_string(),
+        status: ReelJobStatus::Queued,
+    })
+}
+
+/// GET /reels/{id} — report a reel job's status and stage. A `video_url` is
+/// included once the job is done; an unknown or expired id yields 404 and a
+/// malformed id yields 400.
+#[get("/reels/{id}")]
+pub async fn reel_status_handler(_claims: Claims, path: web::Path<String>) -> impl Responder {
+    let id_str = path.into_inner();
+    let id = match Uuid::parse_str(&id_str) {
+        Ok(parsed) => parsed,
+        Err(_) => return HttpResponse::BadRequest().json(json!({ "error": "invalid job id" })),
+    };
+
+    // Snapshot the job under the registry lock, then answer without it.
+    let snapshot = with_job(id, |job| {
+        let video_url = if job.status == ReelJobStatus::Done {
+            Some(format!("/reels/{id_str}/video"))
+        } else {
+            None
+        };
+        ReelStatusResponse {
+            job_id: id_str.clone(),
+            status: job.status,
+            stage: job.stage.to_string(),
+            title: job.title.clone(),
+            video_url,
+            error: job.error.clone(),
+        }
+    });
+
+    let Some(body) = snapshot else {
+        return HttpResponse::NotFound().json(json!({ "error": "job not found or expired" }));
+    };
+    HttpResponse::Ok().json(body)
+}
+
+/// GET /reels/{id}/video — serve the finished MP4 for a job. The file is
+/// returned through `NamedFile`, which handles range requests so the mobile
+/// player can seek. 400 for a malformed id, 404 when the job is unknown, not
+/// finished yet, or its file has gone missing.
+#[get("/reels/{id}/video")]
+pub async fn reel_video_handler(
+    _claims: Claims,
+    request: HttpRequest,
+    path: web::Path<String>,
+) -> impl Responder {
+    let id_str = path.into_inner();
+    let id = match Uuid::parse_str(&id_str) {
+        Ok(parsed) => parsed,
+        Err(_) => return HttpResponse::BadRequest().json(json!({ "error": "invalid job id" })),
+    };
+
+    // Outer Option: does the job exist? Inner Option: does it have output yet?
+    let path = match with_job(id, |job| job.output_path.clone()) {
+        Some(Some(mp4)) => mp4,
+        _ => return HttpResponse::NotFound().json(json!({ "error": "reel not ready" })),
+    };
+
+    NamedFile::open(&path)
+        .map(|file| file.into_response(&request))
+        .unwrap_or_else(|e| {
+            log::error!("opening reel mp4 {path:?} failed: {e:?}");
+            HttpResponse::NotFound().json(json!({ "error": "reel file missing" }))
+        })
+}
+
+// --- Pipeline ----------------------------------------------------------------
+
+/// Run the full reel pipeline: enrich → script → narrate → render → concat,
+/// then publish the MP4 into the cache. Returns (title, mp4_path).
+///
+/// Individual segments that fail (unresolvable image, TTS error, render
+/// error) are logged and skipped; the job only fails when scripting fails, no
+/// segment renders at all, or the final MP4 cannot be produced/published.
+/// The title sidecar is best-effort and never fails the job.
+///
+/// # Errors
+/// Returns an error when the LLM/TTS backend is not configured, script
+/// generation fails, the work or reels directory cannot be created, every
+/// segment was skipped, or concatenating / publishing the MP4 fails.
+async fn run_reel_job(
+    app_state: &AppState,
+    insight_dao: &Mutex<Box<dyn InsightDao>>,
+    job_id: Uuid,
+    mut planned: Vec<PlannedSegment>,
+    meta: ReelMeta,
+    voice: Option<String>,
+    key: &str,
+) -> anyhow::Result<(String, PathBuf)> {
+    use anyhow::{Context, anyhow};
+
+    let client = app_state
+        .llamacpp
+        .as_ref()
+        .ok_or_else(|| anyhow!("TTS/LLM backend not configured"))?
+        .clone();
+
+    // 1. Enrich with cached insights, then script (one LLM call).
+    set_stage(job_id, "scripting");
+    let span_context = opentelemetry::Context::new();
+    selector::enrich(insight_dao, &span_context, &mut planned);
+    let script = script::generate_script(&client, &meta, &planned).await?;
+
+    // 2. Narrate each line to speech and 3. render each photo segment. A
+    // segment whose audio or render fails is skipped (logged) rather than
+    // sinking the whole reel — handles an odd HEIC/corrupt file gracefully.
+    set_stage(job_id, "narrating");
+    // Scratch directory for per-segment audio/video; removed when `work` drops.
+    let work = tempfile::tempdir().context("creating reel work dir")?;
+    let nvenc = render::is_nvenc_available().await;
+    let opts = render::SegmentOpts {
+        nvenc,
+        ..Default::default()
+    };
+
+    let mut segment_files: Vec<String> = Vec::new();
+    // `zip` stops at the shorter side, so photos without a script line (or
+    // lines without a photo) are left out of the reel.
+    for (i, (seg, line)) in planned.iter().zip(script.lines.iter()).enumerate() {
+        let image_path = match resolve_image_path(app_state, &seg.media) {
+            Some(p) => p,
+            None => {
+                log::warn!("reel {job_id}: skipping segment {i}, image path unresolved");
+                continue;
+            }
+        };
+
+        // Narration and rendering alternate per segment; keep the reported
+        // stage in step with what is actually happening.
+        set_stage(job_id, "narrating");
+        let audio_bytes =
+            match crate::ai::tts::synthesize_serialized(&client, line, voice.as_deref(), "wav")
+                .await
+            {
+                Ok(b) => b,
+                Err(e) => {
+                    log::warn!("reel {job_id}: skipping segment {i}, TTS failed: {e}");
+                    continue;
+                }
+            };
+        let audio_path = work.path().join(format!("narration_{i:03}.wav"));
+        if let Err(e) = tokio::fs::write(&audio_path, &audio_bytes).await {
+            log::warn!("reel {job_id}: skipping segment {i}, writing audio failed: {e}");
+            continue;
+        }
+
+        // If the narration length cannot be probed, fall back to the minimum
+        // segment length instead of dropping the segment.
+        let narration_secs =
+            crate::video::ffmpeg::get_duration_seconds(&audio_path.to_string_lossy())
+                .await
+                .ok()
+                .flatten()
+                .unwrap_or(render::MIN_SEGMENT_SECONDS);
+        let duration = render::segment_duration(narration_secs);
+
+        set_stage(job_id, "rendering");
+        let seg_out = work.path().join(format!("seg_{i:03}.mp4"));
+        if let Err(e) =
+            render::render_segment(&image_path, &audio_path, &seg_out, duration, &opts).await
+        {
+            log::warn!("reel {job_id}: skipping segment {i}, render failed: {e}");
+            continue;
+        }
+        segment_files.push(seg_out.to_string_lossy().to_string());
+    }
+
+    if segment_files.is_empty() {
+        return Err(anyhow!("no segments rendered successfully"));
+    }
+
+    // 4. Concat into the cache. Write to a temp name in the reels dir, then
+    // rename atomically (same filesystem) so a reader never sees a partial.
+    tokio::fs::create_dir_all(&app_state.reels_path)
+        .await
+        .context("creating reels dir")?;
+    let final_path = reel_mp4_path(app_state, key);
+    // The temp name includes the job id: two concurrent jobs for the same
+    // selection share `key` and must not write into the same temp file.
+    let tmp_path = final_path.with_extension(format!("{job_id}.mp4.tmp"));
+    if let Err(e) = render::concat_segments(&segment_files, &tmp_path).await {
+        // Best-effort cleanup so failed jobs do not litter the cache dir.
+        let _ = tokio::fs::remove_file(&tmp_path).await;
+        return Err(e.into());
+    }
+    if let Err(e) = tokio::fs::rename(&tmp_path, &final_path).await {
+        let _ = tokio::fs::remove_file(&tmp_path).await;
+        return Err(anyhow::Error::new(e).context("publishing reel mp4"));
+    }
+
+    // Sidecar carries the title so a future cache hit can return it without
+    // re-running the pipeline. The MP4 is already published at this point, so
+    // a sidecar failure is logged instead of failing the job.
+    let sidecar_path = reel_sidecar_path(app_state, key);
+    match serde_json::to_vec(&ReelSidecar {
+        title: script.title.clone(),
+    }) {
+        Ok(bytes) => {
+            if let Err(e) = tokio::fs::write(&sidecar_path, bytes).await {
+                log::warn!("reel {job_id}: writing sidecar {sidecar_path:?} failed: {e}");
+            }
+        }
+        Err(e) => log::warn!("reel {job_id}: serializing sidecar failed: {e}"),
+    }
+
+    Ok((script.title, final_path))
+}
+
+/// Turn a segment's library-relative media path into an absolute path.
+///
+/// Looks up the owning library by id and asks `files::is_valid_full_path` to
+/// join and validate the path against that library's root. Returns `None`
+/// when the library is unknown or validation rejects the path.
+fn resolve_image_path(app_state: &AppState, media: &SegmentMedia) -> Option<PathBuf> {
+    match media {
+        SegmentMedia::Photo {
+            rel_path,
+            library_id,
+        } => {
+            let library = app_state.library_by_id(*library_id)?;
+            crate::files::is_valid_full_path(&library.root_path, rel_path, false)
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// Shorthand for a photo segment's media.
+    fn photo(p: &str, lib: i32) -> SegmentMedia {
+        SegmentMedia::Photo {
+            rel_path: p.to_string(),
+            library_id: lib,
+        }
+    }
+
+    /// Baseline selector used by the cache-key tests.
+    fn day_selector() -> ReelSelector {
+        ReelSelector::Memories {
+            span: MemoriesSpan::Day,
+            tz_offset_minutes: 0,
+            library: None,
+            max_segments: 24,
+        }
+    }
+
+    /// Minimal registry entry with no title, output, error or abort handle.
+    fn job(status: ReelJobStatus, created_at: Instant, finished_at: Option<Instant>) -> ReelJob {
+        ReelJob {
+            status,
+            stage: "test",
+            title: None,
+            output_path: None,
+            error: None,
+            created_at,
+            finished_at,
+            abort: None,
+        }
+    }
+
+    #[test]
+    fn cache_key_is_stable_for_same_inputs() {
+        let media = vec![photo("a.jpg", 1), photo("b.jpg", 1)];
+        let k1 = cache_key(&day_selector(), &media, Some("grandma"));
+        let k2 = cache_key(&day_selector(), &media, Some("grandma"));
+        assert_eq!(k1, k2);
+        // 64-hex blake3.
+        assert_eq!(k1.len(), 64);
+        assert!(k1.chars().all(|c| c.is_ascii_hexdigit()));
+    }
+
+    #[test]
+    fn cache_key_changes_with_media_order_voice_and_selector() {
+        let media = vec![photo("a.jpg", 1), photo("b.jpg", 1)];
+        let reordered = vec![photo("b.jpg", 1), photo("a.jpg", 1)];
+        let base = cache_key(&day_selector(), &media, Some("grandma"));
+        // Order matters (the reel sequence differs).
+        assert_ne!(
+            base,
+            cache_key(&day_selector(), &reordered, Some("grandma"))
+        );
+        // Voice matters.
+        assert_ne!(base, cache_key(&day_selector(), &media, Some("dad")));
+        assert_ne!(base, cache_key(&day_selector(), &media, None));
+        // Span matters.
+        let week = ReelSelector::Memories {
+            span: MemoriesSpan::Week,
+            tz_offset_minutes: 0,
+            library: None,
+            max_segments: 24,
+        };
+        assert_ne!(base, cache_key(&week, &media, Some("grandma")));
+    }
+
+    #[test]
+    fn span_phrase_maps_each_span() {
+        let mk = |span| ReelMeta {
+            span,
+            years: vec![],
+        };
+        assert_eq!(mk(MemoriesSpan::Day).span_phrase(), "on this day");
+        assert_eq!(mk(MemoriesSpan::Week).span_phrase(), "this week");
+        assert_eq!(mk(MemoriesSpan::Month).span_phrase(), "this month");
+    }
+
+    #[test]
+    fn date_label_formats_or_none() {
+        let seg = PlannedSegment {
+            media: photo("a.jpg", 1),
+            date: Some(1_560_384_000), // 2019-06-13 00:00:00 UTC
+            insight_title: None,
+            insight_summary: None,
+        };
+        // Pin the exact format (full month name, unpadded day, 4-digit year),
+        // not just the year.
+        assert_eq!(seg.date_label().as_deref(), Some("June 13, 2019"));
+
+        let undated = PlannedSegment {
+            media: photo("a.jpg", 1),
+            date: None,
+            insight_title: None,
+            insight_summary: None,
+        };
+        assert_eq!(undated.date_label(), None);
+    }
+
+    #[test]
+    fn only_done_and_error_are_terminal() {
+        assert!(ReelJobStatus::Done.is_terminal());
+        assert!(ReelJobStatus::Error.is_terminal());
+        assert!(!ReelJobStatus::Queued.is_terminal());
+        assert!(!ReelJobStatus::Running.is_terminal());
+    }
+
+    #[test]
+    fn sweep_drops_expired_results_and_overaged_jobs() {
+        let start = Instant::now();
+        let running = Uuid::new_v4();
+        let finished = Uuid::new_v4();
+        let mut jobs = HashMap::new();
+        jobs.insert(running, job(ReelJobStatus::Running, start, None));
+        jobs.insert(finished, job(ReelJobStatus::Done, start, Some(start)));
+
+        // Just before the result TTL nothing is removed.
+        sweep_stale_jobs(
+            &mut jobs,
+            start + REEL_JOB_RESULT_TTL - Duration::from_secs(1),
+        );
+        assert!(jobs.contains_key(&running));
+        assert!(jobs.contains_key(&finished));
+
+        // At the TTL the finished job expires; the running one is kept.
+        sweep_stale_jobs(&mut jobs, start + REEL_JOB_RESULT_TTL);
+        assert!(jobs.contains_key(&running));
+        assert!(!jobs.contains_key(&finished));
+
+        // At the max age even an unfinished job is dropped.
+        sweep_stale_jobs(&mut jobs, start + REEL_JOB_MAX_AGE);
+        assert!(jobs.is_empty());
+    }
+}