f707353807
Implement end-to-end nightly pre-generation of memory reels with agentic
scripting that grounds narration in calendar, location, messages, and RAG.
Sections A-E from the plan:
A. Extract produce_reel pipeline core from run_reel_job with
ScripterMode::Fast/Agentic and progress callbacks.
B. Agentic scripter: factor run_readonly_tool_loop from the insight
generator, build read-only tool gate, prompt builder with GPS, and
generate_script_agentic with fallback to fast path.
C. Precomputed reels ledger (SQLite table + DAO), GET /reels/precomputed
handler with validity gate, GET /reels/by-key/{key}/video streaming,
and normalize_library_key helper.
D. Nightly scheduler: spawn_pregen_scheduler with configurable hour,
run_pregen_batch (day/week/month spans), pregen_one with dedup and
disk-check, secs_until_next_run_hour time math.
E. user_ai_prefs passive mirror table + DAO for param capture in
create_reel_handler and replay in the scheduler.
Also fixes resolve_library_param signature to take &[Library] and adds
resolve_library_param_state wrapper for AppState callers.
New files: migrations/2026-06-13-000000_add_precomputed_reels/,
migrations/2026-06-13-000010_add_user_ai_prefs/,
src/database/precomputed_reel_dao.rs,
src/database/user_ai_prefs_dao.rs
487 lines
18 KiB
Rust
487 lines
18 KiB
Rust
//! Narration scripting for memory reels.
|
|
//!
|
|
//! One LLM call turns the planned beats (each carrying its date and, where
|
|
//! available, its cached insight) into a short first-person narration line per
|
|
//! beat plus a title for the reel. A beat may show several photos in a quick
|
|
//! burst, so a line narrates the *moment*, not a single frame. We reuse the
|
|
//! cached insight summary as the richest signal rather than re-running vision
|
|
//! at reel time — that keeps reel generation off the GPU's vision slot.
|
|
//!
|
|
//! The prompt builder and response parser are pure so the contract is
|
|
//! unit-testable; `generate_script` wires them to the LLM client.
|
|
//!
|
|
//! The agentic scripter (pre-generation) resolves the backend through the
|
|
//! InsightGenerator, builds a read-only tool set, and runs a tool loop to
|
|
//! ground the narration in retrieved context before asking for the final JSON.
|
|
|
|
use anyhow::{Context, Result};
|
|
use std::sync::Arc;
|
|
|
|
use super::{PlannedBeat, ReelMeta};
|
|
use crate::ai::backend::{BackendKind, SamplingOverrides};
|
|
use crate::ai::insight_generator::InsightGenerator;
|
|
use crate::ai::llamacpp::LlamaCppClient;
|
|
use crate::ai::llm_client::{LlmClient, Tool};
|
|
use crate::ai::ollama::ChatMessage;
|
|
|
|
/// The narration for a whole reel: a title and one line per beat, in order.
///
/// `parse_script_response` always produces exactly as many `lines` as the
/// beat count it was given, substituting a neutral line where the model gave
/// nothing usable.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ReelScript {
    /// Short title for the reel.
    pub title: String,
    /// One narration line per beat, in playback order.
    pub lines: Vec<String>,
}
|
|
|
|
/// System prompt for the fast (single-call) scripter; `build_script_messages`
/// returns it unchanged as the system half of the prompt pair.
///
/// Each trailing `\` joins the next source line and drops its leading
/// whitespace, so the literal is a single paragraph with no line breaks.
const SYSTEM_PROMPT: &str = "You are narrating a personal memory reel — a short \
    slideshow of someone's own photos set to a spoken voiceover. Write warm, \
    specific, first-person narration as if the person is gently looking back on \
    their own memories. Each line plays over one moment, which may be a quick burst \
    of several photos, so narrate the moment as a whole rather than a single frame. \
    Be concrete and grounded in the details given; never invent names, places, or \
    events that aren't supported. Keep each line to one or two short sentences that \
    can be read aloud in a few seconds. Avoid generic filler like \"what a \
    wonderful day\" — if you have little to go on, simply describe the moment \
    plainly.";
|
|
|
|
/// Agentic scripter system prompt: the same narration brief as
/// `SYSTEM_PROMPT`, followed by a second paragraph telling the model it may
/// call read-only tools to ground each line and must answer with JSON only.
///
/// Each trailing `\` joins the next source line and drops its leading
/// whitespace; the only line breaks in the value are the explicit `\n\n`.
///
/// NOTE(review): the tools named here are a subset of the allowlist in
/// `generate_script_agentic` (for example `search_rag` and
/// `get_calendar_events` are allowed there but not mentioned here) — confirm
/// whether that is intentional.
const AGENTIC_SYSTEM_PROMPT: &str = "You are narrating a personal memory reel — a short \
    slideshow of someone's own photos set to a spoken voiceover. Write warm, \
    specific, first-person narration as if the person is gently looking back on \
    their own memories. Each line plays over one moment, which may be a quick burst \
    of several photos, so narrate the moment as a whole rather than a single frame. \
    Be concrete and grounded in the details given; never invent names, places, or \
    events that aren't supported. Keep each line to one or two short sentences that \
    can be read aloud in a few seconds. Avoid generic filler like \"what a \
    wonderful day\" — if you have little to go on, simply describe the moment \
    plainly.\n\nYou may call read-only tools (search_messages, get_file_tags, \
    reverse_geocode, get_current_datetime, recall_entities, recall_facts_for_photo, \
    recall_facts_for_entity) to ground each line in real context. Never invent \
    details. Return ONLY the JSON object, no prose or code fences.";
|
|
|
|
/// Upper bound on agentic tool-loop iterations during pre-generation.
///
/// Reads `REEL_PREGEN_MAX_TOOL_ITERS`; the value is trimmed and must parse as
/// a positive `usize`. An unset, unparsable, or zero value yields the default
/// of 8.
fn reel_pregen_max_tool_iters() -> usize {
    const DEFAULT_ITERS: usize = 8;

    match std::env::var("REEL_PREGEN_MAX_TOOL_ITERS") {
        Ok(raw) => match raw.trim().parse::<usize>() {
            Ok(iters) if iters > 0 => iters,
            _ => DEFAULT_ITERS,
        },
        Err(_) => DEFAULT_ITERS,
    }
}
|
|
|
|
/// Build the (system, user) prompt pair for the scripter. The user message
|
|
/// describes each beat in order and asks for strict JSON back.
|
|
pub fn build_script_messages(meta: &ReelMeta, beats: &[PlannedBeat]) -> (String, String) {
|
|
let mut user = String::new();
|
|
user.push_str(&format!(
|
|
"This reel has {} moments surfaced as memories {}.\n\n",
|
|
beats.len(),
|
|
meta.span_phrase()
|
|
));
|
|
if !meta.years.is_empty() {
|
|
let years: Vec<String> = meta.years.iter().map(|y| y.to_string()).collect();
|
|
user.push_str(&format!("They span the years: {}.\n\n", years.join(", ")));
|
|
}
|
|
user.push_str("Moments, in the order they will appear:\n");
|
|
for (i, beat) in beats.iter().enumerate() {
|
|
user.push_str(&format!("\n[{}]", i + 1));
|
|
if let Some(date) = beat.date_label() {
|
|
user.push_str(&format!(" {date}"));
|
|
}
|
|
if beat.is_clip() {
|
|
user.push_str(" (a video clip)");
|
|
} else if beat.media.len() > 1 {
|
|
user.push_str(&format!(" (a burst of {} photos)", beat.media.len()));
|
|
}
|
|
user.push('\n');
|
|
match (&beat.insight_title, &beat.insight_summary) {
|
|
(Some(t), Some(s)) if !s.trim().is_empty() => {
|
|
user.push_str(&format!(" Known context: {t} — {s}\n"));
|
|
}
|
|
(Some(t), _) => user.push_str(&format!(" Known context: {t}\n")),
|
|
(_, Some(s)) if !s.trim().is_empty() => {
|
|
user.push_str(&format!(" Known context: {s}\n"));
|
|
}
|
|
_ => user.push_str(" (no extra context — narrate plainly from the date)\n"),
|
|
}
|
|
}
|
|
user.push_str(&format!(
|
|
"\nReturn ONLY a JSON object, no prose or code fences, shaped exactly:\n\
|
|
{{\"title\": \"<short reel title>\", \"segments\": [\"<line for moment 1>\", \
|
|
\"<line for moment 2>\", ... ]}}\n\
|
|
The \"segments\" array MUST have exactly {} items, one per moment in order.",
|
|
beats.len()
|
|
));
|
|
(SYSTEM_PROMPT.to_string(), user)
|
|
}
|
|
|
|
/// Build a richer (system, user) prompt pair for the agentic scripter. The
|
|
/// system prompt tells the model it may call read-only tools to ground each
|
|
/// line. The user message uses the same per-beat enumeration as
|
|
/// `build_script_messages` plus a GPS line per beat when available.
|
|
pub fn build_agentic_script_messages(meta: &ReelMeta, beats: &[PlannedBeat]) -> Vec<ChatMessage> {
|
|
let mut user = String::new();
|
|
user.push_str(&format!(
|
|
"This reel has {} moments surfaced as memories {}.\n\n",
|
|
beats.len(),
|
|
meta.span_phrase()
|
|
));
|
|
if !meta.years.is_empty() {
|
|
let years: Vec<String> = meta.years.iter().map(|y| y.to_string()).collect();
|
|
user.push_str(&format!("They span the years: {}.\n\n", years.join(", ")));
|
|
}
|
|
user.push_str("Moments, in the order they will appear:\n");
|
|
for (i, beat) in beats.iter().enumerate() {
|
|
user.push_str(&format!("\n[{}]", i + 1));
|
|
if let Some(date) = beat.date_label() {
|
|
user.push_str(&format!(" {date}"));
|
|
}
|
|
if beat.is_clip() {
|
|
user.push_str(" (a video clip)");
|
|
} else if beat.media.len() > 1 {
|
|
user.push_str(&format!(" (a burst of {} photos)", beat.media.len()));
|
|
}
|
|
if let Some((lat, lon)) = beat.gps {
|
|
user.push_str(&format!("\n GPS: {:.4}, {:.4}", lat, lon));
|
|
}
|
|
user.push('\n');
|
|
match (&beat.insight_title, &beat.insight_summary) {
|
|
(Some(t), Some(s)) if !s.trim().is_empty() => {
|
|
user.push_str(&format!(" Known context: {t} — {s}\n"));
|
|
}
|
|
(Some(t), _) => user.push_str(&format!(" Known context: {t}\n")),
|
|
(_, Some(s)) if !s.trim().is_empty() => {
|
|
user.push_str(&format!(" Known context: {s}\n"));
|
|
}
|
|
_ => user.push_str(" (no extra context — narrate plainly from the date)\n"),
|
|
}
|
|
}
|
|
user.push_str(&format!(
|
|
"\nReturn ONLY a JSON object, no prose or code fences, shaped exactly:\n\
|
|
{{\"title\": \"<short reel title>\", \"segments\": [\"<line for moment 1>\", \
|
|
\"<line for moment 2>\", ... ]}}\n\
|
|
The \"segments\" array MUST have exactly {} items, one per moment in order.",
|
|
beats.len()
|
|
));
|
|
|
|
vec![
|
|
ChatMessage::system(AGENTIC_SYSTEM_PROMPT.to_string()),
|
|
ChatMessage::user(user),
|
|
]
|
|
}
|
|
|
|
/// Parse the model's response into a script with exactly `n` lines. Tolerant of
|
|
/// code fences and surrounding prose, and of both `segments: [".."]` and
|
|
/// `segments: [{"narration": ".."}]` shapes. Missing/extra lines are padded or
|
|
/// truncated so the caller always gets `n` aligned to the segments.
|
|
pub fn parse_script_response(raw: &str, n: usize) -> ReelScript {
|
|
let fallback_line = "A moment worth remembering.";
|
|
let value = extract_json_object(raw);
|
|
|
|
let title = value
|
|
.as_ref()
|
|
.and_then(|v| v.get("title"))
|
|
.and_then(|t| t.as_str())
|
|
.map(clean_text)
|
|
.filter(|s| !s.is_empty())
|
|
.unwrap_or_else(|| "Memories".to_string());
|
|
|
|
let mut lines: Vec<String> = value
|
|
.as_ref()
|
|
.and_then(|v| v.get("segments"))
|
|
.and_then(|s| s.as_array())
|
|
.map(|arr| {
|
|
arr.iter()
|
|
.map(|item| {
|
|
let text = item
|
|
.as_str()
|
|
.map(|s| s.to_string())
|
|
.or_else(|| {
|
|
item.get("narration")
|
|
.and_then(|n| n.as_str())
|
|
.map(|s| s.to_string())
|
|
})
|
|
.unwrap_or_default();
|
|
clean_text(&text)
|
|
})
|
|
.collect()
|
|
})
|
|
.unwrap_or_default();
|
|
|
|
// Align to exactly n: drop extras, pad shortfalls with a neutral line so
|
|
// every photo still gets spoken audio.
|
|
lines.truncate(n);
|
|
while lines.len() < n {
|
|
lines.push(fallback_line.to_string());
|
|
}
|
|
for line in lines.iter_mut() {
|
|
if line.is_empty() {
|
|
*line = fallback_line.to_string();
|
|
}
|
|
}
|
|
|
|
ReelScript { title, lines }
|
|
}
|
|
|
|
/// Pull the first balanced top-level JSON object out of a possibly-noisy model
|
|
/// response (code fences, leading prose). Returns None if nothing parses.
|
|
fn extract_json_object(raw: &str) -> Option<serde_json::Value> {
|
|
// Fast path: the whole thing is valid JSON.
|
|
if let Ok(v) = serde_json::from_str::<serde_json::Value>(raw.trim()) {
|
|
return Some(v);
|
|
}
|
|
// Otherwise scan for the first '{' ... matching '}' span, ignoring braces
|
|
// inside strings.
|
|
let bytes = raw.as_bytes();
|
|
let start = raw.find('{')?;
|
|
let mut depth = 0i32;
|
|
let mut in_str = false;
|
|
let mut escaped = false;
|
|
for i in start..bytes.len() {
|
|
let c = bytes[i] as char;
|
|
if in_str {
|
|
if escaped {
|
|
escaped = false;
|
|
} else if c == '\\' {
|
|
escaped = true;
|
|
} else if c == '"' {
|
|
in_str = false;
|
|
}
|
|
continue;
|
|
}
|
|
match c {
|
|
'"' => in_str = true,
|
|
'{' => depth += 1,
|
|
'}' => {
|
|
depth -= 1;
|
|
if depth == 0 {
|
|
return serde_json::from_str(&raw[start..=i]).ok();
|
|
}
|
|
}
|
|
_ => {}
|
|
}
|
|
}
|
|
None
|
|
}
|
|
|
|
/// Tidy a single line of model output: trim it, strip any double quotes from
/// both ends, trim again, and collapse every run of whitespace to one space.
///
/// NOTE(review): quotes are stripped from each end independently, so a line
/// that legitimately ends in quoted speech loses its closing quote.
fn clean_text(s: &str) -> String {
    let core = s.trim().trim_matches('"').trim();

    let mut out = String::with_capacity(core.len());
    for word in core.split_whitespace() {
        if !out.is_empty() {
            out.push(' ');
        }
        out.push_str(word);
    }
    out
}
|
|
|
|
/// Generate the reel script via the LLM. Text-only (no images) — the per-beat
|
|
/// context comes from cached insights. The call takes the GPU read lease
|
|
/// internally (see `LlamaCppClient::generate`).
|
|
pub async fn generate_script(
|
|
client: &Arc<LlamaCppClient>,
|
|
meta: &ReelMeta,
|
|
beats: &[PlannedBeat],
|
|
) -> Result<ReelScript> {
|
|
let (system, user) = build_script_messages(meta, beats);
|
|
let raw = client
|
|
.generate(&user, Some(&system), None)
|
|
.await
|
|
.context("LLM script generation failed")?;
|
|
Ok(parse_script_response(&raw, beats.len()))
|
|
}
|
|
|
|
/// Agentic version of script generation: resolves the backend via the
|
|
/// InsightGenerator (honouring LLM_BACKEND, model overrides, etc.), builds
|
|
/// a read-only tool set, runs the tool loop, then parses the JSON response.
|
|
/// Returns the same ReelScript shape. On failure the caller may fall back to
|
|
/// `generate_script`.
|
|
pub async fn generate_script_agentic(
|
|
generator: &InsightGenerator,
|
|
meta: &ReelMeta,
|
|
beats: &[PlannedBeat],
|
|
) -> Result<ReelScript> {
|
|
// 1. Resolve the backend. Bail if the local model lacks tool-calling.
|
|
let backend = generator
|
|
.resolve_backend(
|
|
BackendKind::Local,
|
|
&SamplingOverrides {
|
|
model: None,
|
|
num_ctx: None,
|
|
temperature: None,
|
|
top_p: None,
|
|
top_k: None,
|
|
min_p: None,
|
|
},
|
|
)
|
|
.await
|
|
.context("resolving backend for agentic script")?;
|
|
|
|
// 2. Build the read-only tool set. Start from the persona gate (no
|
|
// persona context, so corrections are closed), force has_vision=false,
|
|
// then filter out write tools.
|
|
let gate = generator.current_gate_opts_for_persona(false, None);
|
|
let all_tools = InsightGenerator::build_tool_definitions(gate);
|
|
let read_only_names: std::collections::HashSet<&str> = [
|
|
"search_rag",
|
|
"search_messages",
|
|
"get_sms_messages",
|
|
"get_calendar_events",
|
|
"get_location_history",
|
|
"get_file_tags",
|
|
"get_faces_in_photo",
|
|
"reverse_geocode",
|
|
"get_personal_place_at",
|
|
"recall_entities",
|
|
"recall_facts_for_photo",
|
|
"recall_facts_for_entity",
|
|
"get_current_datetime",
|
|
]
|
|
.into_iter()
|
|
.collect();
|
|
let tools: Vec<Tool> = all_tools
|
|
.into_iter()
|
|
.filter(|t| read_only_names.contains(t.function.name.as_str()))
|
|
.collect();
|
|
|
|
// 3. Build the agentic prompt messages.
|
|
let messages = build_agentic_script_messages(meta, beats);
|
|
|
|
// 4. Run the tool loop.
|
|
let max_iter = reel_pregen_max_tool_iters();
|
|
let raw = generator
|
|
.run_readonly_tool_loop(&backend, messages, tools, max_iter)
|
|
.await
|
|
.context("agentic tool loop failed")?;
|
|
|
|
// 5. Strip any think-blocks the model may have emitted, then parse.
|
|
let raw = crate::ai::llm_client::strip_think_blocks(&raw);
|
|
Ok(parse_script_response(&raw, beats.len()))
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::super::SegmentMedia;
    use super::*;
    use crate::memories::MemoriesSpan;

    /// Reel metadata shared by the prompt tests: a day span over two years.
    fn meta() -> ReelMeta {
        ReelMeta {
            span: MemoriesSpan::Day,
            years: vec![2019, 2021],
        }
    }

    /// A photo in library 1 at the given relative path.
    fn photo(rel_path: &str) -> SegmentMedia {
        SegmentMedia::Photo {
            rel_path: rel_path.to_string(),
            library_id: 1,
        }
    }

    /// `n` single-photo beats on consecutive days with no insight or GPS.
    fn planned(n: usize) -> Vec<PlannedBeat> {
        let mut beats = Vec::with_capacity(n);
        for i in 0..n {
            beats.push(PlannedBeat {
                media: vec![photo(&format!("p{i}.jpg"))],
                date: Some(1_560_000_000 + i as i64 * 86_400),
                insight_title: None,
                insight_summary: None,
                gps: None,
            });
        }
        beats
    }

    #[test]
    fn prompt_states_exact_moment_count_and_span() {
        let (system, user) = build_script_messages(&meta(), &planned(3));
        assert!(system.contains("memory reel"));
        assert!(user.contains("3 moments"));
        assert!(user.contains("on this day"));
        assert!(user.contains("exactly 3 items"));
        // Each moment gets an indexed entry.
        for marker in ["[1]", "[2]", "[3]"] {
            assert!(user.contains(marker));
        }
    }

    #[test]
    fn prompt_notes_burst_photo_count() {
        let mut beats = planned(1);
        beats[0].media = vec![photo("a.jpg"), photo("b.jpg"), photo("c.jpg")];
        let (_system, user) = build_script_messages(&meta(), &beats);
        assert!(user.contains("a burst of 3 photos"));
    }

    #[test]
    fn prompt_marks_clip_beats() {
        let mut beats = planned(1);
        beats[0].media = vec![SegmentMedia::Clip {
            rel_path: "v.mp4".into(),
            library_id: 1,
        }];
        let (_system, user) = build_script_messages(&meta(), &beats);
        assert!(user.contains("a video clip"));
    }

    #[test]
    fn prompt_includes_insight_context_when_present() {
        let mut beats = planned(1);
        beats[0].insight_title = Some("Lake house weekend".into());
        beats[0].insight_summary = Some("Swimming with the dogs.".into());
        let (_system, user) = build_script_messages(&meta(), &beats);
        assert!(user.contains("Lake house weekend — Swimming with the dogs."));
    }

    #[test]
    fn agentic_prompt_is_a_system_user_pair() {
        let messages = build_agentic_script_messages(&meta(), &planned(2));
        assert_eq!(messages.len(), 2);
    }

    #[test]
    fn parse_plain_json_object() {
        let raw = r#"{"title":"Summer Days","segments":["First line.","Second line."]}"#;
        let script = parse_script_response(raw, 2);
        assert_eq!(script.title, "Summer Days");
        assert_eq!(script.lines, vec!["First line.", "Second line."]);
    }

    #[test]
    fn parse_tolerates_code_fences_and_prose() {
        let raw = "Sure! Here's your reel:\n```json\n{\"title\": \"Trip\", \"segments\": [\"A.\", \"B.\"]}\n```\nEnjoy!";
        let script = parse_script_response(raw, 2);
        assert_eq!(script.title, "Trip");
        assert_eq!(script.lines, vec!["A.", "B."]);
    }

    #[test]
    fn parse_ignores_braces_inside_string_values() {
        // Braces inside JSON strings must not confuse the balanced-span scan.
        let raw = "Note: {\"title\":\"a } b\",\"segments\":[\"x {y}\"]} done";
        let script = parse_script_response(raw, 1);
        assert_eq!(script.title, "a } b");
        assert_eq!(script.lines, vec!["x {y}"]);
    }

    #[test]
    fn parse_accepts_object_segment_shape() {
        let raw = r#"{"title":"T","segments":[{"narration":"One."},{"narration":"Two."}]}"#;
        let script = parse_script_response(raw, 2);
        assert_eq!(script.lines, vec!["One.", "Two."]);
    }

    #[test]
    fn parse_pads_short_and_truncates_long_to_n() {
        // Model returned 1 line but we have 3 segments → pad with neutral lines.
        let short = parse_script_response(r#"{"title":"T","segments":["Only one."]}"#, 3);
        assert_eq!(short.lines.len(), 3);
        assert_eq!(short.lines[0], "Only one.");
        assert!(!short.lines[1].is_empty());

        // Model returned 3 but we have 2 → truncate.
        let long = parse_script_response(r#"{"title":"T","segments":["a","b","c"]}"#, 2);
        assert_eq!(long.lines, vec!["a", "b"]);
    }

    #[test]
    fn parse_falls_back_on_garbage() {
        let script = parse_script_response("the model said no", 2);
        assert_eq!(script.title, "Memories");
        assert_eq!(script.lines.len(), 2);
        assert!(script.lines.iter().all(|line| !line.is_empty()));
    }

    #[test]
    fn parse_blank_line_replaced_with_fallback() {
        let script = parse_script_response(r#"{"title":"T","segments":[" ","Real."]}"#, 2);
        assert!(!script.lines[0].is_empty());
        assert_eq!(script.lines[1], "Real.");
    }

    #[test]
    fn clean_text_strips_wrapping_quotes_and_collapses_whitespace() {
        assert_eq!(clean_text("  \"Quoted   line.\" "), "Quoted line.");
        assert_eq!(clean_text("a\tb\nc"), "a b c");
        assert_eq!(clean_text("   "), "");
    }
}
|