//! Re-embed stored corpora through `LocalLlm`, i.e. the same
//! `LLM_BACKEND` dispatch the query side uses. The original import /
//! backfill tools always embedded via Ollama, so a deploy running
//! `LLM_BACKEND=llamacpp` queries vector spaces the corpora may not live
//! in. Four tables share the problem and are all covered here:
//!
//! - `daily_conversation_summaries` — re-embeds
//!   `strip_summary_boilerplate(summary)` (what the original job fed the
//!   embedder); also rewrites `model_version`.
//! - `calendar_events` — re-embeds "summary description location" exactly
//!   as `import_calendar` does; rows without an embedding are skipped (the
//!   import only embeds under `--generate-embeddings`).
//! - `search_history` — re-embeds the raw query text.
//! - `entities` (knowledge graph) — re-embeds "name description" exactly as
//!   `tool_store_entity` does; embedding-less rows are skipped (embedding
//!   is best-effort at store time).
//!
//! Source text is untouched — only vectors are rewritten. The old↔new
//! cosine report doubles as a diagnostic: ~1.0 means both backends already
//! shared a space (re-embedding was a no-op); low values confirm the
//! mismatch this tool exists to fix.
use anyhow::{Context, Result}; use clap::Parser; use diesel::prelude::*; use diesel::sql_query; use diesel::sqlite::SqliteConnection; use image_api::ai::{LocalLlm, strip_summary_boilerplate}; use image_api::bin_progress; use std::env; #[derive(Parser, Debug)] #[command(author, version, about = "Re-embed stored corpora via the configured LLM_BACKEND", long_about = None)] struct Args { /// Comma-separated tables to process: summaries, calendar, search, entities #[arg(long, default_value = "summaries,calendar,search,entities")] tables: String, /// Only process the first N rows per table (smoke test) #[arg(long)] limit: Option, /// Compute embeddings and report old↔new similarity without writing #[arg(long, default_value_t = false)] dry_run: bool, } #[derive(QueryableByName)] struct SummaryRow { #[diesel(sql_type = diesel::sql_types::Integer)] id: i32, #[diesel(sql_type = diesel::sql_types::Text)] summary: String, #[diesel(sql_type = diesel::sql_types::Binary)] embedding: Vec, #[diesel(sql_type = diesel::sql_types::Text)] model_version: String, } #[derive(QueryableByName)] struct CalendarRow { #[diesel(sql_type = diesel::sql_types::Integer)] id: i32, #[diesel(sql_type = diesel::sql_types::Text)] summary: String, #[diesel(sql_type = diesel::sql_types::Nullable)] description: Option, #[diesel(sql_type = diesel::sql_types::Nullable)] location: Option, #[diesel(sql_type = diesel::sql_types::Binary)] embedding: Vec, } #[derive(QueryableByName)] struct SearchRow { #[diesel(sql_type = diesel::sql_types::BigInt)] id: i64, #[diesel(sql_type = diesel::sql_types::Text)] query: String, #[diesel(sql_type = diesel::sql_types::Binary)] embedding: Vec, } #[derive(QueryableByName)] struct EntityRow { #[diesel(sql_type = diesel::sql_types::Integer)] id: i32, #[diesel(sql_type = diesel::sql_types::Text)] name: String, #[diesel(sql_type = diesel::sql_types::Text)] description: String, #[diesel(sql_type = diesel::sql_types::Binary)] embedding: Vec, } /// One unit of re-embed work, 
normalized across tables. struct WorkItem { /// Row key, as i64 so both i32 ids and rowids fit. id: i64, /// Text fed to the embedder — must match what the original writer used. text: String, /// Existing vector bytes, for the old↔new similarity report. old_embedding: Vec, } fn deserialize_vector(bytes: &[u8]) -> Option> { if !bytes.len().is_multiple_of(4) { return None; } Some( bytes .chunks_exact(4) .map(|c| f32::from_le_bytes([c[0], c[1], c[2], c[3]])) .collect(), ) } fn serialize_vector(vec: &[f32]) -> Vec { vec.iter().flat_map(|f| f.to_le_bytes()).collect() } fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 { if a.len() != b.len() { return 0.0; } let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum(); let mag_a: f32 = a.iter().map(|x| x * x).sum::().sqrt(); let mag_b: f32 = b.iter().map(|x| x * x).sum::().sqrt(); if mag_a == 0.0 || mag_b == 0.0 { return 0.0; } dot / (mag_a * mag_b) } /// Embed `text`, halving it on "input too large" errors until it fits the /// server's physical batch (`--ubatch-size`). Mirrors the silent truncation /// Ollama applied when these corpora were first embedded — llama-server /// returns a 500 instead — except here it's surfaced via the returned flag. /// Returns `(embedding, truncated)`. async fn embed_with_truncation(llm: &LocalLlm, text: &str) -> Result<(Vec, bool)> { let mut text = text.to_string(); let mut truncated = false; loop { match llm.embed_document(&text).await { Ok(emb) => return Ok((emb, truncated)), Err(e) if e.to_string().contains("too large to process") && text.chars().count() > 64 => { let keep = text.chars().count() / 2; text = text.chars().take(keep).collect(); truncated = true; } Err(e) => return Err(e), } } } /// Re-embed `items`, writing each new vector via `update`. Returns the /// old↔new cosines for the similarity report. 
async fn reembed_table( conn: &mut SqliteConnection, llm: &LocalLlm, label: &str, items: Vec, dry_run: bool, update: impl Fn(&mut SqliteConnection, i64, Vec) -> Result<()>, ) -> Result> { println!("\n[{}] re-embedding {} rows...", label, items.len()); let pb = bin_progress::determinate(items.len() as u64, format!("re-embedding {}", label)); let mut sims: Vec = Vec::with_capacity(items.len()); let mut updated = 0usize; let mut failed = 0usize; let mut truncated_count = 0usize; for item in &items { let new_emb = match embed_with_truncation(llm, &item.text).await { Ok((e, truncated)) => { if truncated { truncated_count += 1; pb.println(format!( "⚠ {} id={}: input exceeded the embed server's batch size, \ truncated before embedding", label, item.id )); } e } Err(e) => { pb.inc(1); failed += 1; eprintln!("✗ {} id={}: {}", label, item.id, e); continue; } }; // The whole pipeline (DAO checks, stored corpora) assumes // EMBEDDING_DIM dims. A mismatch means the active embed slot is not // serving the configured model — stop rather than corrupt the table. 
anyhow::ensure!( new_emb.len() == image_api::ai::embedding_dim(), "backend returned {}-dim embedding (expected {}) — '{}' does not \ match the configured EMBEDDING_DIM", new_emb.len(), image_api::ai::embedding_dim(), llm.embedding_model_version() ); if let Some(old_emb) = deserialize_vector(&item.old_embedding) { sims.push(cosine_similarity(&old_emb, &new_emb)); } if !dry_run { update(conn, item.id, serialize_vector(&new_emb)) .with_context(|| format!("updating {} id={}", label, item.id))?; } updated += 1; pb.inc(1); } pb.finish_and_clear(); println!( "[{}] {} re-embedded ({} truncated), {} failed", label, updated, truncated_count, failed ); Ok(sims) } fn report_similarity(label: &str, mut sims: Vec) { if sims.is_empty() { println!("[{}] no old↔new pairs to compare", label); return; } sims.sort_by(|a, b| a.partial_cmp(b).unwrap_or(std::cmp::Ordering::Equal)); let mean: f32 = sims.iter().sum::() / sims.len() as f32; let median = sims[sims.len() / 2]; println!( "[{}] old↔new cosine over identical text: min={:.3} median={:.3} mean={:.3} max={:.3}", label, sims.first().unwrap(), median, mean, sims.last().unwrap() ); if median > 0.98 { println!( "[{}] → old and new backends agree (~same vector space); poor search \ results are coming from something else (prefixes, thresholds, corpus).", label ); } else if median > 0.9 { println!( "[{}] → same model family but measurably different vectors \ (quantization / runtime drift); re-embedding was worthwhile.", label ); } else { println!( "[{}] → vector-space mismatch confirmed — queries were searching a \ different space than the corpus. 
This re-embed should fix it.", label ); } } #[tokio::main] async fn main() -> Result<()> { dotenv::dotenv().ok(); env_logger::init(); let args = Args::parse(); let tables: Vec<&str> = args.tables.split(',').map(|t| t.trim()).collect(); for t in &tables { anyhow::ensure!( matches!(*t, "summaries" | "calendar" | "search" | "entities"), "unknown table '{}' — expected summaries, calendar, search, entities", t ); } let database_url = env::var("DATABASE_URL").unwrap_or_else(|_| "auth.db".to_string()); println!("Database: {}", database_url); let mut conn = SqliteConnection::establish(&database_url) .with_context(|| format!("connecting to {}", database_url))?; let llm = LocalLlm::from_env(); let model_version = llm.embedding_model_version(); println!("Embedding via '{}'", model_version); if args.dry_run { println!("DRY RUN — no rows will be written"); } if tables.contains(&"summaries") { let mut rows: Vec = sql_query( "SELECT id, summary, embedding, model_version FROM daily_conversation_summaries ORDER BY date", ) .load(&mut conn) .context("loading daily summaries")?; if let Some(limit) = args.limit { rows.truncate(limit); } if let Some(first) = rows.first() { println!( "\n[summaries] previous model_version '{}' → '{}'", first.model_version, model_version ); } let items = rows .into_iter() .map(|r| WorkItem { id: r.id as i64, text: strip_summary_boilerplate(&r.summary), old_embedding: r.embedding, }) .collect(); let mv = model_version.clone(); let sims = reembed_table( &mut conn, &llm, "summaries", items, args.dry_run, move |conn, id, emb| { sql_query( "UPDATE daily_conversation_summaries SET embedding = ?1, model_version = ?2 WHERE id = ?3", ) .bind::(emb) .bind::(&mv) .bind::(id as i32) .execute(conn)?; Ok(()) }, ) .await?; report_similarity("summaries", sims); } if tables.contains(&"calendar") { let mut rows: Vec = sql_query( "SELECT id, summary, description, location, embedding FROM calendar_events WHERE embedding IS NOT NULL ORDER BY id", ) .load(&mut conn) 
.context("loading calendar events")?; if let Some(limit) = args.limit { rows.truncate(limit); } let items = rows .into_iter() .map(|r| WorkItem { id: r.id as i64, // Same text construction as import_calendar. text: format!( "{} {} {}", r.summary, r.description.as_deref().unwrap_or(""), r.location.as_deref().unwrap_or("") ), old_embedding: r.embedding, }) .collect(); let sims = reembed_table( &mut conn, &llm, "calendar", items, args.dry_run, |conn, id, emb| { sql_query("UPDATE calendar_events SET embedding = ?1 WHERE id = ?2") .bind::(emb) .bind::(id as i32) .execute(conn)?; Ok(()) }, ) .await?; report_similarity("calendar", sims); } if tables.contains(&"search") { let mut rows: Vec = sql_query( "SELECT rowid AS id, query, embedding FROM search_history ORDER BY rowid", ) .load(&mut conn) .context("loading search history")?; if let Some(limit) = args.limit { rows.truncate(limit); } let items = rows .into_iter() .map(|r| WorkItem { id: r.id, text: r.query, old_embedding: r.embedding, }) .collect(); let sims = reembed_table( &mut conn, &llm, "search", items, args.dry_run, |conn, id, emb| { sql_query("UPDATE search_history SET embedding = ?1 WHERE rowid = ?2") .bind::(emb) .bind::(id) .execute(conn)?; Ok(()) }, ) .await?; report_similarity("search", sims); } if tables.contains(&"entities") { let mut rows: Vec = sql_query( "SELECT id, name, description, embedding FROM entities WHERE embedding IS NOT NULL ORDER BY id", ) .load(&mut conn) .context("loading knowledge entities")?; if let Some(limit) = args.limit { rows.truncate(limit); } let items = rows .into_iter() .map(|r| WorkItem { id: r.id as i64, // Same text construction as tool_store_entity. 
text: format!("{} {}", r.name, r.description), old_embedding: r.embedding, }) .collect(); let sims = reembed_table( &mut conn, &llm, "entities", items, args.dry_run, |conn, id, emb| { sql_query("UPDATE entities SET embedding = ?1 WHERE id = ?2") .bind::(emb) .bind::(id as i32) .execute(conn)?; Ok(()) }, ) .await?; report_similarity("entities", sims); } println!( "\n{}", if args.dry_run { "Dry run complete" } else { "Done" } ); Ok(()) }