Fix RAG vector-space mismatch and search_rag retrieval quality

Queries embedded via llama-swap were searching corpora embedded via Ollama (measured: spaces diverged). Introduce LocalLlm — the local Ollama + llama-swap pair with LLM_BACKEND dispatch baked in — and route all embedding writers through it; anything embedding via a concrete client reintroduces the bug.

- search_rag: embed the model's query verbatim (no metadata boilerplate), make date optional — no time-decay when omitted, so "when did X happen?" queries rank purely by similarity across all time
- reembed_embeddings bin: re-embed summaries / calendar / search / knowledge entities via the active backend, with old-new cosine report per table and truncate-and-retry for inputs over the embed server's physical batch size
- import_calendar, import_search_history: embed through LocalLlm
- search_messages / get_sms_messages: render sender → recipient so sent messages are attributable to a conversation
- insight job failures: store the one-line anyhow context chain ({:#}) instead of the Debug dump the client was shown verbatim
- serialize env_dispatch tests behind a lock (parallel-runner flake)

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-11 19:06:52 -04:00
parent 0accc4ef2f
commit a022a3d15d
8 changed files with 738 additions and 99 deletions
@@ -0,0 +1,464 @@
+//! Re-embed stored corpora through `LocalLlm`, i.e. the same
+//! `LLM_BACKEND` dispatch the query side uses. The original import /
+//! backfill tools always embedded via Ollama, so a deploy running
+//! `LLM_BACKEND=llamacpp` queries vector spaces the corpora may not live
+//! in. Four tables share the problem and are all covered here:
+//!
+//! - `daily_conversation_summaries` — re-embeds
+//!   `strip_summary_boilerplate(summary)` (what the original job fed the
+//!   embedder); also rewrites `model_version`.
+//! - `calendar_events` — re-embeds "summary description location" exactly
+//!   as `import_calendar` does; rows without an embedding are skipped (the
+//!   import only embeds under `--generate-embeddings`).
+//! - `search_history` — re-embeds the raw query text.
+//! - `entities` (knowledge graph) — re-embeds "name description" exactly as
+//!   `tool_store_entity` does; embedding-less rows are skipped (embedding
+//!   is best-effort at store time).
+//!
+//! Source text is untouched — only vectors are rewritten. The old↔new
+//! cosine report doubles as a diagnostic: ~1.0 means both backends already
+//! shared a space (re-embedding was a no-op); low values confirm the
+//! mismatch this tool exists to fix.
+
+use anyhow::{Context, Result};
+use clap::Parser;
+use diesel::prelude::*;
+use diesel::sql_query;
+use diesel::sqlite::SqliteConnection;
+use image_api::ai::{LocalLlm, strip_summary_boilerplate};
+use image_api::bin_progress;
+use std::env;
+
+// Command-line options for the re-embed tool. clap turns the `///` comments
+// on the fields into `--help` text, so they are user-facing and kept verbatim;
+// maintainer notes use `//` instead.
+#[derive(Parser, Debug)]
+#[command(author, version, about = "Re-embed stored corpora via the configured LLM_BACKEND", long_about = None)]
+struct Args {
+    /// Comma-separated tables to process: summaries, calendar, search, entities
+    #[arg(long, default_value = "summaries,calendar,search,entities")]
+    tables: String, // split on ',' and validated against the four known names in `main`
+
+    /// Only process the first N rows per table (smoke test)
+    #[arg(long)]
+    limit: Option<usize>, // applied with `Vec::truncate` after each table has been loaded
+
+    /// Compute embeddings and report old↔new similarity without writing
+    #[arg(long, default_value_t = false)]
+    dry_run: bool, // when set, `reembed_table` never calls its `update` closure
+}
+
+/// Row shape for the raw `daily_conversation_summaries` query issued in `main`.
+#[derive(QueryableByName)]
+struct SummaryRow {
+    /// Row id; used as the key of the write-back `UPDATE ... WHERE id = ?`.
+    #[diesel(sql_type = diesel::sql_types::Integer)]
+    id: i32,
+    /// Stored summary text; `main` passes it through
+    /// `strip_summary_boilerplate` before embedding.
+    #[diesel(sql_type = diesel::sql_types::Text)]
+    summary: String,
+    /// Existing vector bytes, kept only for the old↔new similarity report.
+    // NOTE(review): mapped as non-nullable and the query has no
+    // `embedding IS NOT NULL` filter — presumably the column is NOT NULL;
+    // confirm against the schema.
+    #[diesel(sql_type = diesel::sql_types::Binary)]
+    embedding: Vec<u8>,
+    /// Model label currently stored on the row; `main` prints the first row's
+    /// value before it is overwritten.
+    #[diesel(sql_type = diesel::sql_types::Text)]
+    model_version: String,
+}
+
+/// Row shape for the raw `calendar_events` query issued in `main`, which only
+/// selects rows whose `embedding IS NOT NULL`.
+#[derive(QueryableByName)]
+struct CalendarRow {
+    /// Row id; used as the key of the write-back `UPDATE ... WHERE id = ?`.
+    #[diesel(sql_type = diesel::sql_types::Integer)]
+    id: i32,
+    /// Event summary; first component of the embedded text.
+    #[diesel(sql_type = diesel::sql_types::Text)]
+    summary: String,
+    /// Optional description; `main` substitutes an empty string when absent.
+    #[diesel(sql_type = diesel::sql_types::Nullable<diesel::sql_types::Text>)]
+    description: Option<String>,
+    /// Optional location; `main` substitutes an empty string when absent.
+    #[diesel(sql_type = diesel::sql_types::Nullable<diesel::sql_types::Text>)]
+    location: Option<String>,
+    /// Existing vector bytes, kept only for the old↔new similarity report.
+    #[diesel(sql_type = diesel::sql_types::Binary)]
+    embedding: Vec<u8>,
+}
+
+/// Row shape for the raw `search_history` query issued in `main`.
+#[derive(QueryableByName)]
+struct SearchRow {
+    /// SQLite `rowid`, selected as `rowid AS id`; hence `BigInt` / `i64`
+    /// rather than the `Integer` / `i32` ids of the other row types.
+    #[diesel(sql_type = diesel::sql_types::BigInt)]
+    id: i64,
+    /// Raw query text; embedded as-is.
+    #[diesel(sql_type = diesel::sql_types::Text)]
+    query: String,
+    /// Existing vector bytes, kept only for the old↔new similarity report.
+    // NOTE(review): mapped as non-nullable and the query has no
+    // `embedding IS NOT NULL` filter — presumably the column is NOT NULL;
+    // confirm against the schema.
+    #[diesel(sql_type = diesel::sql_types::Binary)]
+    embedding: Vec<u8>,
+}
+
+/// Row shape for the raw `entities` (knowledge graph) query issued in `main`,
+/// which only selects rows whose `embedding IS NOT NULL`.
+#[derive(QueryableByName)]
+struct EntityRow {
+    /// Row id; used as the key of the write-back `UPDATE ... WHERE id = ?`.
+    #[diesel(sql_type = diesel::sql_types::Integer)]
+    id: i32,
+    /// Entity name; first half of the embedded "name description" text.
+    #[diesel(sql_type = diesel::sql_types::Text)]
+    name: String,
+    /// Entity description; second half of the embedded text.
+    // NOTE(review): mapped as non-nullable `Text` — presumably the column is
+    // NOT NULL; a NULL description would fail the load. Confirm against the
+    // schema.
+    #[diesel(sql_type = diesel::sql_types::Text)]
+    description: String,
+    /// Existing vector bytes, kept only for the old↔new similarity report.
+    #[diesel(sql_type = diesel::sql_types::Binary)]
+    embedding: Vec<u8>,
+}
+
+/// One unit of re-embed work, normalized across tables.
+///
+/// `main` builds these from the per-table row structs so that
+/// `reembed_table` can run the same loop regardless of the source table.
+struct WorkItem {
+    /// Row key, as i64 so both i32 ids and rowids fit. Handed back unchanged
+    /// to the table's `update` closure.
+    id: i64,
+    /// Text fed to the embedder — must match what the original writer used.
+    text: String,
+    /// Existing vector bytes, for the old↔new similarity report. Decoded with
+    /// `deserialize_vector` (little-endian `f32`s); never written back.
+    old_embedding: Vec<u8>,
+}
+
+/// Decode a blob of packed little-endian `f32` values.
+///
+/// Returns `None` when the byte length is not a whole number of 4-byte
+/// floats; an empty blob decodes to `Some` of an empty vector.
+fn deserialize_vector(bytes: &[u8]) -> Option<Vec<f32>> {
+    const WIDTH: usize = std::mem::size_of::<f32>();
+
+    bytes.len().is_multiple_of(WIDTH).then(|| {
+        bytes
+            .chunks_exact(WIDTH)
+            .map(|quad| {
+                // `chunks_exact` guarantees `quad.len() == WIDTH`, so the copy
+                // cannot panic.
+                let mut raw = [0u8; WIDTH];
+                raw.copy_from_slice(quad);
+                f32::from_le_bytes(raw)
+            })
+            .collect()
+    })
+}
+
+/// Encode a vector as packed little-endian `f32` bytes (4 bytes per element),
+/// the inverse of `deserialize_vector`.
+fn serialize_vector(vec: &[f32]) -> Vec<u8> {
+    let mut bytes = Vec::with_capacity(vec.len() * std::mem::size_of::<f32>());
+    for value in vec {
+        bytes.extend_from_slice(&value.to_le_bytes());
+    }
+    bytes
+}
+
+/// Cosine of the angle between `a` and `b`.
+///
+/// Returns `0.0` when the slices differ in length or when either has zero
+/// magnitude (which includes two empty slices).
+fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
+    // Euclidean norm of a slice.
+    let norm = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();
+
+    if a.len() != b.len() {
+        return 0.0;
+    }
+
+    let dot = a.iter().zip(b).map(|(x, y)| x * y).sum::<f32>();
+    let (norm_a, norm_b) = (norm(a), norm(b));
+
+    if norm_a == 0.0 || norm_b == 0.0 {
+        0.0
+    } else {
+        dot / (norm_a * norm_b)
+    }
+}
+
+/// Embed `text`, halving it on "input too large" errors until it fits the
+/// server's physical batch (`--ubatch-size`). Mirrors the silent truncation
+/// Ollama applied when these corpora were first embedded — llama-server
+/// returns a 500 instead — except here it's surfaced via the returned flag.
+///
+/// Returns `(embedding, truncated)`, where `truncated` is `true` if the input
+/// had to be shortened at least once.
+///
+/// # Errors
+///
+/// Returns the embed error unchanged when it is not a "too large to process"
+/// error, or when the text is already at or below the 64-character floor and
+/// still does not fit.
+async fn embed_with_truncation(llm: &LocalLlm, text: &str) -> Result<(Vec<f32>, bool)> {
+    // Stop halving once the text is this short; an error at this size is
+    // reported rather than retried.
+    const MIN_CHARS: usize = 64;
+
+    let mut text = text.to_string();
+    let mut truncated = false;
+    loop {
+        let err = match llm.embed(&text).await {
+            Ok(emb) => return Ok((emb, truncated)),
+            Err(e) => e,
+        };
+
+        // `{:#}` renders the whole anyhow context chain on one line, so the
+        // server's message is still found when the client wrapped it in
+        // additional context; plain `to_string()` shows only the outermost
+        // message.
+        let too_large = format!("{:#}", err).contains("too large to process");
+        let char_count = text.chars().count();
+        if !too_large || char_count <= MIN_CHARS {
+            return Err(err);
+        }
+
+        // Halve by characters, not bytes, so a multi-byte code point is
+        // never split.
+        text = text.chars().take(char_count / 2).collect();
+        truncated = true;
+    }
+}
+
+/// Re-embed `items`, writing each new vector via `update`. Returns the
+/// old↔new cosines for the similarity report.
+///
+/// * `conn` — connection handed to `update` for every successfully embedded row.
+/// * `llm` — embedding backend; inputs it rejects as too large are halved and
+///   retried by `embed_with_truncation`.
+/// * `label` — table name used in progress and log output.
+/// * `dry_run` — when `true`, embeddings are still computed and compared (and
+///   counted as re-embedded in the summary line) but `update` is never called.
+/// * `update` — persists the serialized vector for the given row id.
+///
+/// A row whose embedding fails is logged, counted as failed and skipped.
+///
+/// # Errors
+///
+/// Aborts on the first embedding whose dimension is not 768, and on the first
+/// `update` failure. Rows written before that point stay written.
+async fn reembed_table(
+    conn: &mut SqliteConnection,
+    llm: &LocalLlm,
+    label: &str,
+    items: Vec<WorkItem>,
+    dry_run: bool,
+    update: impl Fn(&mut SqliteConnection, i64, Vec<u8>) -> Result<()>,
+) -> Result<Vec<f32>> {
+    // The whole pipeline (DAO checks, stored corpora) assumes 768 dims.
+    const EXPECTED_DIM: usize = 768;
+
+    println!("\n[{}] re-embedding {} rows...", label, items.len());
+    let pb = bin_progress::determinate(items.len() as u64, format!("re-embedding {}", label));
+
+    let mut sims: Vec<f32> = Vec::with_capacity(items.len());
+    let mut updated = 0usize;
+    let mut failed = 0usize;
+    let mut truncated_count = 0usize;
+
+    for item in &items {
+        let new_emb = match embed_with_truncation(llm, &item.text).await {
+            Ok((e, truncated)) => {
+                if truncated {
+                    truncated_count += 1;
+                    pb.println(format!(
+                        "⚠ {} id={}: input exceeded the embed server's batch size, \
+                         truncated before embedding",
+                        label, item.id
+                    ));
+                }
+                e
+            }
+            Err(e) => {
+                pb.inc(1);
+                failed += 1;
+                // `{:#}` prints the full anyhow context chain on one line, so
+                // the root cause is visible, not just the outermost message.
+                eprintln!("✗ {} id={}: {:#}", label, item.id, e);
+                continue;
+            }
+        };
+
+        // A different dim means the active backend is not serving a
+        // nomic-compatible model — stop rather than corrupt the table.
+        anyhow::ensure!(
+            new_emb.len() == EXPECTED_DIM,
+            "backend returned {}-dim embedding (expected {}) — '{}' is not \
+             serving a nomic-embed-text-v1.5-compatible model",
+            new_emb.len(),
+            EXPECTED_DIM,
+            llm.embedding_model_version()
+        );
+
+        // An old blob that does not decode (length not a multiple of 4) is
+        // left out of the report.
+        if let Some(old_emb) = deserialize_vector(&item.old_embedding) {
+            sims.push(cosine_similarity(&old_emb, &new_emb));
+        }
+
+        if !dry_run {
+            update(conn, item.id, serialize_vector(&new_emb))
+                .with_context(|| format!("updating {} id={}", label, item.id))?;
+        }
+        updated += 1;
+        pb.inc(1);
+    }
+    pb.finish_and_clear();
+
+    println!(
+        "[{}] {} re-embedded ({} truncated), {} failed",
+        label, updated, truncated_count, failed
+    );
+    Ok(sims)
+}
+
+/// Print min / median / mean / max of the old↔new cosine similarities for
+/// `label`, followed by a one-line interpretation keyed on the median:
+/// above 0.98 the backends agree, above 0.9 they drift within a model family,
+/// otherwise the vector spaces differ.
+///
+/// Prints a "no pairs" notice and returns when `sims` is empty.
+fn report_similarity(label: &str, mut sims: Vec<f32>) {
+    if sims.is_empty() {
+        println!("[{}] no old↔new pairs to compare", label);
+        return;
+    }
+
+    // `total_cmp` is a genuine total order even when a NaN is present;
+    // `partial_cmp(..).unwrap_or(Equal)` is not, and `sort_by` may panic on an
+    // inconsistent comparator.
+    sims.sort_by(f32::total_cmp);
+
+    let count = sims.len();
+    let mean: f32 = sims.iter().sum::<f32>() / count as f32;
+    let mid = count / 2;
+    // For an even count the median is the mean of the two middle values.
+    let median = if count.is_multiple_of(2) {
+        (sims[mid - 1] + sims[mid]) / 2.0
+    } else {
+        sims[mid]
+    };
+
+    println!(
+        "[{}] old↔new cosine over identical text: min={:.3} median={:.3} mean={:.3} max={:.3}",
+        label,
+        sims[0],
+        median,
+        mean,
+        sims[count - 1]
+    );
+    if median > 0.98 {
+        println!(
+            "[{}] → old and new backends agree (~same vector space); poor search \
+             results are coming from something else (prefixes, thresholds, corpus).",
+            label
+        );
+    } else if median > 0.9 {
+        println!(
+            "[{}] → same model family but measurably different vectors \
+             (quantization / runtime drift); re-embedding was worthwhile.",
+            label
+        );
+    } else {
+        println!(
+            "[{}] → vector-space mismatch confirmed — queries were searching a \
+             different space than the corpus. This re-embed should fix it.",
+            label
+        );
+    }
+}
+
+/// Entry point: parse the CLI, connect to the SQLite database, then for each
+/// selected table load its rows, re-embed them through `LocalLlm` and print
+/// the old↔new similarity report.
+///
+/// Tables run in a fixed order (summaries, calendar, search, entities)
+/// regardless of the order given in `--tables`. Any error aborts the run;
+/// there is no surrounding transaction, so tables and rows already written
+/// stay written.
+#[tokio::main]
+async fn main() -> Result<()> {
+    dotenv::dotenv().ok();
+    env_logger::init();
+    let args = Args::parse();
+
+    // Reject unknown table names up front, before touching the database.
+    let tables: Vec<&str> = args.tables.split(',').map(|t| t.trim()).collect();
+    for t in &tables {
+        anyhow::ensure!(
+            matches!(*t, "summaries" | "calendar" | "search" | "entities"),
+            "unknown table '{}' — expected summaries, calendar, search, entities",
+            t
+        );
+    }
+
+    // Falls back to `auth.db` when DATABASE_URL is unset.
+    let database_url = env::var("DATABASE_URL").unwrap_or_else(|_| "auth.db".to_string());
+    println!("Database: {}", database_url);
+
+    let mut conn = SqliteConnection::establish(&database_url)
+        .with_context(|| format!("connecting to {}", database_url))?;
+
+    let llm = LocalLlm::from_env();
+    let model_version = llm.embedding_model_version();
+    println!("Embedding via '{}'", model_version);
+    if args.dry_run {
+        println!("DRY RUN — no rows will be written");
+    }
+
+    // --- daily_conversation_summaries: also rewrites `model_version` ---
+    if tables.contains(&"summaries") {
+        let mut rows: Vec<SummaryRow> = sql_query(
+            "SELECT id, summary, embedding, model_version
+             FROM daily_conversation_summaries ORDER BY date",
+        )
+        .load(&mut conn)
+        .context("loading daily summaries")?;
+        // `--limit` is applied after the full table has been loaded.
+        if let Some(limit) = args.limit {
+            rows.truncate(limit);
+        }
+        // Only the first row's stored label is shown as the "previous" model.
+        if let Some(first) = rows.first() {
+            println!(
+                "\n[summaries] previous model_version '{}' → '{}'",
+                first.model_version, model_version
+            );
+        }
+        let items = rows
+            .into_iter()
+            .map(|r| WorkItem {
+                id: r.id as i64,
+                text: strip_summary_boilerplate(&r.summary),
+                old_embedding: r.embedding,
+            })
+            .collect();
+        // Owned copy so the `move` closure below can carry the label.
+        let mv = model_version.clone();
+        let sims = reembed_table(
+            &mut conn,
+            &llm,
+            "summaries",
+            items,
+            args.dry_run,
+            move |conn, id, emb| {
+                sql_query(
+                    "UPDATE daily_conversation_summaries
+                     SET embedding = ?1, model_version = ?2 WHERE id = ?3",
+                )
+                .bind::<diesel::sql_types::Binary, _>(emb)
+                .bind::<diesel::sql_types::Text, _>(&mv)
+                // `id` was widened from the row's i32 above, so narrowing it
+                // back is lossless.
+                .bind::<diesel::sql_types::Integer, _>(id as i32)
+                .execute(conn)?;
+                Ok(())
+            },
+        )
+        .await?;
+        report_similarity("summaries", sims);
+    }
+
+    // --- calendar_events: only rows that already have an embedding ---
+    if tables.contains(&"calendar") {
+        let mut rows: Vec<CalendarRow> = sql_query(
+            "SELECT id, summary, description, location, embedding
+             FROM calendar_events WHERE embedding IS NOT NULL ORDER BY id",
+        )
+        .load(&mut conn)
+        .context("loading calendar events")?;
+        if let Some(limit) = args.limit {
+            rows.truncate(limit);
+        }
+        let items = rows
+            .into_iter()
+            .map(|r| WorkItem {
+                id: r.id as i64,
+                // Same text construction as import_calendar.
+                // Missing description / location become empty strings, so the
+                // separating spaces are still emitted.
+                text: format!(
+                    "{} {} {}",
+                    r.summary,
+                    r.description.as_deref().unwrap_or(""),
+                    r.location.as_deref().unwrap_or("")
+                ),
+                old_embedding: r.embedding,
+            })
+            .collect();
+        let sims = reembed_table(
+            &mut conn,
+            &llm,
+            "calendar",
+            items,
+            args.dry_run,
+            |conn, id, emb| {
+                sql_query("UPDATE calendar_events SET embedding = ?1 WHERE id = ?2")
+                    .bind::<diesel::sql_types::Binary, _>(emb)
+                    .bind::<diesel::sql_types::Integer, _>(id as i32)
+                    .execute(conn)?;
+                Ok(())
+            },
+        )
+        .await?;
+        report_similarity("calendar", sims);
+    }
+
+    // --- search_history: keyed by SQLite rowid rather than an id column ---
+    if tables.contains(&"search") {
+        let mut rows: Vec<SearchRow> = sql_query(
+            "SELECT rowid AS id, query, embedding
+             FROM search_history ORDER BY rowid",
+        )
+        .load(&mut conn)
+        .context("loading search history")?;
+        if let Some(limit) = args.limit {
+            rows.truncate(limit);
+        }
+        let items = rows
+            .into_iter()
+            .map(|r| WorkItem {
+                id: r.id,
+                text: r.query,
+                old_embedding: r.embedding,
+            })
+            .collect();
+        let sims = reembed_table(
+            &mut conn,
+            &llm,
+            "search",
+            items,
+            args.dry_run,
+            |conn, id, emb| {
+                sql_query("UPDATE search_history SET embedding = ?1 WHERE rowid = ?2")
+                    .bind::<diesel::sql_types::Binary, _>(emb)
+                    .bind::<diesel::sql_types::BigInt, _>(id)
+                    .execute(conn)?;
+                Ok(())
+            },
+        )
+        .await?;
+        report_similarity("search", sims);
+    }
+
+    // --- entities (knowledge graph): only rows that already have an embedding ---
+    if tables.contains(&"entities") {
+        let mut rows: Vec<EntityRow> = sql_query(
+            "SELECT id, name, description, embedding
+             FROM entities WHERE embedding IS NOT NULL ORDER BY id",
+        )
+        .load(&mut conn)
+        .context("loading knowledge entities")?;
+        if let Some(limit) = args.limit {
+            rows.truncate(limit);
+        }
+        let items = rows
+            .into_iter()
+            .map(|r| WorkItem {
+                id: r.id as i64,
+                // Same text construction as tool_store_entity.
+                text: format!("{} {}", r.name, r.description),
+                old_embedding: r.embedding,
+            })
+            .collect();
+        let sims = reembed_table(
+            &mut conn,
+            &llm,
+            "entities",
+            items,
+            args.dry_run,
+            |conn, id, emb| {
+                sql_query("UPDATE entities SET embedding = ?1 WHERE id = ?2")
+                    .bind::<diesel::sql_types::Binary, _>(emb)
+                    .bind::<diesel::sql_types::Integer, _>(id as i32)
+                    .execute(conn)?;
+                Ok(())
+            },
+        )
+        .await?;
+        report_similarity("entities", sims);
+    }
+
+    println!(
+        "\n{}",
+        if args.dry_run {
+            "Dry run complete"
+        } else {
+            "Done"
+        }
+    );
+    Ok(())
+}