ImageApi/src/bin/reembed_embeddings.rs

//! Re-embed stored corpora through `LocalLlm`, i.e. the same
//! `LLM_BACKEND` dispatch the query side uses. The original import /
//! backfill tools always embedded via Ollama, so a deploy running
//! `LLM_BACKEND=llamacpp` queries vector spaces the corpora may not live
//! in. Four tables share the problem and are all covered here:
//!
//! - `daily_conversation_summaries` — re-embeds
//!   `strip_summary_boilerplate(summary)` (what the original job fed the
//!   embedder); also rewrites `model_version`.
//! - `calendar_events` — re-embeds "summary description location" exactly
//!   as `import_calendar` does; rows without an embedding are skipped (the
//!   import only embeds under `--generate-embeddings`).
//! - `search_history` — re-embeds the raw query text.
//! - `entities` (knowledge graph) — re-embeds "name description" exactly as
//!   `tool_store_entity` does; embedding-less rows are skipped (embedding
//!   is best-effort at store time).
//!
//! Source text is untouched — only vectors are rewritten. The old↔new
//! cosine report doubles as a diagnostic: ~1.0 means both backends already
//! shared a space (re-embedding was a no-op); low values confirm the
//! mismatch this tool exists to fix.

use anyhow::{Context, Result};
use clap::Parser;
use diesel::prelude::*;
use diesel::sql_query;
use diesel::sqlite::SqliteConnection;
use image_api::ai::{LocalLlm, strip_summary_boilerplate};
use image_api::bin_progress;
use std::env;

// Command-line options, parsed with clap's derive API. The `///` comments on
// the fields double as each flag's `--help` text, so maintainer-only notes in
// this struct are written as plain `//` comments.
#[derive(Parser, Debug)]
#[command(author, version, about = "Re-embed stored corpora via the configured LLM_BACKEND", long_about = None)]
struct Args {
    /// Comma-separated tables to process: summaries, calendar, search, entities
    // Split on ',' and trimmed in `main`; any name outside the four listed
    // above aborts the run before the database is opened.
    #[arg(long, default_value = "summaries,calendar,search,entities")]
    tables: String,

    /// Only process the first N rows per table (smoke test)
    // Applied in `main` by truncating each table's already-loaded rows, so
    // "first" follows that table's ORDER BY.
    #[arg(long)]
    limit: Option<usize>,

    /// Compute embeddings and report old↔new similarity without writing
    // When set, `reembed_table` still embeds every row but never invokes the
    // per-table update callback.
    #[arg(long, default_value_t = false)]
    dry_run: bool,
}

/// One row of `daily_conversation_summaries`, shaped to match the raw
/// `SELECT id, summary, embedding, model_version` issued in `main`.
#[derive(QueryableByName)]
struct SummaryRow {
    /// Row id; used in the `WHERE id = ...` of the write-back `UPDATE`.
    #[diesel(sql_type = diesel::sql_types::Integer)]
    id: i32,
    /// Stored summary text. It is passed through `strip_summary_boilerplate`
    /// before being embedded.
    #[diesel(sql_type = diesel::sql_types::Text)]
    summary: String,
    /// Existing embedding bytes, kept for the old↔new similarity report.
    /// NOTE(review): declared non-nullable and the query has no
    /// `embedding IS NOT NULL` filter — presumably the column is NOT NULL;
    /// confirm against the schema.
    #[diesel(sql_type = diesel::sql_types::Binary)]
    embedding: Vec<u8>,
    /// Model tag currently stored on the row. Only the first row's value is
    /// read, for the "previous → new" log line; the column itself is
    /// overwritten on write-back.
    #[diesel(sql_type = diesel::sql_types::Text)]
    model_version: String,
}

/// One row of `calendar_events`, shaped to match the raw
/// `SELECT id, summary, description, location, embedding` issued in `main`
/// (which only selects rows whose embedding is not NULL).
#[derive(QueryableByName)]
struct CalendarRow {
    /// Row id; used in the `WHERE id = ...` of the write-back `UPDATE`.
    #[diesel(sql_type = diesel::sql_types::Integer)]
    id: i32,
    /// Event summary; first component of the embedded text.
    #[diesel(sql_type = diesel::sql_types::Text)]
    summary: String,
    /// Optional description; a NULL contributes an empty string to the
    /// embedded text.
    #[diesel(sql_type = diesel::sql_types::Nullable<diesel::sql_types::Text>)]
    description: Option<String>,
    /// Optional location; a NULL contributes an empty string to the
    /// embedded text.
    #[diesel(sql_type = diesel::sql_types::Nullable<diesel::sql_types::Text>)]
    location: Option<String>,
    /// Existing embedding bytes, kept for the old↔new similarity report.
    #[diesel(sql_type = diesel::sql_types::Binary)]
    embedding: Vec<u8>,
}

/// One row of `search_history`, shaped to match the raw
/// `SELECT rowid AS id, query, embedding` issued in `main`.
#[derive(QueryableByName)]
struct SearchRow {
    /// SQLite `rowid`, selected under the alias `id`; the write-back
    /// `UPDATE` addresses the row by `rowid` as well.
    #[diesel(sql_type = diesel::sql_types::BigInt)]
    id: i64,
    /// Raw query text; embedded as-is.
    #[diesel(sql_type = diesel::sql_types::Text)]
    query: String,
    /// Existing embedding bytes, kept for the old↔new similarity report.
    /// NOTE(review): declared non-nullable and the query has no
    /// `embedding IS NOT NULL` filter — presumably the column is NOT NULL;
    /// confirm against the schema.
    #[diesel(sql_type = diesel::sql_types::Binary)]
    embedding: Vec<u8>,
}

/// One row of `entities`, shaped to match the raw
/// `SELECT id, name, description, embedding` issued in `main` (which only
/// selects rows whose embedding is not NULL).
#[derive(QueryableByName)]
struct EntityRow {
    /// Row id; used in the `WHERE id = ...` of the write-back `UPDATE`.
    #[diesel(sql_type = diesel::sql_types::Integer)]
    id: i32,
    /// Entity name; first component of the embedded "name description" text.
    #[diesel(sql_type = diesel::sql_types::Text)]
    name: String,
    /// Entity description; second component of the embedded text.
    /// NOTE(review): declared non-nullable — a NULL description would
    /// presumably fail the load; confirm the column is NOT NULL.
    #[diesel(sql_type = diesel::sql_types::Text)]
    description: String,
    /// Existing embedding bytes, kept for the old↔new similarity report.
    #[diesel(sql_type = diesel::sql_types::Binary)]
    embedding: Vec<u8>,
}

/// One unit of re-embed work, normalized across tables.
///
/// `main` maps each table's row type into this shape so that
/// `reembed_table` can process every table with the same loop; the
/// table-specific part (the `UPDATE` statement) is supplied separately as a
/// callback keyed by `id`.
struct WorkItem {
    /// Row key, as i64 so both i32 ids and rowids fit. Handed back to the
    /// update callback unchanged.
    id: i64,
    /// Text fed to the embedder — must match what the original writer used.
    text: String,
    /// Existing vector bytes, for the old↔new similarity report. Never
    /// written back; bytes that do not decode are simply left out of the
    /// report.
    old_embedding: Vec<u8>,
}

/// Decode a stored embedding blob into its `f32` components.
///
/// The blob is read as consecutive 4-byte little-endian floats. Returns
/// `None` when the byte length is not a whole number of 4-byte words; an
/// empty blob decodes to an empty vector.
fn deserialize_vector(bytes: &[u8]) -> Option<Vec<f32>> {
    let words = bytes.chunks_exact(4);
    // Any leftover bytes mean the blob is not a clean sequence of f32s.
    if !words.remainder().is_empty() {
        return None;
    }
    let floats = words
        .map(|w| f32::from_le_bytes([w[0], w[1], w[2], w[3]]))
        .collect();
    Some(floats)
}

/// Encode an embedding as a byte blob: each component becomes its 4-byte
/// little-endian representation, in order.
fn serialize_vector(vec: &[f32]) -> Vec<u8> {
    let mut blob = Vec::with_capacity(vec.len() * 4);
    for component in vec {
        blob.extend_from_slice(&component.to_le_bytes());
    }
    blob
}

/// Cosine of the angle between `a` and `b`.
///
/// Returns `0.0` instead of an undefined value when the slices differ in
/// length or when either has zero magnitude (which includes empty slices).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    // Euclidean length of a vector.
    let norm = |v: &[f32]| -> f32 { v.iter().map(|c| c * c).sum::<f32>().sqrt() };

    if a.len() != b.len() {
        return 0.0;
    }
    let (norm_a, norm_b) = (norm(a), norm(b));
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }
    let dot: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    dot / (norm_a * norm_b)
}

/// Embed `text`, halving it on "input too large" errors until it fits the
/// server's physical batch (`--ubatch-size`). Mirrors the silent truncation
/// Ollama applied when these corpora were first embedded — llama-server
/// returns a 500 instead — except here it's surfaced via the returned flag.
///
/// Returns `(embedding, truncated)`, where `truncated` is `true` if at least
/// one halving was needed.
///
/// # Errors
///
/// Returns the backend's error unchanged when it is not an
/// input-too-large error, or when the text has already been cut down to 64
/// characters or fewer and still does not fit.
async fn embed_with_truncation(llm: &LocalLlm, text: &str) -> Result<(Vec<f32>, bool)> {
    // Substring that identifies the backend's input-too-large error.
    const TOO_LARGE_MARKER: &str = "too large to process";
    // Stop halving at this many characters; a failure below it is reported
    // rather than retried.
    const MIN_CHARS: usize = 64;

    let mut text = text.to_string();
    let mut truncated = false;
    loop {
        let err = match llm.embed_document(&text).await {
            Ok(emb) => return Ok((emb, truncated)),
            Err(e) => e,
        };

        // `Display` on an anyhow error shows only the outermost message, so a
        // marker sitting under a `.context(...)` layer would be missed.
        // Walk the whole cause chain (which starts with the error itself).
        let too_large = err
            .chain()
            .any(|cause| cause.to_string().contains(TOO_LARGE_MARKER));

        // Count characters once; halving by chars keeps the cut on a valid
        // UTF-8 boundary.
        let char_len = text.chars().count();
        if !too_large || char_len <= MIN_CHARS {
            return Err(err);
        }

        text = text.chars().take(char_len / 2).collect();
        truncated = true;
    }
}

/// Re-embed `items`, writing each new vector via `update`. Returns the
/// old↔new cosines for the similarity report.
async fn reembed_table(
    conn: &mut SqliteConnection,
    llm: &LocalLlm,
    label: &str,
    items: Vec<WorkItem>,
    dry_run: bool,
    update: impl Fn(&mut SqliteConnection, i64, Vec<u8>) -> Result<()>,
) -> Result<Vec<f32>> {
    println!("\n[{}] re-embedding {} rows...", label, items.len());
    let pb = bin_progress::determinate(items.len() as u64, format!("re-embedding {}", label));

    let mut sims: Vec<f32> = Vec::with_capacity(items.len());
    let mut updated = 0usize;
    let mut failed = 0usize;
    let mut truncated_count = 0usize;

    for item in &items {
        let new_emb = match embed_with_truncation(llm, &item.text).await {
            Ok((e, truncated)) => {
                if truncated {
                    truncated_count += 1;
                    pb.println(format!(
                        "⚠ {} id={}: input exceeded the embed server's batch size, \
                         truncated before embedding",
                        label, item.id
                    ));
                }
                e
            }
            Err(e) => {
                pb.inc(1);
                failed += 1;
                eprintln!("✗ {} id={}: {}", label, item.id, e);
                continue;
            }
        };

        // The whole pipeline (DAO checks, stored corpora) assumes
        // EMBEDDING_DIM dims. A mismatch means the active embed slot is not
        // serving the configured model — stop rather than corrupt the table.
        anyhow::ensure!(
            new_emb.len() == image_api::ai::embedding_dim(),
            "backend returned {}-dim embedding (expected {}) — '{}' does not \
             match the configured EMBEDDING_DIM",
            new_emb.len(),
            image_api::ai::embedding_dim(),
            llm.embedding_model_version()
        );

        if let Some(old_emb) = deserialize_vector(&item.old_embedding) {
            sims.push(cosine_similarity(&old_emb, &new_emb));
        }

        if !dry_run {
            update(conn, item.id, serialize_vector(&new_emb))
                .with_context(|| format!("updating {} id={}", label, item.id))?;
        }
        updated += 1;
        pb.inc(1);
    }
    pb.finish_and_clear();

    println!(
        "[{}] {} re-embedded ({} truncated), {} failed",
        label, updated, truncated_count, failed
    );
    Ok(sims)
}

/// Print a min / median / mean / max summary of the old↔new cosine
/// similarities for one table, followed by a one-line interpretation.
///
/// * `label` — table name used as the `[label]` prefix on every line.
/// * `sims` — similarities collected while re-embedding; may be empty, in
///   which case a single "nothing to compare" line is printed.
///
/// The interpretation is keyed on the median: above 0.98 the two backends
/// are treated as sharing a vector space, above 0.9 as the same model family
/// with drift, and anything else as a confirmed mismatch. For an even number
/// of values the "median" is the upper of the two middle elements.
fn report_similarity(label: &str, mut sims: Vec<f32>) {
    if sims.is_empty() {
        println!("[{}] no old↔new pairs to compare", label);
        return;
    }

    // `total_cmp` is a genuine total order even when a NaN slipped in.
    // `partial_cmp` with an `Equal` fallback is not, and sorting with a
    // comparator that is not a total order leaves the result unspecified and
    // may panic.
    sims.sort_unstable_by(f32::total_cmp);

    let count = sims.len();
    let lowest = sims[0];
    let highest = sims[count - 1];
    let median = sims[count / 2];
    let mean: f32 = sims.iter().sum::<f32>() / count as f32;

    println!(
        "[{}] old↔new cosine over identical text: min={:.3} median={:.3} mean={:.3} max={:.3}",
        label, lowest, median, mean, highest
    );

    if median > 0.98 {
        println!(
            "[{}] → old and new backends agree (~same vector space); poor search \
             results are coming from something else (prefixes, thresholds, corpus).",
            label
        );
    } else if median > 0.9 {
        println!(
            "[{}] → same model family but measurably different vectors \
             (quantization / runtime drift); re-embedding was worthwhile.",
            label
        );
    } else {
        println!(
            "[{}] → vector-space mismatch confirmed — queries were searching a \
             different space than the corpus. This re-embed should fix it.",
            label
        );
    }
}

#[tokio::main]
async fn main() -> Result<()> {
    dotenv::dotenv().ok();
    env_logger::init();
    let args = Args::parse();

    let tables: Vec<&str> = args.tables.split(',').map(|t| t.trim()).collect();
    for t in &tables {
        anyhow::ensure!(
            matches!(*t, "summaries" | "calendar" | "search" | "entities"),
            "unknown table '{}' — expected summaries, calendar, search, entities",
            t
        );
    }

    let database_url = env::var("DATABASE_URL").unwrap_or_else(|_| "auth.db".to_string());
    println!("Database: {}", database_url);

    let mut conn = SqliteConnection::establish(&database_url)
        .with_context(|| format!("connecting to {}", database_url))?;

    let llm = LocalLlm::from_env();
    let model_version = llm.embedding_model_version();
    println!("Embedding via '{}'", model_version);
    if args.dry_run {
        println!("DRY RUN — no rows will be written");
    }

    if tables.contains(&"summaries") {
        let mut rows: Vec<SummaryRow> = sql_query(
            "SELECT id, summary, embedding, model_version
             FROM daily_conversation_summaries ORDER BY date",
        )
        .load(&mut conn)
        .context("loading daily summaries")?;
        if let Some(limit) = args.limit {
            rows.truncate(limit);
        }
        if let Some(first) = rows.first() {
            println!(
                "\n[summaries] previous model_version '{}' → '{}'",
                first.model_version, model_version
            );
        }
        let items = rows
            .into_iter()
            .map(|r| WorkItem {
                id: r.id as i64,
                text: strip_summary_boilerplate(&r.summary),
                old_embedding: r.embedding,
            })
            .collect();
        let mv = model_version.clone();
        let sims = reembed_table(
            &mut conn,
            &llm,
            "summaries",
            items,
            args.dry_run,
            move |conn, id, emb| {
                sql_query(
                    "UPDATE daily_conversation_summaries
                     SET embedding = ?1, model_version = ?2 WHERE id = ?3",
                )
                .bind::<diesel::sql_types::Binary, _>(emb)
                .bind::<diesel::sql_types::Text, _>(&mv)
                .bind::<diesel::sql_types::Integer, _>(id as i32)
                .execute(conn)?;
                Ok(())
            },
        )
        .await?;
        report_similarity("summaries", sims);
    }

    if tables.contains(&"calendar") {
        let mut rows: Vec<CalendarRow> = sql_query(
            "SELECT id, summary, description, location, embedding
             FROM calendar_events WHERE embedding IS NOT NULL ORDER BY id",
        )
        .load(&mut conn)
        .context("loading calendar events")?;
        if let Some(limit) = args.limit {
            rows.truncate(limit);
        }
        let items = rows
            .into_iter()
            .map(|r| WorkItem {
                id: r.id as i64,
                // Same text construction as import_calendar.
                text: format!(
                    "{} {} {}",
                    r.summary,
                    r.description.as_deref().unwrap_or(""),
                    r.location.as_deref().unwrap_or("")
                ),
                old_embedding: r.embedding,
            })
            .collect();
        let sims = reembed_table(
            &mut conn,
            &llm,
            "calendar",
            items,
            args.dry_run,
            |conn, id, emb| {
                sql_query("UPDATE calendar_events SET embedding = ?1 WHERE id = ?2")
                    .bind::<diesel::sql_types::Binary, _>(emb)
                    .bind::<diesel::sql_types::Integer, _>(id as i32)
                    .execute(conn)?;
                Ok(())
            },
        )
        .await?;
        report_similarity("calendar", sims);
    }

    if tables.contains(&"search") {
        let mut rows: Vec<SearchRow> = sql_query(
            "SELECT rowid AS id, query, embedding
             FROM search_history ORDER BY rowid",
        )
        .load(&mut conn)
        .context("loading search history")?;
        if let Some(limit) = args.limit {
            rows.truncate(limit);
        }
        let items = rows
            .into_iter()
            .map(|r| WorkItem {
                id: r.id,
                text: r.query,
                old_embedding: r.embedding,
            })
            .collect();
        let sims = reembed_table(
            &mut conn,
            &llm,
            "search",
            items,
            args.dry_run,
            |conn, id, emb| {
                sql_query("UPDATE search_history SET embedding = ?1 WHERE rowid = ?2")
                    .bind::<diesel::sql_types::Binary, _>(emb)
                    .bind::<diesel::sql_types::BigInt, _>(id)
                    .execute(conn)?;
                Ok(())
            },
        )
        .await?;
        report_similarity("search", sims);
    }

    if tables.contains(&"entities") {
        let mut rows: Vec<EntityRow> = sql_query(
            "SELECT id, name, description, embedding
             FROM entities WHERE embedding IS NOT NULL ORDER BY id",
        )
        .load(&mut conn)
        .context("loading knowledge entities")?;
        if let Some(limit) = args.limit {
            rows.truncate(limit);
        }
        let items = rows
            .into_iter()
            .map(|r| WorkItem {
                id: r.id as i64,
                // Same text construction as tool_store_entity.
                text: format!("{} {}", r.name, r.description),
                old_embedding: r.embedding,
            })
            .collect();
        let sims = reembed_table(
            &mut conn,
            &llm,
            "entities",
            items,
            args.dry_run,
            |conn, id, emb| {
                sql_query("UPDATE entities SET embedding = ?1 WHERE id = ?2")
                    .bind::<diesel::sql_types::Binary, _>(emb)
                    .bind::<diesel::sql_types::Integer, _>(id as i32)
                    .execute(conn)?;
                Ok(())
            },
        )
        .await?;
        report_similarity("entities", sims);
    }

    println!(
        "\n{}",
        if args.dry_run {
            "Dry run complete"
        } else {
            "Done"
        }
    );
    Ok(())
}