efd05db523
Trialing Qwen3-Embedding-0.6B (1024-dim, instruct-prefixed queries) against nomic required code changes at every hardcoded seam; now it's a config flip plus a reembed_embeddings run. - EMBEDDING_DIM env (default 768) replaces every hardcoded dim check: daily summary / calendar / search / location DAOs, Ollama batch validation, reembed_embeddings - entities gains the dim guard it never had — a wrong-dim vector silently kills dedup/recall (cosine over mismatched lengths is 0), so store None and warn instead - embed_query / embed_document split with EMBED_QUERY_PREFIX / EMBED_DOCUMENT_PREFIX (literal \n expanded): retrieval models treat the two sides differently — nomic wants search_query:/search_document:, Qwen3 wants Instruct:...\nQuery: on queries only. All query-side call sites and all corpus writers now declare their side. - document the contract in CLAUDE.md: change the model or any of these vars → re-run reembed_embeddings or search is garbage Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
466 lines
15 KiB
Rust
466 lines
15 KiB
Rust
//! Re-embed stored corpora through `LocalLlm`, i.e. the same
|
|
//! `LLM_BACKEND` dispatch the query side uses. The original import /
|
|
//! backfill tools always embedded via Ollama, so a deploy running
|
|
//! `LLM_BACKEND=llamacpp` queries vector spaces the corpora may not live
|
|
//! in. Four tables share the problem and are all covered here:
|
|
//!
|
|
//! - `daily_conversation_summaries` — re-embeds
|
|
//! `strip_summary_boilerplate(summary)` (what the original job fed the
|
|
//! embedder); also rewrites `model_version`.
|
|
//! - `calendar_events` — re-embeds "summary description location" exactly
|
|
//! as `import_calendar` does; rows without an embedding are skipped (the
|
|
//! import only embeds under `--generate-embeddings`).
|
|
//! - `search_history` — re-embeds the raw query text.
|
|
//! - `entities` (knowledge graph) — re-embeds "name description" exactly as
|
|
//! `tool_store_entity` does; embedding-less rows are skipped (embedding
|
|
//! is best-effort at store time).
|
|
//!
|
|
//! Source text is untouched — only vectors are rewritten. The old↔new
|
|
//! cosine report doubles as a diagnostic: ~1.0 means both backends already
|
|
//! shared a space (re-embedding was a no-op); low values confirm the
|
|
//! mismatch this tool exists to fix.
|
|
|
|
use anyhow::{Context, Result};
|
|
use clap::Parser;
|
|
use diesel::prelude::*;
|
|
use diesel::sql_query;
|
|
use diesel::sqlite::SqliteConnection;
|
|
use image_api::ai::{LocalLlm, strip_summary_boilerplate};
|
|
use image_api::bin_progress;
|
|
use std::env;
|
|
|
|
#[derive(Parser, Debug)]
|
|
#[command(author, version, about = "Re-embed stored corpora via the configured LLM_BACKEND", long_about = None)]
|
|
struct Args {
|
|
/// Comma-separated tables to process: summaries, calendar, search, entities
|
|
#[arg(long, default_value = "summaries,calendar,search,entities")]
|
|
tables: String,
|
|
|
|
/// Only process the first N rows per table (smoke test)
|
|
#[arg(long)]
|
|
limit: Option<usize>,
|
|
|
|
/// Compute embeddings and report old↔new similarity without writing
|
|
#[arg(long, default_value_t = false)]
|
|
dry_run: bool,
|
|
}
|
|
|
|
#[derive(QueryableByName)]
|
|
struct SummaryRow {
|
|
#[diesel(sql_type = diesel::sql_types::Integer)]
|
|
id: i32,
|
|
#[diesel(sql_type = diesel::sql_types::Text)]
|
|
summary: String,
|
|
#[diesel(sql_type = diesel::sql_types::Binary)]
|
|
embedding: Vec<u8>,
|
|
#[diesel(sql_type = diesel::sql_types::Text)]
|
|
model_version: String,
|
|
}
|
|
|
|
#[derive(QueryableByName)]
|
|
struct CalendarRow {
|
|
#[diesel(sql_type = diesel::sql_types::Integer)]
|
|
id: i32,
|
|
#[diesel(sql_type = diesel::sql_types::Text)]
|
|
summary: String,
|
|
#[diesel(sql_type = diesel::sql_types::Nullable<diesel::sql_types::Text>)]
|
|
description: Option<String>,
|
|
#[diesel(sql_type = diesel::sql_types::Nullable<diesel::sql_types::Text>)]
|
|
location: Option<String>,
|
|
#[diesel(sql_type = diesel::sql_types::Binary)]
|
|
embedding: Vec<u8>,
|
|
}
|
|
|
|
#[derive(QueryableByName)]
|
|
struct SearchRow {
|
|
#[diesel(sql_type = diesel::sql_types::BigInt)]
|
|
id: i64,
|
|
#[diesel(sql_type = diesel::sql_types::Text)]
|
|
query: String,
|
|
#[diesel(sql_type = diesel::sql_types::Binary)]
|
|
embedding: Vec<u8>,
|
|
}
|
|
|
|
#[derive(QueryableByName)]
|
|
struct EntityRow {
|
|
#[diesel(sql_type = diesel::sql_types::Integer)]
|
|
id: i32,
|
|
#[diesel(sql_type = diesel::sql_types::Text)]
|
|
name: String,
|
|
#[diesel(sql_type = diesel::sql_types::Text)]
|
|
description: String,
|
|
#[diesel(sql_type = diesel::sql_types::Binary)]
|
|
embedding: Vec<u8>,
|
|
}
|
|
|
|
/// A table-agnostic re-embedding job: every supported table is mapped onto
/// this shape before being handed to `reembed_table`.
struct WorkItem {
    /// Key of the row to update. Widened to `i64` so that both the `i32`
    /// primary keys and SQLite rowids are representable.
    id: i64,
    /// The exact text to embed. It has to be built the same way the original
    /// writer built it, otherwise the new vector describes different content.
    text: String,
    /// Raw bytes of the vector currently stored for the row; only used to
    /// compute the old↔new similarity report.
    old_embedding: Vec<u8>,
}
|
|
|
|
/// Decodes a blob of little-endian `f32`s into a vector.
///
/// Returns `None` when the byte length is not a whole number of 4-byte
/// floats; an empty blob decodes to an empty vector.
fn deserialize_vector(bytes: &[u8]) -> Option<Vec<f32>> {
    let words = bytes.chunks_exact(4);
    // Leftover bytes mean the blob cannot be a packed f32 array.
    if !words.remainder().is_empty() {
        return None;
    }
    let floats = words
        .map(|w| f32::from_le_bytes([w[0], w[1], w[2], w[3]]))
        .collect();
    Some(floats)
}
|
|
|
|
/// Encodes a vector as a packed blob of little-endian `f32`s (4 bytes per
/// element), the inverse of `deserialize_vector`.
fn serialize_vector(vec: &[f32]) -> Vec<u8> {
    let mut bytes = Vec::with_capacity(vec.len() * 4);
    for value in vec {
        bytes.extend_from_slice(&value.to_le_bytes());
    }
    bytes
}
|
|
|
|
/// Cosine similarity of two vectors.
///
/// Returns `0.0` instead of failing when the comparison is undefined:
/// the lengths differ, or either vector has zero magnitude.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    // Euclidean norm, shared by both operands.
    let norm = |v: &[f32]| v.iter().map(|x| x * x).sum::<f32>().sqrt();
    let (norm_a, norm_b) = (norm(a), norm(b));
    if norm_a == 0.0 || norm_b == 0.0 {
        return 0.0;
    }

    let dot: f32 = a.iter().zip(b.iter()).map(|(x, y)| x * y).sum();
    dot / (norm_a * norm_b)
}
|
|
|
|
/// Embed `text`, halving it on "input too large" errors until it fits the
|
|
/// server's physical batch (`--ubatch-size`). Mirrors the silent truncation
|
|
/// Ollama applied when these corpora were first embedded — llama-server
|
|
/// returns a 500 instead — except here it's surfaced via the returned flag.
|
|
/// Returns `(embedding, truncated)`.
|
|
async fn embed_with_truncation(llm: &LocalLlm, text: &str) -> Result<(Vec<f32>, bool)> {
|
|
let mut text = text.to_string();
|
|
let mut truncated = false;
|
|
loop {
|
|
match llm.embed_document(&text).await {
|
|
Ok(emb) => return Ok((emb, truncated)),
|
|
Err(e)
|
|
if e.to_string().contains("too large to process") && text.chars().count() > 64 =>
|
|
{
|
|
let keep = text.chars().count() / 2;
|
|
text = text.chars().take(keep).collect();
|
|
truncated = true;
|
|
}
|
|
Err(e) => return Err(e),
|
|
}
|
|
}
|
|
}
|
|
|
|
/// Re-embed `items`, writing each new vector via `update`. Returns the
|
|
/// old↔new cosines for the similarity report.
|
|
async fn reembed_table(
|
|
conn: &mut SqliteConnection,
|
|
llm: &LocalLlm,
|
|
label: &str,
|
|
items: Vec<WorkItem>,
|
|
dry_run: bool,
|
|
update: impl Fn(&mut SqliteConnection, i64, Vec<u8>) -> Result<()>,
|
|
) -> Result<Vec<f32>> {
|
|
println!("\n[{}] re-embedding {} rows...", label, items.len());
|
|
let pb = bin_progress::determinate(items.len() as u64, format!("re-embedding {}", label));
|
|
|
|
let mut sims: Vec<f32> = Vec::with_capacity(items.len());
|
|
let mut updated = 0usize;
|
|
let mut failed = 0usize;
|
|
let mut truncated_count = 0usize;
|
|
|
|
for item in &items {
|
|
let new_emb = match embed_with_truncation(llm, &item.text).await {
|
|
Ok((e, truncated)) => {
|
|
if truncated {
|
|
truncated_count += 1;
|
|
pb.println(format!(
|
|
"⚠ {} id={}: input exceeded the embed server's batch size, \
|
|
truncated before embedding",
|
|
label, item.id
|
|
));
|
|
}
|
|
e
|
|
}
|
|
Err(e) => {
|
|
pb.inc(1);
|
|
failed += 1;
|
|
eprintln!("✗ {} id={}: {}", label, item.id, e);
|
|
continue;
|
|
}
|
|
};
|
|
|
|
// The whole pipeline (DAO checks, stored corpora) assumes
|
|
// EMBEDDING_DIM dims. A mismatch means the active embed slot is not
|
|
// serving the configured model — stop rather than corrupt the table.
|
|
anyhow::ensure!(
|
|
new_emb.len() == image_api::ai::embedding_dim(),
|
|
"backend returned {}-dim embedding (expected {}) — '{}' does not \
|
|
match the configured EMBEDDING_DIM",
|
|
new_emb.len(),
|
|
image_api::ai::embedding_dim(),
|
|
llm.embedding_model_version()
|
|
);
|
|
|
|
if let Some(old_emb) = deserialize_vector(&item.old_embedding) {
|
|
sims.push(cosine_similarity(&old_emb, &new_emb));
|
|
}
|
|
|
|
if !dry_run {
|
|
update(conn, item.id, serialize_vector(&new_emb))
|
|
.with_context(|| format!("updating {} id={}", label, item.id))?;
|
|
}
|
|
updated += 1;
|
|
pb.inc(1);
|
|
}
|
|
pb.finish_and_clear();
|
|
|
|
println!(
|
|
"[{}] {} re-embedded ({} truncated), {} failed",
|
|
label, updated, truncated_count, failed
|
|
);
|
|
Ok(sims)
|
|
}
|
|
|
|
/// Prints min / median / mean / max of the old↔new cosine similarities for
/// one table, followed by a one-line interpretation keyed on the median:
///
/// - above 0.98: both backends already shared a vector space;
/// - above 0.9: same model family with measurable drift;
/// - otherwise: the stored vectors lived in a different space.
///
/// Prints a short notice and returns when `sims` is empty.
fn report_similarity(label: &str, mut sims: Vec<f32>) {
    if sims.is_empty() {
        println!("[{}] no old↔new pairs to compare", label);
        return;
    }

    // `total_cmp` is a genuine total order, NaN included. A comparator that
    // treats NaN as equal to everything is not, and the slice sort functions
    // are allowed to panic when handed an inconsistent ordering.
    sims.sort_unstable_by(f32::total_cmp);

    let count = sims.len();
    let mean: f32 = sims.iter().sum::<f32>() / count as f32;
    let mid = count / 2;
    // For an even count the median is the average of the two middle values,
    // not the upper one.
    let median = if count % 2 == 0 {
        (sims[mid - 1] + sims[mid]) / 2.0
    } else {
        sims[mid]
    };
    let (min, max) = (sims[0], sims[count - 1]);

    println!(
        "[{}] old↔new cosine over identical text: min={:.3} median={:.3} mean={:.3} max={:.3}",
        label, min, median, mean, max
    );

    if median > 0.98 {
        println!(
            "[{}] → old and new backends agree (~same vector space); poor search \
             results are coming from something else (prefixes, thresholds, corpus).",
            label
        );
    } else if median > 0.9 {
        println!(
            "[{}] → same model family but measurably different vectors \
             (quantization / runtime drift); re-embedding was worthwhile.",
            label
        );
    } else {
        println!(
            "[{}] → vector-space mismatch confirmed — queries were searching a \
             different space than the corpus. This re-embed should fix it.",
            label
        );
    }
}
|
|
|
|
#[tokio::main]
|
|
async fn main() -> Result<()> {
|
|
dotenv::dotenv().ok();
|
|
env_logger::init();
|
|
let args = Args::parse();
|
|
|
|
let tables: Vec<&str> = args.tables.split(',').map(|t| t.trim()).collect();
|
|
for t in &tables {
|
|
anyhow::ensure!(
|
|
matches!(*t, "summaries" | "calendar" | "search" | "entities"),
|
|
"unknown table '{}' — expected summaries, calendar, search, entities",
|
|
t
|
|
);
|
|
}
|
|
|
|
let database_url = env::var("DATABASE_URL").unwrap_or_else(|_| "auth.db".to_string());
|
|
println!("Database: {}", database_url);
|
|
|
|
let mut conn = SqliteConnection::establish(&database_url)
|
|
.with_context(|| format!("connecting to {}", database_url))?;
|
|
|
|
let llm = LocalLlm::from_env();
|
|
let model_version = llm.embedding_model_version();
|
|
println!("Embedding via '{}'", model_version);
|
|
if args.dry_run {
|
|
println!("DRY RUN — no rows will be written");
|
|
}
|
|
|
|
if tables.contains(&"summaries") {
|
|
let mut rows: Vec<SummaryRow> = sql_query(
|
|
"SELECT id, summary, embedding, model_version
|
|
FROM daily_conversation_summaries ORDER BY date",
|
|
)
|
|
.load(&mut conn)
|
|
.context("loading daily summaries")?;
|
|
if let Some(limit) = args.limit {
|
|
rows.truncate(limit);
|
|
}
|
|
if let Some(first) = rows.first() {
|
|
println!(
|
|
"\n[summaries] previous model_version '{}' → '{}'",
|
|
first.model_version, model_version
|
|
);
|
|
}
|
|
let items = rows
|
|
.into_iter()
|
|
.map(|r| WorkItem {
|
|
id: r.id as i64,
|
|
text: strip_summary_boilerplate(&r.summary),
|
|
old_embedding: r.embedding,
|
|
})
|
|
.collect();
|
|
let mv = model_version.clone();
|
|
let sims = reembed_table(
|
|
&mut conn,
|
|
&llm,
|
|
"summaries",
|
|
items,
|
|
args.dry_run,
|
|
move |conn, id, emb| {
|
|
sql_query(
|
|
"UPDATE daily_conversation_summaries
|
|
SET embedding = ?1, model_version = ?2 WHERE id = ?3",
|
|
)
|
|
.bind::<diesel::sql_types::Binary, _>(emb)
|
|
.bind::<diesel::sql_types::Text, _>(&mv)
|
|
.bind::<diesel::sql_types::Integer, _>(id as i32)
|
|
.execute(conn)?;
|
|
Ok(())
|
|
},
|
|
)
|
|
.await?;
|
|
report_similarity("summaries", sims);
|
|
}
|
|
|
|
if tables.contains(&"calendar") {
|
|
let mut rows: Vec<CalendarRow> = sql_query(
|
|
"SELECT id, summary, description, location, embedding
|
|
FROM calendar_events WHERE embedding IS NOT NULL ORDER BY id",
|
|
)
|
|
.load(&mut conn)
|
|
.context("loading calendar events")?;
|
|
if let Some(limit) = args.limit {
|
|
rows.truncate(limit);
|
|
}
|
|
let items = rows
|
|
.into_iter()
|
|
.map(|r| WorkItem {
|
|
id: r.id as i64,
|
|
// Same text construction as import_calendar.
|
|
text: format!(
|
|
"{} {} {}",
|
|
r.summary,
|
|
r.description.as_deref().unwrap_or(""),
|
|
r.location.as_deref().unwrap_or("")
|
|
),
|
|
old_embedding: r.embedding,
|
|
})
|
|
.collect();
|
|
let sims = reembed_table(
|
|
&mut conn,
|
|
&llm,
|
|
"calendar",
|
|
items,
|
|
args.dry_run,
|
|
|conn, id, emb| {
|
|
sql_query("UPDATE calendar_events SET embedding = ?1 WHERE id = ?2")
|
|
.bind::<diesel::sql_types::Binary, _>(emb)
|
|
.bind::<diesel::sql_types::Integer, _>(id as i32)
|
|
.execute(conn)?;
|
|
Ok(())
|
|
},
|
|
)
|
|
.await?;
|
|
report_similarity("calendar", sims);
|
|
}
|
|
|
|
if tables.contains(&"search") {
|
|
let mut rows: Vec<SearchRow> = sql_query(
|
|
"SELECT rowid AS id, query, embedding
|
|
FROM search_history ORDER BY rowid",
|
|
)
|
|
.load(&mut conn)
|
|
.context("loading search history")?;
|
|
if let Some(limit) = args.limit {
|
|
rows.truncate(limit);
|
|
}
|
|
let items = rows
|
|
.into_iter()
|
|
.map(|r| WorkItem {
|
|
id: r.id,
|
|
text: r.query,
|
|
old_embedding: r.embedding,
|
|
})
|
|
.collect();
|
|
let sims = reembed_table(
|
|
&mut conn,
|
|
&llm,
|
|
"search",
|
|
items,
|
|
args.dry_run,
|
|
|conn, id, emb| {
|
|
sql_query("UPDATE search_history SET embedding = ?1 WHERE rowid = ?2")
|
|
.bind::<diesel::sql_types::Binary, _>(emb)
|
|
.bind::<diesel::sql_types::BigInt, _>(id)
|
|
.execute(conn)?;
|
|
Ok(())
|
|
},
|
|
)
|
|
.await?;
|
|
report_similarity("search", sims);
|
|
}
|
|
|
|
if tables.contains(&"entities") {
|
|
let mut rows: Vec<EntityRow> = sql_query(
|
|
"SELECT id, name, description, embedding
|
|
FROM entities WHERE embedding IS NOT NULL ORDER BY id",
|
|
)
|
|
.load(&mut conn)
|
|
.context("loading knowledge entities")?;
|
|
if let Some(limit) = args.limit {
|
|
rows.truncate(limit);
|
|
}
|
|
let items = rows
|
|
.into_iter()
|
|
.map(|r| WorkItem {
|
|
id: r.id as i64,
|
|
// Same text construction as tool_store_entity.
|
|
text: format!("{} {}", r.name, r.description),
|
|
old_embedding: r.embedding,
|
|
})
|
|
.collect();
|
|
let sims = reembed_table(
|
|
&mut conn,
|
|
&llm,
|
|
"entities",
|
|
items,
|
|
args.dry_run,
|
|
|conn, id, emb| {
|
|
sql_query("UPDATE entities SET embedding = ?1 WHERE id = ?2")
|
|
.bind::<diesel::sql_types::Binary, _>(emb)
|
|
.bind::<diesel::sql_types::Integer, _>(id as i32)
|
|
.execute(conn)?;
|
|
Ok(())
|
|
},
|
|
)
|
|
.await?;
|
|
report_similarity("entities", sims);
|
|
}
|
|
|
|
println!(
|
|
"\n{}",
|
|
if args.dry_run {
|
|
"Dry run complete"
|
|
} else {
|
|
"Done"
|
|
}
|
|
);
|
|
Ok(())
|
|
}
|