From efd05db5234d0a28f6579a1169484bc933e1c0a7 Mon Sep 17 00:00:00 2001
From: Cameron Cordes <cameronc.dev@gmail.com>
Date: Thu, 11 Jun 2026 21:40:40 -0400
Subject: [PATCH] Make the embedding model swappable via env for A/B testing
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Trialing Qwen3-Embedding-0.6B (1024-dim, instruct-prefixed queries)
against nomic required code changes at every hardcoded seam; now it's a
config flip plus a reembed_embeddings run.

- EMBEDDING_DIM env (default 768) replaces every hardcoded dim check:
  daily summary / calendar / search / location DAOs, Ollama batch
  validation, reembed_embeddings
- entities gains the dim guard it never had — a wrong-dim vector
  silently kills dedup/recall (cosine over mismatched lengths is 0),
  so store None and warn instead
- embed_query / embed_document split with EMBED_QUERY_PREFIX /
  EMBED_DOCUMENT_PREFIX (literal \n expanded): retrieval models treat
  the two sides differently — nomic wants search_query:/search_document:,
  Qwen3 wants Instruct:...\nQuery: on queries only. All query-side
  call sites and all corpus writers now declare their side.
- document the contract in CLAUDE.md: change the model or any of these
  vars → re-run reembed_embeddings or search is garbage

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 CLAUDE.md                         |  8 +++++
 src/ai/insight_generator.rs       | 48 ++++++++++++++++++++---------
 src/ai/local_llm.rs               | 14 +++++----
 src/ai/mod.rs                     | 51 +++++++++++++++++++++++++++++++
 src/ai/ollama.rs                  |  9 +++---
 src/bin/import_calendar.rs        |  3 +-
 src/bin/import_search_history.rs  |  2 +-
 src/bin/reembed_embeddings.rs     | 15 ++++-----
 src/database/calendar_dao.rs      | 23 ++++++++------
 src/database/daily_summary_dao.rs | 21 +++++++------
 src/database/location_dao.rs      |  9 +++---
 src/database/search_dao.rs        | 23 ++++++++------
 12 files changed, 159 insertions(+), 67 deletions(-)
diff --git a/CLAUDE.md b/CLAUDE.md
index b63ed4c..816391b 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -645,6 +645,14 @@ OPENROUTER_APP_TITLE=ImageApi                  # Optional attribution header
 # re-embedding — mixed vector spaces break similarity search.
 LLM_BACKEND=ollama
 
+# Embedding model contract. Corpus and queries must be embedded by the same
+# model with matching prefixes. After changing the embed model or any of the
+# three variables below, run `cargo run --bin reembed_embeddings` (all tables)
+# or search is garbage. A literal \n in a prefix value expands to a newline.
+EMBEDDING_DIM=768           # 768 = nomic-embed-text v1.5; 1024 = Qwen3-Embedding-0.6B
+EMBED_QUERY_PREFIX=         # nomic: "search_query: " | Qwen3: "Instruct: <task>\nQuery: "
+EMBED_DOCUMENT_PREFIX=      # nomic: "search_document: " | Qwen3: leave empty
+
 # llama.cpp / llama-swap (used when LLM_BACKEND=llamacpp). OpenAI-compatible
 # proxy hosting one or more llama-server processes. Chat models receive
 # images directly via content-parts (all models assumed vision-capable).
diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs
index 1ac4db3..645caa2 100644
--- a/src/ai/insight_generator.rs
+++ b/src/ai/insight_generator.rs
@@ -535,7 +535,7 @@ impl InsightGenerator {
         // (`LLM_BACKEND` switch). Must match the backend that populated the
         // daily-summary embeddings or similarity search will be garbage.
         let query_embedding =
-            crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &query).await?;
+            crate::ai::embed_query(&self.ollama, self.llamacpp.as_deref(), &query).await?;
 
         // Search for similar daily summaries with time-based weighting
         // This prioritizes summaries temporally close to the query date
@@ -601,7 +601,7 @@ impl InsightGenerator {
         // Must use the same backend that populated the daily-summary
         // embeddings or similarity search is garbage (see embed_one docs).
         let query_embedding =
-            crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), query).await?;
+            crate::ai::embed_query(&self.ollama, self.llamacpp.as_deref(), query).await?;
 
         let mut summary_dao = self
             .daily_summary_dao
@@ -687,7 +687,7 @@ impl InsightGenerator {
         let calendar_cx = parent_cx.with_span(span);
 
         let query_embedding = if let Some(loc) = location {
-            match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), loc).await {
+            match crate::ai::embed_query(&self.ollama, self.llamacpp.as_deref(), loc).await {
                 Ok(emb) => Some(emb),
                 Err(e) => {
                     log::warn!("Failed to generate embedding for location '{}': {}", loc, e);
@@ -859,7 +859,8 @@ impl InsightGenerator {
         };
 
         let query_embedding =
-            match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &query_text).await {
+            match crate::ai::embed_query(&self.ollama, self.llamacpp.as_deref(), &query_text).await
+            {
                 Ok(emb) => emb,
                 Err(e) => {
                     log::warn!("Failed to generate search embedding: {}", e);
@@ -2942,17 +2943,34 @@ Return ONLY the summary, nothing else."#,
         // Generate embedding for name + description (best-effort) via the
         // configured local backend.
         let embed_text = format!("{} {}", name, description);
-        let embedding: Option<Vec<u8>> =
-            match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &embed_text).await {
-                Ok(vec) => {
-                    let bytes: Vec<u8> = vec.iter().flat_map(|f| f.to_le_bytes()).collect();
-                    Some(bytes)
-                }
-                Err(e) => {
-                    log::warn!("Embedding generation failed for entity '{}': {}", name, e);
-                    None
-                }
-            };
+        let embedding: Option<Vec<u8>> = match crate::ai::embed_document(
+            &self.ollama,
+            self.llamacpp.as_deref(),
+            &embed_text,
+        )
+        .await
+        {
+            // The entities table has no dim check at the DAO layer, and a
+            // wrong-dim vector silently kills dedup/recall (cosine over
+            // mismatched lengths is 0) — guard here, store None instead.
+            Ok(vec) if vec.len() == crate::ai::embedding_dim() => {
+                let bytes: Vec<u8> = vec.iter().flat_map(|f| f.to_le_bytes()).collect();
+                Some(bytes)
+            }
+            Ok(vec) => {
+                log::warn!(
+                    "Entity '{}' embedding has {} dims (expected {}) — storing without embedding",
+                    name,
+                    vec.len(),
+                    crate::ai::embedding_dim()
+                );
+                None
+            }
+            Err(e) => {
+                log::warn!("Embedding generation failed for entity '{}': {}", name, e);
+                None
+            }
+        };
 
         let now = chrono::Utc::now().timestamp();
         let insert = InsertEntity {
diff --git a/src/ai/local_llm.rs b/src/ai/local_llm.rs
index 8344a87..bf3510c 100644
--- a/src/ai/local_llm.rs
+++ b/src/ai/local_llm.rs
@@ -43,14 +43,16 @@ impl LocalLlm {
         )
     }
 
-    /// Embed one string via the `LLM_BACKEND`-selected client.
-    pub async fn embed(&self, text: &str) -> Result<Vec<f32>> {
-        super::embed_one(&self.ollama, self.llamacpp.as_deref(), text).await
+    /// Embed query-side text by delegating to `super::embed_query`, which
+    /// applies `EMBED_QUERY_PREFIX`. Callers must pick query vs document:
+    /// retrieval models treat the two sides differently.
+    pub async fn embed_query(&self, text: &str) -> Result<Vec<f32>> {
+        super::embed_query(&self.ollama, self.llamacpp.as_deref(), text).await
     }
 
-    /// Embed a batch via the `LLM_BACKEND`-selected client.
-    pub async fn embed_batch(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>> {
-        super::embed_many(&self.ollama, self.llamacpp.as_deref(), texts).await
+    /// Embed corpus (stored-side) text; applies `EMBED_DOCUMENT_PREFIX`.
+    pub async fn embed_document(&self, text: &str) -> Result<Vec<f32>> {
+        super::embed_document(&self.ollama, self.llamacpp.as_deref(), text).await
     }
 
     /// Single-shot local text generation via the `LLM_BACKEND`-selected
diff --git a/src/ai/mod.rs b/src/ai/mod.rs
index 5125a96..d358f6c 100644
--- a/src/ai/mod.rs
+++ b/src/ai/mod.rs
@@ -75,6 +75,57 @@ pub fn local_backend_is_llamacpp() -> bool {
     )
 }
 
+/// Expected embedding dimensionality, read once from `EMBEDDING_DIM` and
+/// cached for the process lifetime (changing it requires a restart). Every
+/// store/query dim check reads this, so swapping to a different-dim model
+/// (e.g. Qwen3-Embedding-0.6B at 1024) is a config flip plus a
+/// `reembed_embeddings` run. Unset, unparsable, or zero values fall back to 768.
+pub fn embedding_dim() -> usize {
+    static DIM: std::sync::OnceLock<usize> = std::sync::OnceLock::new();
+    *DIM.get_or_init(|| {
+        std::env::var("EMBEDDING_DIM")
+            .ok()
+            .and_then(|v| v.trim().parse::<usize>().ok())
+            .filter(|&dim| dim > 0)
+            .unwrap_or(768)
+    })
+}
+
+/// Read the embedding prefix stored in env var `key`, or an empty string
+/// when it is unset or not valid Unicode. Each literal `\n` (backslash + n)
+/// is expanded to a real newline, since Qwen3-style query instructions
+/// need one ("Instruct: ...\nQuery: ").
+fn embed_prefix(key: &str) -> String {
+    let raw = std::env::var(key).unwrap_or_default();
+    raw.replace("\\n", "\n")
+}
+
+/// Embed query-side text, prepending `EMBED_QUERY_PREFIX` (empty when
+/// unset). Retrieval models tell queries from documents by prefix: nomic
+/// v1.5 wants `search_query: `, Qwen3-Embedding `Instruct: <task>\nQuery: `.
+/// The corpus must have been embedded with the paired document prefix.
+pub async fn embed_query(
+    ollama: &OllamaClient,
+    llamacpp: Option<&LlamaCppClient>,
+    text: &str,
+) -> anyhow::Result<Vec<f32>> {
+    let mut prefixed = embed_prefix("EMBED_QUERY_PREFIX");
+    prefixed.push_str(text);
+    embed_one(ollama, llamacpp, &prefixed).await
+}
+
+/// Embed stored-side (corpus) text, prepending `EMBED_DOCUMENT_PREFIX`
+/// (empty when unset; nomic v1.5 wants `search_document: `, Qwen3 none).
+pub async fn embed_document(
+    ollama: &OllamaClient,
+    llamacpp: Option<&LlamaCppClient>,
+    text: &str,
+) -> anyhow::Result<Vec<f32>> {
+    let mut prefixed = embed_prefix("EMBED_DOCUMENT_PREFIX");
+    prefixed.push_str(text);
+    embed_one(ollama, llamacpp, &prefixed).await
+}
+
 /// Embed a batch of strings via the configured local backend. Routes
 /// through llama-swap when `LLM_BACKEND=llamacpp` (and a client is
 /// configured), else Ollama. See [`local_backend_is_llamacpp`] for the
diff --git a/src/ai/ollama.rs b/src/ai/ollama.rs
index 75c8a02..518c7ec 100644
--- a/src/ai/ollama.rs
+++ b/src/ai/ollama.rs
@@ -1046,13 +1046,14 @@ Analyze the image and use specific details from both the visual content and the
             }
         };
 
-        // Validate embedding dimensions (should be 768 for nomic-embed-text:v1.5)
+        // Validate embedding dimensions (EMBEDDING_DIM; 768 for nomic-embed-text:v1.5)
         for (i, embedding) in embeddings.iter().enumerate() {
-            if embedding.len() != 768 {
+            if embedding.len() != crate::ai::embedding_dim() {
                 log::warn!(
-                    "Unexpected embedding dimensions for item {}: {} (expected 768)",
+                    "Unexpected embedding dimensions for item {}: {} (expected {})",
                     i,
-                    embedding.len()
+                    embedding.len(),
+                    crate::ai::embedding_dim()
                 );
             }
         }
diff --git a/src/bin/import_calendar.rs b/src/bin/import_calendar.rs
index 629a794..98b3f37 100644
--- a/src/bin/import_calendar.rs
+++ b/src/bin/import_calendar.rs
@@ -87,7 +87,8 @@ async fn main() -> Result<()> {
             );
 
             match tokio::task::block_in_place(|| {
-                tokio::runtime::Handle::current().block_on(async { llm.embed(&text).await })
+                tokio::runtime::Handle::current()
+                    .block_on(async { llm.embed_document(&text).await })
             }) {
                 Ok(emb) => Some(emb),
                 Err(e) => {
diff --git a/src/bin/import_search_history.rs b/src/bin/import_search_history.rs
index 7494392..93605cc 100644
--- a/src/bin/import_search_history.rs
+++ b/src/bin/import_search_history.rs
@@ -64,7 +64,7 @@ async fn main() -> Result<()> {
             async move {
                 let mut embeddings = Vec::new();
                 for query in &queries {
-                    match llm.embed(query).await {
+                    match llm.embed_document(query).await {
                         Ok(emb) => embeddings.push(Some(emb)),
                         Err(e) => {
                             pb_for_warn.println(format!("embedding failed for '{}': {}", query, e));
diff --git a/src/bin/reembed_embeddings.rs b/src/bin/reembed_embeddings.rs
index 26b2fde..a2fdd4c 100644
--- a/src/bin/reembed_embeddings.rs
+++ b/src/bin/reembed_embeddings.rs
@@ -141,7 +141,7 @@ async fn embed_with_truncation(llm: &LocalLlm, text: &str) -> Result<(Vec<f32>,
     let mut text = text.to_string();
     let mut truncated = false;
     loop {
-        match llm.embed(&text).await {
+        match llm.embed_document(&text).await {
             Ok(emb) => return Ok((emb, truncated)),
             Err(e)
                 if e.to_string().contains("too large to process") && text.chars().count() > 64 =>
@@ -194,14 +194,15 @@ async fn reembed_table(
             }
         };
 
-        // The whole pipeline (DAO checks, stored corpora) assumes 768 dims.
-        // A different dim means the active backend is not serving a
-        // nomic-compatible model — stop rather than corrupt the table.
+        // The whole pipeline (DAO checks, stored corpora) assumes
+        // EMBEDDING_DIM dims. A mismatch means the active embed slot is not
+        // serving the configured model — stop rather than corrupt the table.
         anyhow::ensure!(
-            new_emb.len() == 768,
-            "backend returned {}-dim embedding (expected 768) — '{}' is not \
-             serving a nomic-embed-text-v1.5-compatible model",
+            new_emb.len() == image_api::ai::embedding_dim(),
+            "backend returned {}-dim embedding (expected {}) — '{}' does not \
+             match the configured EMBEDDING_DIM",
             new_emb.len(),
+            image_api::ai::embedding_dim(),
             llm.embedding_model_version()
         );
 
diff --git a/src/database/calendar_dao.rs b/src/database/calendar_dao.rs
index 4ebd21c..f739d87 100644
--- a/src/database/calendar_dao.rs
+++ b/src/database/calendar_dao.rs
@@ -222,11 +222,12 @@ impl CalendarEventDao for SqliteCalendarEventDao {
 
             // Validate embedding dimensions if provided
             if let Some(ref emb) = event.embedding
-                && emb.len() != 768
+                && emb.len() != crate::ai::embedding_dim()
             {
                 return Err(anyhow::anyhow!(
-                    "Invalid embedding dimensions: {} (expected 768)",
-                    emb.len()
+                    "Invalid embedding dimensions: {} (expected {})",
+                    emb.len(),
+                    crate::ai::embedding_dim()
                 ));
             }
 
@@ -293,7 +294,7 @@ impl CalendarEventDao for SqliteCalendarEventDao {
                 for event in events {
                     // Validate embedding if provided
                     if let Some(ref emb) = event.embedding
-                        && emb.len() != 768
+                        && emb.len() != crate::ai::embedding_dim()
                     {
                         log::warn!(
                             "Skipping event with invalid embedding dimensions: {}",
@@ -385,10 +386,11 @@ impl CalendarEventDao for SqliteCalendarEventDao {
         trace_db_call(context, "query", "find_similar_events", |_span| {
             let mut conn = self.connection.lock().expect("Unable to get CalendarEventDao");
 
-            if query_embedding.len() != 768 {
+            if query_embedding.len() != crate::ai::embedding_dim() {
                 return Err(anyhow::anyhow!(
-                    "Invalid query embedding dimensions: {} (expected 768)",
-                    query_embedding.len()
+                    "Invalid query embedding dimensions: {} (expected {})",
+                    query_embedding.len(),
+                    crate::ai::embedding_dim()
                 ));
             }
 
@@ -461,10 +463,11 @@ impl CalendarEventDao for SqliteCalendarEventDao {
 
             // Step 2: If query embedding provided, rank by semantic similarity
             if let Some(query_emb) = query_embedding {
-                if query_emb.len() != 768 {
+                if query_emb.len() != crate::ai::embedding_dim() {
                     return Err(anyhow::anyhow!(
-                        "Invalid query embedding dimensions: {} (expected 768)",
-                        query_emb.len()
+                        "Invalid query embedding dimensions: {} (expected {})",
+                        query_emb.len(),
+                        crate::ai::embedding_dim()
                     ));
                 }
 
diff --git a/src/database/daily_summary_dao.rs b/src/database/daily_summary_dao.rs
index 521c1a5..af1d16f 100644
--- a/src/database/daily_summary_dao.rs
+++ b/src/database/daily_summary_dao.rs
@@ -150,10 +150,11 @@ impl DailySummaryDao for SqliteDailySummaryDao {
                 .expect("Unable to get DailySummaryDao");
 
             // Validate embedding dimensions
-            if summary.embedding.len() != 768 {
+            if summary.embedding.len() != crate::ai::embedding_dim() {
                 return Err(anyhow::anyhow!(
-                    "Invalid embedding dimensions: {} (expected 768)",
-                    summary.embedding.len()
+                    "Invalid embedding dimensions: {} (expected {})",
+                    summary.embedding.len(),
+                    crate::ai::embedding_dim()
                 ));
             }
 
@@ -202,10 +203,11 @@ impl DailySummaryDao for SqliteDailySummaryDao {
         trace_db_call(context, "query", "find_similar_summaries", |_span| {
             let mut conn = self.connection.lock().expect("Unable to get DailySummaryDao");
 
-            if query_embedding.len() != 768 {
+            if query_embedding.len() != crate::ai::embedding_dim() {
                 return Err(anyhow::anyhow!(
-                    "Invalid query embedding dimensions: {} (expected 768)",
-                    query_embedding.len()
+                    "Invalid query embedding dimensions: {} (expected {})",
+                    query_embedding.len(),
+                    crate::ai::embedding_dim()
                 ));
             }
 
@@ -299,10 +301,11 @@ impl DailySummaryDao for SqliteDailySummaryDao {
         trace_db_call(context, "query", "find_similar_summaries_with_time_weight", |_span| {
             let mut conn = self.connection.lock().expect("Unable to get DailySummaryDao");
 
-            if query_embedding.len() != 768 {
+            if query_embedding.len() != crate::ai::embedding_dim() {
                 return Err(anyhow::anyhow!(
-                    "Invalid query embedding dimensions: {} (expected 768)",
-                    query_embedding.len()
+                    "Invalid query embedding dimensions: {} (expected {})",
+                    query_embedding.len(),
+                    crate::ai::embedding_dim()
                 ));
             }
 
diff --git a/src/database/location_dao.rs b/src/database/location_dao.rs
index 8bb0ac4..9840279 100644
--- a/src/database/location_dao.rs
+++ b/src/database/location_dao.rs
@@ -216,11 +216,12 @@ impl LocationHistoryDao for SqliteLocationHistoryDao {
 
             // Validate embedding dimensions if provided (rare for location data)
             if let Some(ref emb) = location.embedding
-                && emb.len() != 768
+                && emb.len() != crate::ai::embedding_dim()
             {
                 return Err(anyhow::anyhow!(
-                    "Invalid embedding dimensions: {} (expected 768)",
-                    emb.len()
+                    "Invalid embedding dimensions: {} (expected {})",
+                    emb.len(),
+                    crate::ai::embedding_dim()
                 ));
             }
 
@@ -292,7 +293,7 @@ impl LocationHistoryDao for SqliteLocationHistoryDao {
                 for location in locations {
                     // Validate embedding if provided (rare)
                     if let Some(ref emb) = location.embedding
-                        && emb.len() != 768
+                        && emb.len() != crate::ai::embedding_dim()
                     {
                         log::warn!(
                             "Skipping location with invalid embedding dimensions: {}",
diff --git a/src/database/search_dao.rs b/src/database/search_dao.rs
index ee7d0ad..a73c9fb 100644
--- a/src/database/search_dao.rs
+++ b/src/database/search_dao.rs
@@ -189,10 +189,11 @@ impl SearchHistoryDao for SqliteSearchHistoryDao {
                 .expect("Unable to get SearchHistoryDao");
 
             // Validate embedding dimensions (REQUIRED for searches)
-            if search.embedding.len() != 768 {
+            if search.embedding.len() != crate::ai::embedding_dim() {
                 return Err(anyhow::anyhow!(
-                    "Invalid embedding dimensions: {} (expected 768)",
-                    search.embedding.len()
+                    "Invalid embedding dimensions: {} (expected {})",
+                    search.embedding.len(),
+                    crate::ai::embedding_dim()
                 ));
             }
 
@@ -245,7 +246,7 @@ impl SearchHistoryDao for SqliteSearchHistoryDao {
             conn.transaction::<_, anyhow::Error, _>(|conn| {
                 for search in searches {
                     // Validate embedding (REQUIRED)
-                    if search.embedding.len() != 768 {
+                    if search.embedding.len() != crate::ai::embedding_dim() {
                         log::warn!(
                             "Skipping search with invalid embedding dimensions: {}",
                             search.embedding.len()
@@ -325,10 +326,11 @@ impl SearchHistoryDao for SqliteSearchHistoryDao {
                 .lock()
                 .expect("Unable to get SearchHistoryDao");
 
-            if query_embedding.len() != 768 {
+            if query_embedding.len() != crate::ai::embedding_dim() {
                 return Err(anyhow::anyhow!(
-                    "Invalid query embedding dimensions: {} (expected 768)",
-                    query_embedding.len()
+                    "Invalid query embedding dimensions: {} (expected {})",
+                    query_embedding.len(),
+                    crate::ai::embedding_dim()
                 ));
             }
 
@@ -406,10 +408,11 @@ impl SearchHistoryDao for SqliteSearchHistoryDao {
 
             // Step 2: If query embedding provided, rank by semantic similarity
             if let Some(query_emb) = query_embedding {
-                if query_emb.len() != 768 {
+                if query_emb.len() != crate::ai::embedding_dim() {
                     return Err(anyhow::anyhow!(
-                        "Invalid query embedding dimensions: {} (expected 768)",
-                        query_emb.len()
+                        "Invalid query embedding dimensions: {} (expected {})",
+                        query_emb.len(),
+                        crate::ai::embedding_dim()
                     ));
                 }