Fix RAG vector-space mismatch and search_rag retrieval quality

Queries embedded via llama-swap were searching corpora embedded via Ollama (measured: the two vector spaces diverged). Introduce LocalLlm — the local Ollama + llama-swap pair with LLM_BACKEND dispatch baked in — and route all embedding writers through it; anything that embeds via a concrete client reintroduces the bug.

- search_rag: embed the model's query verbatim (no metadata boilerplate) and make date optional — no time-decay is applied when it is omitted, so "when did X happen?" queries rank purely by similarity across all time
- reembed_embeddings bin: re-embed summaries / calendar / search / knowledge entities via the active backend, with an old-vs-new cosine report per table and truncate-and-retry for inputs over the embed server's physical batch size
- import_calendar, import_search_history: embed through LocalLlm
- search_messages / get_sms_messages: render sender → recipient so sent messages are attributable to a conversation
- insight job failures: store the one-line anyhow context chain ({:#}) instead of the Debug dump that was previously shown to the client verbatim
- serialize env_dispatch tests behind a lock (parallel-runner flake)

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-11 19:06:52 -04:00
parent 0accc4ef2f
commit a022a3d15d
8 changed files with 738 additions and 99 deletions
@@ -1,7 +1,7 @@
 use anyhow::{Context, Result};
 use chrono::Utc;
 use clap::Parser;
-use image_api::ai::ollama::OllamaClient;
+use image_api::ai::LocalLlm;
 use image_api::bin_progress;
 use image_api::database::calendar_dao::{InsertCalendarEvent, SqliteCalendarEventDao};
 use image_api::parsers::ical_parser::parse_ics_file;
@@ -44,22 +44,10 @@ async fn main() -> Result<()> {

    let context = opentelemetry::Context::current();

-    let ollama = if args.generate_embeddings {
-        let primary_url = dotenv::var("OLLAMA_PRIMARY_URL")
-            .or_else(|_| dotenv::var("OLLAMA_URL"))
-            .unwrap_or_else(|_| "http://localhost:11434".to_string());
-        let fallback_url = dotenv::var("OLLAMA_FALLBACK_URL").ok();
-        let primary_model = dotenv::var("OLLAMA_PRIMARY_MODEL")
-            .or_else(|_| dotenv::var("OLLAMA_MODEL"))
-            .unwrap_or_else(|_| "nomic-embed-text:v1.5".to_string());
-        let fallback_model = dotenv::var("OLLAMA_FALLBACK_MODEL").ok();
-
-        Some(OllamaClient::new(
-            primary_url,
-            fallback_url,
-            primary_model,
-            fallback_model,
-        ))
+    // LocalLlm dispatches per LLM_BACKEND, so embeddings written here land
+    // in the same vector space the query side searches.
+    let llm = if args.generate_embeddings {
+        Some(LocalLlm::from_env())
    } else {
        None
    };
@@ -90,7 +78,7 @@ async fn main() -> Result<()> {
        }

        // Generate embedding if requested (blocking call)
-        let embedding = if let Some(ref ollama_client) = ollama {
+        let embedding = if let Some(ref llm) = llm {
            let text = format!(
                "{} {} {}",
                event.summary,
@@ -99,8 +87,7 @@ async fn main() -> Result<()> {
            );

            match tokio::task::block_in_place(|| {
-                tokio::runtime::Handle::current()
-                    .block_on(async { ollama_client.generate_embedding(&text).await })
+                tokio::runtime::Handle::current().block_on(async { llm.embed(&text).await })
            }) {
                Ok(emb) => Some(emb),
                Err(e) => {