Fix RAG vector-space mismatch and search_rag retrieval quality

Queries embedded via llama-swap were searching corpora embedded via Ollama (measured: spaces diverged). Introduce LocalLlm — the local Ollama + llama-swap pair with LLM_BACKEND dispatch baked in — and route all embedding writers through it; anything embedding via a concrete client reintroduces the bug.

- search_rag: embed the model's query verbatim (no metadata boilerplate), make date optional — no time-decay when omitted, so "when did X happen?" queries rank purely by similarity across all time
- reembed_embeddings bin: re-embed summaries / calendar / search / knowledge entities via the active backend, with old-new cosine report per table and truncate-and-retry for inputs over the embed server's physical batch size
- import_calendar, import_search_history: embed through LocalLlm
- search_messages / get_sms_messages: render sender → recipient so sent messages are attributable to a conversation
- insight job failures: store the one-line anyhow context chain ({:#}) instead of the Debug dump the client was shown verbatim
- serialize env_dispatch tests behind a lock (parallel-runner flake)

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-11 19:06:52 -04:00
parent 0accc4ef2f
commit a022a3d15d
8 changed files with 738 additions and 99 deletions
@@ -0,0 +1,86 @@
+//! Bundle of the local LLM pair (Ollama + optional llama-swap) with the
+//! `LLM_BACKEND` dispatch baked in.
+//!
+//! Exists because passing the pair around as loose values invited the same
+//! bug three times: import/backfill tooling embedded corpora via
+//! `OllamaClient` directly while the query side dispatched through
+//! `embed_one`, so flipping `LLM_BACKEND=llamacpp` silently split queries
+//! and corpus into different vector spaces. Anything that writes or reads
+//! embeddings should go through this type (or `embed_one`/`embed_many`),
+//! never a concrete client.
+//!
+//! Deliberately knows nothing about chat policy — hybrid/OpenRouter routing
+//! is request-scoped and stays in `ResolvedBackend`. This is only the
+//! local stack: embeddings and offline single-shot generation.
+
+// Constructed by binaries, not the server — dead code from main.rs's view.
+#![allow(dead_code)]
+
+use std::sync::Arc;
+
+use anyhow::Result;
+
+use super::llamacpp::LlamaCppClient;
+use super::llm_client::LlmClient;
+use super::ollama::{EMBEDDING_MODEL, OllamaClient};
+
+/// The local LLM pair: an Ollama client plus an optional llama-swap client.
+///
+/// The methods on this type choose between the two clients themselves, so
+/// callers hold one value instead of passing the pair around loose.
+/// Cloning shares the llama-swap client, which sits behind an `Arc`.
+#[derive(Clone)]
+pub struct LocalLlm {
+    /// Ollama client; always present.
+    ollama: OllamaClient,
+    /// llama-swap client; `None` when it is not configured.
+    llamacpp: Option<Arc<LlamaCppClient>>,
+}
+
+impl LocalLlm {
+    /// Bundle an already-built Ollama client with an optional llama-swap
+    /// client.
+    pub fn new(ollama: OllamaClient, llamacpp: Option<Arc<LlamaCppClient>>) -> Self {
+        LocalLlm { ollama, llamacpp }
+    }
+
+    /// Build the pair from the environment using the `crate::state`
+    /// constructors (the canonical wiring shared with `AppState`).
+    pub fn from_env() -> Self {
+        let ollama = crate::state::build_ollama_from_env();
+        let llamacpp = crate::state::build_llamacpp_from_env();
+        Self::new(ollama, llamacpp)
+    }
+
+    /// Embed a single string; `embed_one` picks the client according to
+    /// `LLM_BACKEND`.
+    pub async fn embed(&self, text: &str) -> Result<Vec<f32>> {
+        let llamacpp = self.llamacpp.as_deref();
+        super::embed_one(&self.ollama, llamacpp, text).await
+    }
+
+    /// Embed several strings in one call; `embed_many` picks the client
+    /// according to `LLM_BACKEND`.
+    pub async fn embed_batch(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>> {
+        let llamacpp = self.llamacpp.as_deref();
+        super::embed_many(&self.ollama, llamacpp, texts).await
+    }
+
+    /// One-off local text generation for offline tooling (chat turns belong
+    /// to `ResolvedBackend`).
+    ///
+    /// # Errors
+    ///
+    /// Fails when the llama.cpp backend is selected but no `LlamaCppClient`
+    /// was supplied; otherwise returns whatever the chosen client returns.
+    pub async fn generate(&self, prompt: &str, system: Option<&str>) -> Result<String> {
+        // Ollama is the path whenever the llama.cpp backend is not selected.
+        if !super::local_backend_is_llamacpp() {
+            return self.ollama.generate(prompt, system).await;
+        }
+
+        // llama.cpp was requested: refuse rather than silently fall back.
+        let Some(client) = self.llamacpp.as_deref() else {
+            anyhow::bail!(
+                "LLM_BACKEND=llamacpp but LlamaCppClient is unconfigured — \
+                 set LLAMA_SWAP_URL or switch to LLM_BACKEND=ollama"
+            );
+        };
+        <LlamaCppClient as LlmClient>::generate(client, prompt, system, None).await
+    }
+
+    /// Name the backend + model that currently produces embeddings.
+    ///
+    /// Returns `EMBEDDING_MODEL` for Ollama, or `llama-swap:<slot>` for
+    /// llama.cpp, where `<slot>` is the client's `embedding_model` (or
+    /// `embed` when no client is present). Persist it next to the vectors
+    /// (`model_version` columns) so a backend flip shows up in the data
+    /// itself, not only in env history.
+    pub fn embedding_model_version(&self) -> String {
+        if !super::local_backend_is_llamacpp() {
+            return EMBEDDING_MODEL.to_string();
+        }
+
+        let slot = match self.llamacpp.as_deref() {
+            Some(client) => client.embedding_model.as_str(),
+            None => "embed",
+        };
+        format!("llama-swap:{}", slot)
+    }
+}