Fix RAG vector-space mismatch and search_rag retrieval quality

Queries embedded via llama-swap were searching corpora embedded via Ollama (measured: spaces diverged). Introduce LocalLlm — the local Ollama + llama-swap pair with LLM_BACKEND dispatch baked in — and route all embedding writers through it; anything embedding via a concrete client reintroduces the bug.

- search_rag: embed the model's query verbatim (no metadata boilerplate), make date optional — no time-decay when omitted, so "when did X happen?" queries rank purely by similarity across all time
- reembed_embeddings bin: re-embed summaries / calendar / search / knowledge entities via the active backend, with old-new cosine report per table and truncate-and-retry for inputs over the embed server's physical batch size
- import_calendar, import_search_history: embed through LocalLlm
- search_messages / get_sms_messages: render sender → recipient so sent messages are attributable to a conversation
- insight job failures: store the one-line anyhow context chain ({:#}) instead of the Debug dump the client was shown verbatim
- serialize env_dispatch tests behind a lock (parallel-runner flake)

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
2026-06-11 19:06:52 -04:00
parent 0accc4ef2f
commit a022a3d15d
8 changed files with 738 additions and 99 deletions
@@ -186,21 +186,7 @@ impl AppState {
 impl Default for AppState {
    fn default() -> Self {
        // Initialize AI clients
-        let ollama_primary_url = env::var("OLLAMA_PRIMARY_URL").unwrap_or_else(|_| {
-            env::var("OLLAMA_URL").unwrap_or_else(|_| "http://localhost:11434".to_string())
-        });
-        let ollama_fallback_url = env::var("OLLAMA_FALLBACK_URL").ok();
-        let ollama_primary_model = env::var("OLLAMA_PRIMARY_MODEL")
-            .or_else(|_| env::var("OLLAMA_MODEL"))
-            .unwrap_or_else(|_| "nemotron-3-nano:30b".to_string());
-        let ollama_fallback_model = env::var("OLLAMA_FALLBACK_MODEL").ok();
-
-        let ollama = OllamaClient::new(
-            ollama_primary_url,
-            ollama_fallback_url,
-            ollama_primary_model,
-            ollama_fallback_model,
-        );
+        let ollama = build_ollama_from_env();

        let openrouter = build_openrouter_from_env();
        let openrouter_allowed_models = parse_openrouter_allowed_models();
@@ -375,13 +361,29 @@ fn parse_openrouter_allowed_models() -> Vec<String> {
        .collect()
 }

+/// Builds the `OllamaClient` from the `OLLAMA_*` environment variables. This is the
+/// canonical wiring shared by the server (`AppState::default`) and the standalone
+/// binaries, which predate this helper and previously duplicated the same lookups.
+pub fn build_ollama_from_env() -> OllamaClient {
+    let primary_url = env::var("OLLAMA_PRIMARY_URL").unwrap_or_else(|_| {
+        env::var("OLLAMA_URL").unwrap_or_else(|_| "http://localhost:11434".to_string())
+    }); // precedence: OLLAMA_PRIMARY_URL, then OLLAMA_URL, then the localhost default
+    let fallback_url = env::var("OLLAMA_FALLBACK_URL").ok(); // `None` when the lookup fails
+    let primary_model = env::var("OLLAMA_PRIMARY_MODEL")
+        .or_else(|_| env::var("OLLAMA_MODEL"))
+        .unwrap_or_else(|_| "nemotron-3-nano:30b".to_string()); // same precedence as the URL
+    let fallback_model = env::var("OLLAMA_FALLBACK_MODEL").ok(); // `None` when the lookup fails
+
+    OllamaClient::new(primary_url, fallback_url, primary_model, fallback_model)
+}
+
 /// Build a `LlamaCppClient` from environment variables. Returns `None` when
 /// `LLAMA_SWAP_URL` is unset. The client is constructed unconditionally
 /// when the URL is set (so it's available even under `LLM_BACKEND=ollama`
 /// for ad-hoc tooling), but the agentic / chat paths only route through it
 /// when `LLM_BACKEND=llamacpp`. Slot ids default to the names the bundled
 /// `llama-swap/config.yaml` uses — `chat` / `vision` / `embed`.
-fn build_llamacpp_from_env() -> Option<Arc<LlamaCppClient>> {
+pub fn build_llamacpp_from_env() -> Option<Arc<LlamaCppClient>> {
    let base_url = env::var("LLAMA_SWAP_URL").ok()?;
    let primary_model = env::var("LLAMA_SWAP_PRIMARY_MODEL").ok();
    let mut client = LlamaCppClient::new(Some(base_url), primary_model);