Fix RAG vector-space mismatch and search_rag retrieval quality
Queries embedded via llama-swap were searching corpora embedded via
Ollama (measured: the two embedding spaces diverged). Introduce LocalLlm —
the local Ollama + llama-swap pair with LLM_BACKEND dispatch baked in —
and route all embedding writers through it; any writer that embeds via a
concrete client reintroduces the bug.
- search_rag: embed the model's query verbatim (no metadata boilerplate),
make date optional — no time-decay when omitted, so "when did X
happen?" queries rank purely by similarity across all time
- reembed_embeddings bin: re-embed summaries / calendar / search /
knowledge entities via the active backend, with an old-vs-new cosine
similarity report per table, and truncate-and-retry for inputs that
exceed the embed server's physical batch size
- import_calendar, import_search_history: embed through LocalLlm
- search_messages / get_sms_messages: render sender → recipient so sent
messages are attributable to a conversation
- insight job failures: store the one-line anyhow context chain ({:#})
instead of the Debug dump that was previously shown to the client verbatim
- serialize env_dispatch tests behind a lock (parallel-runner flake)
Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::Utc;
|
||||
use clap::Parser;
|
||||
use image_api::ai::ollama::OllamaClient;
|
||||
use image_api::ai::LocalLlm;
|
||||
use image_api::bin_progress;
|
||||
use image_api::database::calendar_dao::{InsertCalendarEvent, SqliteCalendarEventDao};
|
||||
use image_api::parsers::ical_parser::parse_ics_file;
|
||||
@@ -44,22 +44,10 @@ async fn main() -> Result<()> {
|
||||
|
||||
let context = opentelemetry::Context::current();
|
||||
|
||||
let ollama = if args.generate_embeddings {
|
||||
let primary_url = dotenv::var("OLLAMA_PRIMARY_URL")
|
||||
.or_else(|_| dotenv::var("OLLAMA_URL"))
|
||||
.unwrap_or_else(|_| "http://localhost:11434".to_string());
|
||||
let fallback_url = dotenv::var("OLLAMA_FALLBACK_URL").ok();
|
||||
let primary_model = dotenv::var("OLLAMA_PRIMARY_MODEL")
|
||||
.or_else(|_| dotenv::var("OLLAMA_MODEL"))
|
||||
.unwrap_or_else(|_| "nomic-embed-text:v1.5".to_string());
|
||||
let fallback_model = dotenv::var("OLLAMA_FALLBACK_MODEL").ok();
|
||||
|
||||
Some(OllamaClient::new(
|
||||
primary_url,
|
||||
fallback_url,
|
||||
primary_model,
|
||||
fallback_model,
|
||||
))
|
||||
// LocalLlm dispatches per LLM_BACKEND, so embeddings written here land
|
||||
// in the same vector space the query side searches.
|
||||
let llm = if args.generate_embeddings {
|
||||
Some(LocalLlm::from_env())
|
||||
} else {
|
||||
None
|
||||
};
|
||||
@@ -90,7 +78,7 @@ async fn main() -> Result<()> {
|
||||
}
|
||||
|
||||
// Generate embedding if requested (blocking call)
|
||||
let embedding = if let Some(ref ollama_client) = ollama {
|
||||
let embedding = if let Some(ref llm) = llm {
|
||||
let text = format!(
|
||||
"{} {} {}",
|
||||
event.summary,
|
||||
@@ -99,8 +87,7 @@ async fn main() -> Result<()> {
|
||||
);
|
||||
|
||||
match tokio::task::block_in_place(|| {
|
||||
tokio::runtime::Handle::current()
|
||||
.block_on(async { ollama_client.generate_embedding(&text).await })
|
||||
tokio::runtime::Handle::current().block_on(async { llm.embed(&text).await })
|
||||
}) {
|
||||
Ok(emb) => Some(emb),
|
||||
Err(e) => {
|
||||
|
||||
Reference in New Issue
Block a user