ai: collapse llamacpp into LLM_BACKEND env switch

Reverts the per-request backend="llamacpp" value. Chat/vision/embedding
backend is now a deploy-time decision (LLM_BACKEND=ollama|llamacpp),
applied globally across chat, vision describe, and embeddings — so
embedding vectors stay in one space across the index.

- The per-request backend whitelist is back to "local"|"hybrid". A request
  arriving with backend="llamacpp" is rejected with an "unknown backend" error.
- LLM_BACKEND=llamacpp swaps the entire local stack to llama-swap:
  chat hits the chat slot, describe hits the vision slot, embeddings
  hit the embed slot. Hybrid mode still routes chat to OpenRouter
  but uses LLM_BACKEND for the describe pass.
- Drops env vars HYBRID_VISION_BACKEND, LLAMA_SWAP_VISION_MODELS,
  EMBEDDING_BACKEND (the last never shipped). Drops the
  LlamaCppClient.vision_models allowlist — capability inference now
  reports has_vision only for the configured vision_model slot.
- Drops the /insights/llamacpp/models handler. /insights/models is
  the single endpoint; returns Ollama servers under LLM_BACKEND=ollama
  and llama-swap slots (from LLAMA_SWAP_ALLOWED_MODELS) under
  LLM_BACKEND=llamacpp. Same envelope shape either way.
- New ai::embed_one helper routes embeddings through llama-swap when
  LLM_BACKEND=llamacpp (else Ollama). It is wired into the four
  insight_generator embedding sites: the RAG query, the location lookup,
  the search query, and the store_entity tool.
- Cross-replay matrix simplifies to pre-llamacpp shape (local↔local,
  hybrid↔hybrid, hybrid→local allowed; local→hybrid rejected).
This commit is contained in:
Cameron Cordes
2026-05-21 11:36:58 -04:00
parent d14df63f19
commit be51421b38
9 changed files with 338 additions and 301 deletions

View File

@@ -471,8 +471,11 @@ impl InsightGenerator {
log::info!("RAG QUERY: {}", query);
log::info!("========================================");
// Generate embedding for the query
let query_embedding = self.ollama.generate_embedding(&query).await?;
// Generate embedding for the query via the configured local backend
// (`LLM_BACKEND` switch). Must match the backend that populated the
// daily-summary embeddings or similarity search will be garbage.
let query_embedding =
crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &query).await?;
// Search for similar daily summaries with time-based weighting
// This prioritizes summaries temporally close to the query date
@@ -563,7 +566,7 @@ impl InsightGenerator {
let calendar_cx = parent_cx.with_span(span);
let query_embedding = if let Some(loc) = location {
match self.ollama.generate_embedding(loc).await {
match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), loc).await {
Ok(emb) => Some(emb),
Err(e) => {
log::warn!("Failed to generate embedding for location '{}': {}", loc, e);
@@ -734,16 +737,17 @@ impl InsightGenerator {
)
};
let query_embedding = match self.ollama.generate_embedding(&query_text).await {
Ok(emb) => emb,
Err(e) => {
log::warn!("Failed to generate search embedding: {}", e);
search_cx.span().set_status(Status::Error {
description: e.to_string().into(),
});
return Ok(None);
}
};
let query_embedding =
match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &query_text).await {
Ok(emb) => emb,
Err(e) => {
log::warn!("Failed to generate search embedding: {}", e);
search_cx.span().set_status(Status::Error {
description: e.to_string().into(),
});
return Ok(None);
}
};
let searches = {
let mut dao = self
@@ -2608,11 +2612,13 @@ Return ONLY the summary, nothing else."#,
}
}
/// Tool: store_entity — upsert an entity into the knowledge memory
/// Tool: store_entity — upsert an entity into the knowledge memory.
/// Embeddings go through the configured local backend (`LLM_BACKEND`),
/// independent of the per-request chat backend in the caller.
async fn tool_store_entity(
&self,
args: &serde_json::Value,
ollama: &OllamaClient,
_ollama: &OllamaClient,
cx: &opentelemetry::Context,
) -> String {
use crate::database::models::InsertEntity;
@@ -2672,9 +2678,16 @@ Return ONLY the summary, nothing else."#,
.collect()
};
// Generate embedding for name + description (best-effort)
// Generate embedding for name + description (best-effort) via the
// configured local backend.
let embed_text = format!("{} {}", name, description);
let embedding: Option<Vec<u8>> = match ollama.generate_embedding(&embed_text).await {
let embedding: Option<Vec<u8>> = match crate::ai::embed_one(
&self.ollama,
self.llamacpp.as_deref(),
&embed_text,
)
.await
{
Ok(vec) => {
let bytes: Vec<u8> = vec.iter().flat_map(|f| f.to_le_bytes()).collect();
Some(bytes)
@@ -3580,20 +3593,24 @@ Return ONLY the summary, nothing else."#,
.map(|s| s.trim().to_lowercase())
.filter(|s| !s.is_empty())
.unwrap_or_else(|| "local".to_string());
if !matches!(backend_label.as_str(), "local" | "hybrid" | "llamacpp") {
if !matches!(backend_label.as_str(), "local" | "hybrid") {
return Err(anyhow::anyhow!(
"unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
"unknown backend '{}'; expected 'local' or 'hybrid'",
backend_label
));
}
span.set_attribute(KeyValue::new("backend", backend_label.clone()));
let is_hybrid = backend_label == "hybrid";
let is_llamacpp = backend_label == "llamacpp";
// In hybrid + llamacpp modes the chat model never sees the image
// directly; we describe-then-inline locally before the agentic loop
// starts. Tracked as a single flag so vision/tool-gate logic doesn't
// have to branch twice.
let describes_then_inlines = is_hybrid || is_llamacpp;
// `LLM_BACKEND=llamacpp` swaps Ollama out for llama-swap as the
// "local" stack — chat + vision describe + embeddings all route
// through llama-swap. In hybrid mode this still applies to vision
// describe (chat continues to go to OpenRouter). The chat slot is
// text-only in either case, so we describe-then-inline.
let local_via_llamacpp =
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
// Describe-then-inline: hybrid (chat is OpenRouter, text-only) or
// any path where chat goes through llama-swap (chat slot is text-only).
let describes_then_inlines = is_hybrid || local_via_llamacpp;
// 1b. Always build an Ollama client. In local mode it owns the chat
// loop; in hybrid/llamacpp mode it still handles tool-local calls
@@ -3688,13 +3705,14 @@ Return ONLY the summary, nothing else."#,
None
};
// 1d. In llamacpp mode, clone the configured LlamaCpp client and
// apply per-request overrides. Same shape as the openrouter
// branch above; describe_image will route through the vision
// slot configured on the client.
let llamacpp_client: Option<LlamaCppClient> = if is_llamacpp {
// 1d. When `LLM_BACKEND=llamacpp` and we're in local mode (not
// hybrid — hybrid keeps chat on OpenRouter), clone the llamacpp
// client and apply per-request overrides. Same shape as the
// openrouter branch above; describe_image will route through
// the vision slot configured on the client.
let llamacpp_client: Option<LlamaCppClient> = if local_via_llamacpp && !is_hybrid {
let arc = self.llamacpp.as_ref().ok_or_else(|| {
anyhow::anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
anyhow::anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured")
})?;
let mut c: LlamaCppClient = (**arc).clone();
if let Some(ref m) = custom_model {
@@ -3917,38 +3935,19 @@ Return ONLY the summary, nothing else."#,
None
};
// describe-then-inline path. In hybrid mode the vision backend
// defaults to Ollama but can be flipped to llamacpp via
// `HYBRID_VISION_BACKEND=llamacpp` (so chat goes to OpenRouter while
// vision/audio routes through llama-swap). In llamacpp mode we always
// use the llamacpp client's configured vision slot.
// describe-then-inline path. Vision describe routes through whichever
// `LLM_BACKEND` is configured — llama-swap when `local_via_llamacpp`
// is set (even in hybrid mode, since chat is OpenRouter but vision
// stays on the local stack), otherwise Ollama.
let inlined_visual_description: Option<String> = if describes_then_inlines {
match image_base64.as_deref() {
Some(b64) => {
let use_llamacpp_vision = if is_llamacpp {
true
} else {
// is_hybrid branch — consult env switch
matches!(
std::env::var("HYBRID_VISION_BACKEND")
.ok()
.as_deref()
.map(|s| s.trim().to_lowercase())
.as_deref(),
Some("llamacpp")
)
};
let described = if use_llamacpp_vision {
match self.llamacpp.as_ref() {
Some(c) => c.describe_image(b64).await,
None => {
log::warn!(
"describe-then-inline: requested llamacpp vision but LLAMA_SWAP_URL is unset, falling back to Ollama"
);
self.ollama.describe_image(b64).await
}
}
let described = if local_via_llamacpp {
self.llamacpp
.as_ref()
.expect("local_via_llamacpp guarantees Some")
.describe_image(b64)
.await
} else {
self.ollama.describe_image(b64).await
};
@@ -4044,10 +4043,10 @@ Return ONLY the summary, nothing else."#,
);
// 10. Define tools. Gate flags computed from current data presence;
// describe-then-inline modes (hybrid, llamacpp) omit describe_photo
// since the chat model receives the visual description inline (so
// we pass `false` for has_vision in those modes regardless of the
// model's actual capability).
// describe-then-inline modes (hybrid OR local_via_llamacpp) omit
// describe_photo since the chat model receives the visual
// description inline (so we pass `false` for has_vision in
// those modes regardless of the model's actual capability).
let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines);
let tools = Self::build_tool_definitions(gate_opts);