ai: collapse llamacpp into LLM_BACKEND env switch
Reverts the per-request backend="llamacpp" value. Chat/vision/embedding backend is now a deploy-time decision (LLM_BACKEND=ollama|llamacpp), applied globally across chat, vision describe, and embeddings — so embedding vectors stay in one space across the index.

- Per-request backend whitelist is back to "local"|"hybrid". A request arriving with backend="llamacpp" is rejected.
- LLM_BACKEND=llamacpp swaps the entire local stack to llama-swap: chat hits the chat slot, describe hits the vision slot, embeddings hit the embed slot. Hybrid mode still routes chat to OpenRouter but uses LLM_BACKEND for the describe pass.
- Drops env vars HYBRID_VISION_BACKEND, LLAMA_SWAP_VISION_MODELS, EMBEDDING_BACKEND (the last never shipped). Drops the LlamaCppClient.vision_models allowlist — capability inference now reports has_vision only for the configured vision_model slot.
- Drops the /insights/llamacpp/models handler. /insights/models is the single endpoint; it returns Ollama servers under LLM_BACKEND=ollama and llama-swap slots (from LLAMA_SWAP_ALLOWED_MODELS) under LLM_BACKEND=llamacpp. Same envelope shape either way.
- New ai::embed_one helper routes embeddings through llama-swap when LLM_BACKEND=llamacpp (else Ollama). Wires it into the four insight_generator embedding sites.
- Cross-replay matrix simplifies to the pre-llamacpp shape (local↔local, hybrid↔hybrid, hybrid→local allowed; local→hybrid rejected).
This commit is contained in:
@@ -471,8 +471,11 @@ impl InsightGenerator {
|
||||
log::info!("RAG QUERY: {}", query);
|
||||
log::info!("========================================");
|
||||
|
||||
// Generate embedding for the query
|
||||
let query_embedding = self.ollama.generate_embedding(&query).await?;
|
||||
// Generate embedding for the query via the configured local backend
|
||||
// (`LLM_BACKEND` switch). Must match the backend that populated the
|
||||
// daily-summary embeddings or similarity search will be garbage.
|
||||
let query_embedding =
|
||||
crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &query).await?;
|
||||
|
||||
// Search for similar daily summaries with time-based weighting
|
||||
// This prioritizes summaries temporally close to the query date
|
||||
@@ -563,7 +566,7 @@ impl InsightGenerator {
|
||||
let calendar_cx = parent_cx.with_span(span);
|
||||
|
||||
let query_embedding = if let Some(loc) = location {
|
||||
match self.ollama.generate_embedding(loc).await {
|
||||
match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), loc).await {
|
||||
Ok(emb) => Some(emb),
|
||||
Err(e) => {
|
||||
log::warn!("Failed to generate embedding for location '{}': {}", loc, e);
|
||||
@@ -734,16 +737,17 @@ impl InsightGenerator {
|
||||
)
|
||||
};
|
||||
|
||||
let query_embedding = match self.ollama.generate_embedding(&query_text).await {
|
||||
Ok(emb) => emb,
|
||||
Err(e) => {
|
||||
log::warn!("Failed to generate search embedding: {}", e);
|
||||
search_cx.span().set_status(Status::Error {
|
||||
description: e.to_string().into(),
|
||||
});
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
let query_embedding =
|
||||
match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &query_text).await {
|
||||
Ok(emb) => emb,
|
||||
Err(e) => {
|
||||
log::warn!("Failed to generate search embedding: {}", e);
|
||||
search_cx.span().set_status(Status::Error {
|
||||
description: e.to_string().into(),
|
||||
});
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
|
||||
let searches = {
|
||||
let mut dao = self
|
||||
@@ -2608,11 +2612,13 @@ Return ONLY the summary, nothing else."#,
|
||||
}
|
||||
}
|
||||
|
||||
/// Tool: store_entity — upsert an entity into the knowledge memory
|
||||
/// Tool: store_entity — upsert an entity into the knowledge memory.
|
||||
/// Embeddings go through the configured local backend (`LLM_BACKEND`),
|
||||
/// independent of the per-request chat backend in the caller.
|
||||
async fn tool_store_entity(
|
||||
&self,
|
||||
args: &serde_json::Value,
|
||||
ollama: &OllamaClient,
|
||||
_ollama: &OllamaClient,
|
||||
cx: &opentelemetry::Context,
|
||||
) -> String {
|
||||
use crate::database::models::InsertEntity;
|
||||
@@ -2672,9 +2678,16 @@ Return ONLY the summary, nothing else."#,
|
||||
.collect()
|
||||
};
|
||||
|
||||
// Generate embedding for name + description (best-effort)
|
||||
// Generate embedding for name + description (best-effort) via the
|
||||
// configured local backend.
|
||||
let embed_text = format!("{} {}", name, description);
|
||||
let embedding: Option<Vec<u8>> = match ollama.generate_embedding(&embed_text).await {
|
||||
let embedding: Option<Vec<u8>> = match crate::ai::embed_one(
|
||||
&self.ollama,
|
||||
self.llamacpp.as_deref(),
|
||||
&embed_text,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(vec) => {
|
||||
let bytes: Vec<u8> = vec.iter().flat_map(|f| f.to_le_bytes()).collect();
|
||||
Some(bytes)
|
||||
@@ -3580,20 +3593,24 @@ Return ONLY the summary, nothing else."#,
|
||||
.map(|s| s.trim().to_lowercase())
|
||||
.filter(|s| !s.is_empty())
|
||||
.unwrap_or_else(|| "local".to_string());
|
||||
if !matches!(backend_label.as_str(), "local" | "hybrid" | "llamacpp") {
|
||||
if !matches!(backend_label.as_str(), "local" | "hybrid") {
|
||||
return Err(anyhow::anyhow!(
|
||||
"unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
|
||||
"unknown backend '{}'; expected 'local' or 'hybrid'",
|
||||
backend_label
|
||||
));
|
||||
}
|
||||
span.set_attribute(KeyValue::new("backend", backend_label.clone()));
|
||||
let is_hybrid = backend_label == "hybrid";
|
||||
let is_llamacpp = backend_label == "llamacpp";
|
||||
// In hybrid + llamacpp modes the chat model never sees the image
|
||||
// directly; we describe-then-inline locally before the agentic loop
|
||||
// starts. Tracked as a single flag so vision/tool-gate logic doesn't
|
||||
// have to branch twice.
|
||||
let describes_then_inlines = is_hybrid || is_llamacpp;
|
||||
// `LLM_BACKEND=llamacpp` swaps Ollama out for llama-swap as the
|
||||
// "local" stack — chat + vision describe + embeddings all route
|
||||
// through llama-swap. In hybrid mode this still applies to vision
|
||||
// describe (chat continues to go to OpenRouter). The chat slot is
|
||||
// text-only in either case, so we describe-then-inline.
|
||||
let local_via_llamacpp =
|
||||
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
|
||||
// Describe-then-inline: hybrid (chat is OpenRouter, text-only) or
|
||||
// any path where chat goes through llama-swap (chat slot is text-only).
|
||||
let describes_then_inlines = is_hybrid || local_via_llamacpp;
|
||||
|
||||
// 1b. Always build an Ollama client. In local mode it owns the chat
|
||||
// loop; in hybrid/llamacpp mode it still handles tool-local calls
|
||||
@@ -3688,13 +3705,14 @@ Return ONLY the summary, nothing else."#,
|
||||
None
|
||||
};
|
||||
|
||||
// 1d. In llamacpp mode, clone the configured LlamaCpp client and
|
||||
// apply per-request overrides. Same shape as the openrouter
|
||||
// branch above; describe_image will route through the vision
|
||||
// slot configured on the client.
|
||||
let llamacpp_client: Option<LlamaCppClient> = if is_llamacpp {
|
||||
// 1d. When `LLM_BACKEND=llamacpp` and we're in local mode (not
|
||||
// hybrid — hybrid keeps chat on OpenRouter), clone the llamacpp
|
||||
// client and apply per-request overrides. Same shape as the
|
||||
// openrouter branch above; describe_image will route through
|
||||
// the vision slot configured on the client.
|
||||
let llamacpp_client: Option<LlamaCppClient> = if local_via_llamacpp && !is_hybrid {
|
||||
let arc = self.llamacpp.as_ref().ok_or_else(|| {
|
||||
anyhow::anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
|
||||
anyhow::anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured")
|
||||
})?;
|
||||
let mut c: LlamaCppClient = (**arc).clone();
|
||||
if let Some(ref m) = custom_model {
|
||||
@@ -3917,38 +3935,19 @@ Return ONLY the summary, nothing else."#,
|
||||
None
|
||||
};
|
||||
|
||||
// describe-then-inline path. In hybrid mode the vision backend
|
||||
// defaults to Ollama but can be flipped to llamacpp via
|
||||
// `HYBRID_VISION_BACKEND=llamacpp` (so chat goes to OpenRouter while
|
||||
// vision/audio routes through llama-swap). In llamacpp mode we always
|
||||
// use the llamacpp client's configured vision slot.
|
||||
// describe-then-inline path. Vision describe routes through whichever
|
||||
// `LLM_BACKEND` is configured — llama-swap when `local_via_llamacpp`
|
||||
// is set (even in hybrid mode, since chat is OpenRouter but vision
|
||||
// stays on the local stack), otherwise Ollama.
|
||||
let inlined_visual_description: Option<String> = if describes_then_inlines {
|
||||
match image_base64.as_deref() {
|
||||
Some(b64) => {
|
||||
let use_llamacpp_vision = if is_llamacpp {
|
||||
true
|
||||
} else {
|
||||
// is_hybrid branch — consult env switch
|
||||
matches!(
|
||||
std::env::var("HYBRID_VISION_BACKEND")
|
||||
.ok()
|
||||
.as_deref()
|
||||
.map(|s| s.trim().to_lowercase())
|
||||
.as_deref(),
|
||||
Some("llamacpp")
|
||||
)
|
||||
};
|
||||
|
||||
let described = if use_llamacpp_vision {
|
||||
match self.llamacpp.as_ref() {
|
||||
Some(c) => c.describe_image(b64).await,
|
||||
None => {
|
||||
log::warn!(
|
||||
"describe-then-inline: requested llamacpp vision but LLAMA_SWAP_URL is unset, falling back to Ollama"
|
||||
);
|
||||
self.ollama.describe_image(b64).await
|
||||
}
|
||||
}
|
||||
let described = if local_via_llamacpp {
|
||||
self.llamacpp
|
||||
.as_ref()
|
||||
.expect("local_via_llamacpp guarantees Some")
|
||||
.describe_image(b64)
|
||||
.await
|
||||
} else {
|
||||
self.ollama.describe_image(b64).await
|
||||
};
|
||||
@@ -4044,10 +4043,10 @@ Return ONLY the summary, nothing else."#,
|
||||
);
|
||||
|
||||
// 10. Define tools. Gate flags computed from current data presence;
|
||||
// describe-then-inline modes (hybrid, llamacpp) omit describe_photo
|
||||
// since the chat model receives the visual description inline (so
|
||||
// we pass `false` for has_vision in those modes regardless of the
|
||||
// model's actual capability).
|
||||
// describe-then-inline modes (hybrid OR local_via_llamacpp) omit
|
||||
// describe_photo since the chat model receives the visual
|
||||
// description inline (so we pass `false` for has_vision in
|
||||
// those modes regardless of the model's actual capability).
|
||||
let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines);
|
||||
let tools = Self::build_tool_definitions(gate_opts);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user