ai: collapse llamacpp into LLM_BACKEND env switch
Reverts the per-request backend="llamacpp" value. Chat/vision/embedding backend is now a deploy-time decision (LLM_BACKEND=ollama|llamacpp), applied globally across chat, vision describe, and embeddings — so embedding vectors stay in one space across the index.

- Per-request backend whitelist is back to "local"|"hybrid". A request arriving with backend="llamacpp" is rejected.
- LLM_BACKEND=llamacpp swaps the entire local stack to llama-swap: chat hits the chat slot, describe hits the vision slot, embeddings hit the embed slot. Hybrid mode still routes chat to OpenRouter but uses LLM_BACKEND for the describe pass.
- Drops env vars HYBRID_VISION_BACKEND, LLAMA_SWAP_VISION_MODELS, EMBEDDING_BACKEND (the last never shipped). Drops the LlamaCppClient.vision_models allowlist — capability inference now reports has_vision only for the configured vision_model slot.
- Drops the /insights/llamacpp/models handler. /insights/models is the single endpoint; it returns Ollama servers under LLM_BACKEND=ollama and llama-swap slots (from LLAMA_SWAP_ALLOWED_MODELS) under LLM_BACKEND=llamacpp. Same envelope shape either way.
- New ai::embed_one helper routes embeddings through llama-swap when LLM_BACKEND=llamacpp (else Ollama). Wires it into the four insight_generator embedding sites.
- Cross-replay matrix simplifies to pre-llamacpp shape (local↔local, hybrid↔hybrid, hybrid→local allowed; local→hybrid rejected).
This commit is contained in:
+86
-2
@@ -21,14 +21,14 @@ pub use handlers::{
|
||||
chat_history_handler, chat_rewind_handler, chat_stream_handler, chat_turn_handler,
|
||||
delete_insight_handler, export_training_data_handler, generate_agentic_insight_handler,
|
||||
generate_insight_handler, get_all_insights_handler, get_available_models_handler,
|
||||
get_insight_handler, get_llamacpp_models_handler, get_openrouter_models_handler,
|
||||
rate_insight_handler,
|
||||
get_insight_handler, get_openrouter_models_handler, rate_insight_handler,
|
||||
};
|
||||
pub use insight_generator::InsightGenerator;
|
||||
#[allow(unused_imports)]
|
||||
pub use llm_client::{
|
||||
ChatMessage, LlmClient, ModelCapabilities, Tool, ToolCall, ToolCallFunction, ToolFunction,
|
||||
};
|
||||
pub use llamacpp::LlamaCppClient;
|
||||
pub use ollama::{EMBEDDING_MODEL, OllamaClient};
|
||||
pub use sms_client::{SmsApiClient, SmsMessage};
|
||||
|
||||
@@ -40,3 +40,87 @@ pub use sms_client::{SmsApiClient, SmsMessage};
|
||||
pub fn user_display_name() -> String {
    // Read the `USER_NAME` environment variable; when it cannot be read
    // (unset, or not valid Unicode) fall back to the literal "Me".
    match std::env::var("USER_NAME") {
        Ok(name) => name,
        Err(_) => String::from("Me"),
    }
}
|
||||
|
||||
/// Reports whether the `LLM_BACKEND` switch selects llama-swap for the
/// "local" LLM stack.
///
/// Returns `true` only when the `LLM_BACKEND` environment variable, with
/// surrounding whitespace trimmed and letter case ignored, equals
/// `llamacpp`. Every other outcome (variable unset, unreadable, or holding
/// any other value) returns `false`, which means Ollama, the default.
///
/// When the switch is on, chat, vision describe and embeddings all route
/// through llama-swap instead of Ollama. The switch is intentionally
/// global: embeddings must be drawn from a single source, otherwise
/// similarity search across the index breaks (mixed vector spaces, possibly
/// mixed dims). The per-request `backend=hybrid` override stays orthogonal:
/// it always sends chat to OpenRouter and uses `LLM_BACKEND` for the
/// describe-then-inline vision pass.
pub fn local_backend_is_llamacpp() -> bool {
    match std::env::var("LLM_BACKEND") {
        // ASCII case-folding is sufficient: the target literal is pure ASCII.
        Ok(raw) => raw.trim().eq_ignore_ascii_case("llamacpp"),
        Err(_) => false,
    }
}
|
||||
|
||||
/// Embed one string via the configured local backend. Routes through
|
||||
/// llama-swap when `LLM_BACKEND=llamacpp` (and a client is configured),
|
||||
/// else Ollama. Returns the single embedding vector. See
|
||||
/// [`local_backend_is_llamacpp`] for the rationale on consistency.
|
||||
pub async fn embed_one(
|
||||
ollama: &OllamaClient,
|
||||
llamacpp: Option<&LlamaCppClient>,
|
||||
text: &str,
|
||||
) -> anyhow::Result<Vec<f32>> {
|
||||
if local_backend_is_llamacpp() {
|
||||
if let Some(lc) = llamacpp {
|
||||
let mut vecs = <LlamaCppClient as LlmClient>::generate_embeddings(lc, &[text]).await?;
|
||||
return vecs
|
||||
.pop()
|
||||
.ok_or_else(|| anyhow::anyhow!("llama-swap returned no embeddings"));
|
||||
}
|
||||
log::warn!(
|
||||
"LLM_BACKEND=llamacpp but LlamaCppClient is unconfigured; falling back to Ollama embeddings"
|
||||
);
|
||||
}
|
||||
ollama.generate_embedding(text).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod env_dispatch_tests {
    use super::*;

    /// Serializes every `with_env` call. The test harness runs `#[test]`
    /// functions on parallel threads and the environment is process-wide, so
    /// without this lock one test could observe the value another test set.
    static ENV_LOCK: std::sync::Mutex<()> = std::sync::Mutex::new(());

    /// Puts an environment variable back to its captured state when dropped,
    /// so the restore also happens if the test closure panics.
    struct EnvRestore<'a> {
        key: &'a str,
        prev: Option<String>,
    }

    impl Drop for EnvRestore<'_> {
        fn drop(&mut self) {
            // SAFETY: only constructed inside `with_env` while `ENV_LOCK` is
            // held, so no other `with_env` caller touches the environment
            // concurrently. Code outside this module is not covered.
            match self.prev.take() {
                Some(v) => unsafe { std::env::set_var(self.key, v) },
                None => unsafe { std::env::remove_var(self.key) },
            }
        }
    }

    /// Runs `f` with the environment variable `key` set to `val` (or removed
    /// when `val` is `None`), then restores the previous state of `key`.
    fn with_env<F: FnOnce()>(key: &str, val: Option<&str>, f: F) {
        // A failed assertion in another test poisons the lock; the guarded
        // data is `()`, so recovering the guard is always fine.
        let _serial = ENV_LOCK
            .lock()
            .unwrap_or_else(|poisoned| poisoned.into_inner());
        // Declared after the lock guard so it is dropped first: the variable
        // is restored before the lock is released.
        let _restore = EnvRestore {
            key,
            prev: std::env::var(key).ok(),
        };
        // SAFETY: `ENV_LOCK` is held, see `EnvRestore::drop`.
        match val {
            Some(v) => unsafe { std::env::set_var(key, v) },
            None => unsafe { std::env::remove_var(key) },
        }
        f();
    }

    #[test]
    fn llm_backend_defaults_to_ollama() {
        with_env("LLM_BACKEND", None, || {
            assert!(!local_backend_is_llamacpp());
        });
    }

    #[test]
    fn llm_backend_llamacpp_case_insensitive() {
        with_env("LLM_BACKEND", Some("LlamaCpp"), || {
            assert!(local_backend_is_llamacpp());
        });
        with_env("LLM_BACKEND", Some(" llamacpp "), || {
            assert!(local_backend_is_llamacpp());
        });
    }

    #[test]
    fn llm_backend_unknown_value_is_ollama() {
        with_env("LLM_BACKEND", Some("vllm"), || {
            assert!(!local_backend_is_llamacpp());
        });
    }
}
|
||||
|
||||
Reference in New Issue
Block a user