ai: collapse llamacpp into LLM_BACKEND env switch

Reverts the per-request backend="llamacpp" value. Chat/vision/embedding backend is now a deploy-time decision (LLM_BACKEND=ollama|llamacpp), applied globally across chat, vision describe, and embeddings — so embedding vectors stay in one space across the index.

- Per-request backend whitelist is back to "local"|"hybrid". A request arriving with backend="llamacpp" is rejected.
- LLM_BACKEND=llamacpp swaps the entire local stack to llama-swap: chat hits the chat slot, describe hits the vision slot, embeddings hit the embed slot. Hybrid mode still routes chat to OpenRouter but uses LLM_BACKEND for the describe pass.
- Drops env vars HYBRID_VISION_BACKEND, LLAMA_SWAP_VISION_MODELS, EMBEDDING_BACKEND (the last never shipped). Drops the LlamaCppClient.vision_models allowlist — capability inference now reports has_vision only for the configured vision_model slot.
- Drops the /insights/llamacpp/models handler. /insights/models is the single endpoint; returns Ollama servers under LLM_BACKEND=ollama and llama-swap slots (from LLAMA_SWAP_ALLOWED_MODELS) under LLM_BACKEND=llamacpp. Same envelope shape either way.
- New ai::embed_one helper routes embeddings through llama-swap when LLM_BACKEND=llamacpp (else Ollama). Wires it into the four insight_generator embedding sites.
- Cross-replay matrix simplifies to pre-llamacpp shape (local↔local, hybrid↔hybrid, hybrid→local allowed; local→hybrid rejected).
2026-05-21 11:36:58 -04:00
parent d14df63f19
commit be51421b38
9 changed files with 338 additions and 301 deletions
@@ -358,10 +358,11 @@ fn parse_openrouter_allowed_models() -> Vec<String> {
 }

 /// Build a `LlamaCppClient` from environment variables. Returns `None` when
-/// `LLAMA_SWAP_URL` is unset (the llamacpp backend is then unavailable and
-/// requests for it return a clear error). The slot ids default to the
-/// names the bundled `llama-swap/config.yaml` uses — `chat` / `vision` /
-/// `embed` — so a minimal deploy only needs to set `LLAMA_SWAP_URL`.
+/// `LLAMA_SWAP_URL` is unset. The client is constructed unconditionally
+/// when the URL is set (so it's available even under `LLM_BACKEND=ollama`
+/// for ad-hoc tooling), but the agentic / chat paths only route through it
+/// when `LLM_BACKEND=llamacpp`. Slot ids default to the names the bundled
+/// `llama-swap/config.yaml` uses — `chat` / `vision` / `embed`.
 fn build_llamacpp_from_env() -> Option<Arc<LlamaCppClient>> {
    let base_url = env::var("LLAMA_SWAP_URL").ok()?;
    let primary_model = env::var("LLAMA_SWAP_PRIMARY_MODEL").ok();
@@ -372,12 +373,12 @@ fn build_llamacpp_from_env() -> Option<Arc<LlamaCppClient>> {
    if let Ok(model) = env::var("LLAMA_SWAP_VISION_MODEL") {
        client.set_vision_model(model);
    }
-    client.set_vision_models(parse_llamacpp_vision_models());
    Some(Arc::new(client))
 }

 /// Parse `LLAMA_SWAP_ALLOWED_MODELS` (comma-separated) into a vec. Used to
-/// drive `/insights/llamacpp/models`; empty when unset.
+/// populate the model picker when `LLM_BACKEND=llamacpp` — `/insights/models`
+/// surfaces these slots with capabilities. Empty when unset.
 fn parse_llamacpp_allowed_models() -> Vec<String> {
    env::var("LLAMA_SWAP_ALLOWED_MODELS")
        .unwrap_or_default()
@@ -387,20 +388,6 @@ fn parse_llamacpp_allowed_models() -> Vec<String> {
        .collect()
 }

-/// Parse `LLAMA_SWAP_VISION_MODELS` (comma-separated) — slot ids that report
-/// `has_vision = true` in capability lookups. The configured `vision_model`
-/// (default `vision`) is always considered vision-capable regardless of this
-/// list, so a deploy that only uses the default vision slot can leave it
-/// unset.
-fn parse_llamacpp_vision_models() -> Vec<String> {
-    env::var("LLAMA_SWAP_VISION_MODELS")
-        .unwrap_or_default()
-        .split(',')
-        .map(|s| s.trim().to_string())
-        .filter(|s| !s.is_empty())
-        .collect()
-}
-
 #[cfg(test)]
 impl AppState {
    /// Creates an AppState instance for testing with temporary directories