ai: collapse llamacpp into LLM_BACKEND env switch
Reverts the per-request backend="llamacpp" value. Chat/vision/embedding backend is now a deploy-time decision (LLM_BACKEND=ollama|llamacpp), applied globally across chat, vision describe, and embeddings — so embedding vectors stay in one space across the index.

- Per-request backend whitelist is back to "local"|"hybrid". A request arriving with backend="llamacpp" is rejected.
- LLM_BACKEND=llamacpp swaps the entire local stack to llama-swap: chat hits the chat slot, describe hits the vision slot, embeddings hit the embed slot. Hybrid mode still routes chat to OpenRouter but uses LLM_BACKEND for the describe pass.
- Drops env vars HYBRID_VISION_BACKEND, LLAMA_SWAP_VISION_MODELS, EMBEDDING_BACKEND (the last never shipped). Drops the LlamaCppClient.vision_models allowlist — capability inference now reports has_vision only for the configured vision_model slot.
- Drops the /insights/llamacpp/models handler. /insights/models is the single endpoint; it returns Ollama servers under LLM_BACKEND=ollama and llama-swap slots (from LLAMA_SWAP_ALLOWED_MODELS) under LLM_BACKEND=llamacpp. Same envelope shape either way.
- New ai::embed_one helper routes embeddings through llama-swap when LLM_BACKEND=llamacpp (else Ollama). Wires it into the four insight_generator embedding sites.
- Cross-replay matrix simplifies to the pre-llamacpp shape (local↔local, hybrid↔hybrid, hybrid→local allowed; local→hybrid rejected).
This commit is contained in:
@@ -470,7 +470,16 @@ pub async fn generate_agentic_insight_handler(
|
||||
}
|
||||
}
|
||||
|
||||
/// GET /insights/models - List available models from both servers with capabilities
|
||||
/// GET /insights/models - Local-backend models with capabilities. Returns
|
||||
/// Ollama servers when `LLM_BACKEND=ollama` (default), or llama-swap slots
|
||||
/// when `LLM_BACKEND=llamacpp`. Same envelope shape either way so the
|
||||
/// client picker doesn't have to branch on backend kind.
|
||||
///
|
||||
/// For llama-swap: `models` comes verbatim from `LLAMA_SWAP_ALLOWED_MODELS`
|
||||
/// (no live `/v1/models` probe), `has_vision` is true only for the
|
||||
/// configured `LLAMA_SWAP_VISION_MODEL` slot id, and `has_tool_calling` is
|
||||
/// reported as true for every slot (llama-server is launched with `--jinja`
|
||||
/// by convention — a misconfigured slot surfaces as a chat-call error).
|
||||
#[get("/insights/models")]
|
||||
pub async fn get_available_models_handler(
|
||||
_claims: Claims,
|
||||
@@ -478,6 +487,29 @@ pub async fn get_available_models_handler(
|
||||
) -> impl Responder {
|
||||
log::debug!("Fetching available models with capabilities");
|
||||
|
||||
if crate::ai::local_backend_is_llamacpp()
|
||||
&& let Some(lc) = app_state.llamacpp.as_ref()
|
||||
{
|
||||
let models: Vec<ModelCapabilities> = app_state
|
||||
.llamacpp_allowed_models
|
||||
.iter()
|
||||
.map(|name| ModelCapabilities {
|
||||
name: name.clone(),
|
||||
has_vision: name == &lc.vision_model,
|
||||
has_tool_calling: true,
|
||||
})
|
||||
.collect();
|
||||
let primary = ServerModels {
|
||||
url: lc.base_url.clone(),
|
||||
models,
|
||||
default_model: lc.primary_model.clone(),
|
||||
};
|
||||
return HttpResponse::Ok().json(AvailableModelsResponse {
|
||||
primary,
|
||||
fallback: None,
|
||||
});
|
||||
}
|
||||
|
||||
let ollama_client = &app_state.ollama;
|
||||
|
||||
// Fetch models with capabilities from primary server
|
||||
@@ -549,36 +581,6 @@ pub async fn get_openrouter_models_handler(
|
||||
HttpResponse::Ok().json(response)
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
pub struct LlamaCppModelsResponse {
|
||||
pub models: Vec<String>,
|
||||
pub default_model: Option<String>,
|
||||
pub configured: bool,
|
||||
}
|
||||
|
||||
/// GET /insights/llamacpp/models - Curated llama-swap model ids exposed
|
||||
/// to clients for the llamacpp backend. Returned verbatim from
|
||||
/// `LLAMA_SWAP_ALLOWED_MODELS`; no live call to llama-swap. Use
|
||||
/// `LLAMA_SWAP_URL` plus `LLAMA_SWAP_PRIMARY_MODEL` on the server side to
|
||||
/// pick the actual chat slot.
|
||||
#[get("/insights/llamacpp/models")]
|
||||
pub async fn get_llamacpp_models_handler(
|
||||
_claims: Claims,
|
||||
app_state: web::Data<crate::state::AppState>,
|
||||
) -> impl Responder {
|
||||
let configured = app_state.llamacpp.is_some();
|
||||
let default_model = app_state
|
||||
.llamacpp
|
||||
.as_ref()
|
||||
.map(|c| c.primary_model.clone());
|
||||
let response = LlamaCppModelsResponse {
|
||||
models: app_state.llamacpp_allowed_models.clone(),
|
||||
default_model,
|
||||
configured,
|
||||
};
|
||||
HttpResponse::Ok().json(response)
|
||||
}
|
||||
|
||||
/// POST /insights/rate - Rate an insight (thumbs up/down for training data)
|
||||
#[post("/insights/rate")]
|
||||
pub async fn rate_insight_handler(
|
||||
|
||||
Reference in New Issue
Block a user