ai: collapse llamacpp into LLM_BACKEND env switch
Reverts the per-request backend="llamacpp" value. Chat/vision/embedding backend is now a deploy-time decision (LLM_BACKEND=ollama|llamacpp), applied globally across chat, vision describe, and embeddings — so embedding vectors stay in one space across the index.
- Per-request backend whitelist is back to "local"|"hybrid". A request arriving with backend="llamacpp" is rejected.
- LLM_BACKEND=llamacpp swaps the entire local stack to llama-swap: chat hits the chat slot, describe hits the vision slot, embeddings hit the embed slot. Hybrid mode still routes chat to OpenRouter but uses LLM_BACKEND for the describe pass.
- Drops env vars HYBRID_VISION_BACKEND, LLAMA_SWAP_VISION_MODELS, EMBEDDING_BACKEND (the last never shipped). Drops the LlamaCppClient.vision_models allowlist — capability inference now reports has_vision only for the configured vision_model slot.
- Drops the /insights/llamacpp/models handler. /insights/models is the single endpoint; it returns Ollama servers under LLM_BACKEND=ollama and llama-swap slots (from LLAMA_SWAP_ALLOWED_MODELS) under LLM_BACKEND=llamacpp. Same envelope shape either way.
- New ai::embed_one helper routes embeddings through llama-swap when LLM_BACKEND=llamacpp (else Ollama). Wires it into the four insight_generator embedding sites.
- Cross-replay matrix simplifies to the pre-llamacpp shape (local↔local, hybrid↔hybrid, hybrid→local allowed; local→hybrid rejected).
This commit is contained in:
32
.env.example
32
.env.example
@@ -53,26 +53,30 @@ AGENTIC_CHAT_MAX_ITERATIONS=6
|
||||
# OPENROUTER_HTTP_REFERER=https://your-site.example
|
||||
# OPENROUTER_APP_TITLE=ImageApi
|
||||
|
||||
# ── AI Insights — local backend switch ──────────────────────────────────
|
||||
# Picks which local LLM stack the server uses for chat, vision describe,
|
||||
# and embeddings. `ollama` (default) uses the OLLAMA_* settings above;
|
||||
# `llamacpp` uses the LLAMA_SWAP_* settings below. The switch is global
|
||||
# and applies to both `backend=local` and `backend=hybrid` (hybrid keeps
|
||||
# chat on OpenRouter but still uses this stack for the describe pass).
|
||||
# Don't flip mid-deploy without re-embedding existing index rows —
|
||||
# mixed vector spaces break similarity search.
|
||||
# LLM_BACKEND=ollama
|
||||
|
||||
# ── AI Insights — llama.cpp / llama-swap (optional) ─────────────────────
|
||||
# Set LLAMA_SWAP_URL to enable the `llamacpp` chat_backend. Talks
|
||||
# OpenAI-compatible /v1 to a llama-swap proxy that fronts per-slot
|
||||
# llama-server instances (chat / vision / embed). Like hybrid, the
|
||||
# agentic loop describes images via the vision slot then inlines the
|
||||
# text into the chat slot — so the chat slot itself can be text-only.
|
||||
# Set LLAMA_SWAP_URL plus LLM_BACKEND=llamacpp to swap the local stack
|
||||
# off Ollama. Talks OpenAI-compatible /v1 to a llama-swap proxy fronting
|
||||
# per-slot llama-server instances (chat / vision / embed). The chat slot
|
||||
# is treated as text-only — images are pre-described via the vision slot
|
||||
# and inlined into the prompt.
|
||||
# LLAMA_SWAP_URL=http://localhost:9292/v1
|
||||
# LLAMA_SWAP_PRIMARY_MODEL=chat
|
||||
# LLAMA_SWAP_VISION_MODEL=vision
|
||||
# LLAMA_SWAP_EMBEDDING_MODEL=embed
|
||||
# Comma-separated allowlist of model ids the /v1/models endpoint should
|
||||
# advertise as vision-capable (llama-swap doesn't report modality).
|
||||
# LLAMA_SWAP_VISION_MODELS=vision
|
||||
# Comma-separated allowlist surfaced by /insights/llamacpp/models.
|
||||
# Comma-separated allowlist surfaced by /insights/models when
|
||||
# LLM_BACKEND=llamacpp.
|
||||
# LLAMA_SWAP_ALLOWED_MODELS=chat,vision,embed
|
||||
# LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=120
|
||||
# Routes hybrid mode's vision-describe pass through llama-swap's vision
|
||||
# slot instead of Ollama (chat still goes to OpenRouter). Values:
|
||||
# `ollama` (default) | `llamacpp`.
|
||||
# HYBRID_VISION_BACKEND=ollama
|
||||
# LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180
|
||||
|
||||
# ── AI Insights — sibling services (optional) ───────────────────────────
|
||||
# Apollo (places, face inference, CLIP encoders). Single-Apollo deploys
|
||||
|
||||
96
CLAUDE.md
96
CLAUDE.md
@@ -473,9 +473,8 @@ GET /memories?path=...&recursive=true
|
||||
POST /insights/generate (non-agentic single-shot)
|
||||
POST /insights/generate/agentic (tool-calling loop; body: { file_path, backend?, model?, ... })
|
||||
GET /insights?path=...&library=...
|
||||
GET /insights/models (local Ollama models + capabilities)
|
||||
GET /insights/models (local-backend models + capabilities; Ollama OR llama-swap based on LLM_BACKEND)
|
||||
GET /insights/openrouter/models (curated OpenRouter allowlist)
|
||||
GET /insights/llamacpp/models (curated llama-swap slot allowlist)
|
||||
POST /insights/rate (thumbs up/down for training data)
|
||||
|
||||
// Insight Chat Continuation
|
||||
@@ -632,22 +631,27 @@ OPENROUTER_EMBEDDING_MODEL=openai/text-embedding-3-small # Optional, embeddings
|
||||
OPENROUTER_HTTP_REFERER=https://your-site.example # Optional attribution header
|
||||
OPENROUTER_APP_TITLE=ImageApi # Optional attribution header
|
||||
|
||||
# llama.cpp / llama-swap (Llamacpp Backend) - sibling to Ollama; OpenAI-compatible
|
||||
# Local LLM backend switch. `ollama` (default) keeps the OLLAMA_* settings
|
||||
# above; `llamacpp` swaps the entire local stack (chat + vision describe +
|
||||
# embeddings) over to llama-swap. The switch is global and applies to
|
||||
# `backend=local` requests and to `backend=hybrid`'s describe pass (hybrid
|
||||
# chat still goes to OpenRouter). Don't flip mid-deploy without
|
||||
# re-embedding — mixed vector spaces break similarity search.
|
||||
LLM_BACKEND=ollama
|
||||
|
||||
# llama.cpp / llama-swap (used when LLM_BACKEND=llamacpp). OpenAI-compatible
|
||||
# proxy hosting one or more llama-server processes (chat / vision / embed slots).
|
||||
LLAMA_SWAP_URL=http://localhost:9292/v1 # Required to enable llamacpp backend
|
||||
LLAMA_SWAP_URL=http://localhost:9292/v1 # Required when LLM_BACKEND=llamacpp
|
||||
LLAMA_SWAP_PRIMARY_MODEL=chat # Chat slot id (matches config.yaml)
|
||||
LLAMA_SWAP_VISION_MODEL=vision # Vision slot id; describe_image routes here
|
||||
LLAMA_SWAP_EMBEDDING_MODEL=embed # Embedding slot id (when local embeddings via llamacpp)
|
||||
LLAMA_SWAP_VISION_MODELS=qwen-vl,llava # Comma-separated slot ids known to have vision.
|
||||
# Drives `has_vision` in /insights/llamacpp/models.
|
||||
# `LLAMA_SWAP_VISION_MODEL` is auto-included.
|
||||
LLAMA_SWAP_ALLOWED_MODELS=chat,coder # Curated allowlist exposed to clients via
|
||||
# GET /insights/llamacpp/models. Empty = no picker.
|
||||
LLAMA_SWAP_VISION_MODEL=vision # Vision slot id; describe_image routes here.
|
||||
# The only slot reported as has_vision=true in
|
||||
# /insights/models — chat slots are treated as
|
||||
# text-only (images pre-described and inlined).
|
||||
LLAMA_SWAP_EMBEDDING_MODEL=embed # Embedding slot id
|
||||
LLAMA_SWAP_ALLOWED_MODELS=chat,coder # Curated allowlist surfaced by GET /insights/models
|
||||
# when LLM_BACKEND=llamacpp. Empty = picker shows
|
||||
# only the configured primary model.
|
||||
LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS=180 # Per-request timeout; bump for slow CPU offload
|
||||
HYBRID_VISION_BACKEND=llamacpp # Optional override for hybrid mode's describe_image:
|
||||
# `ollama` (default) or `llamacpp`. When `llamacpp`,
|
||||
# hybrid still routes chat to OpenRouter but uses
|
||||
# llama-swap's vision slot to describe images.
|
||||
|
||||
# Insight Chat Continuation
|
||||
AGENTIC_CHAT_MAX_ITERATIONS=6 # Cap on tool-calling iterations per chat turn (default 6)
|
||||
@@ -668,13 +672,36 @@ The `OllamaClient` provides methods to query available models:
|
||||
|
||||
This allows runtime verification of model availability before generating insights.
|
||||
|
||||
**Local backend switch (`LLM_BACKEND`):**
|
||||
|
||||
One env var decides which "local" stack the server runs against — `ollama`
|
||||
(default) or `llamacpp`. It's global on purpose: chat, vision describe, and
|
||||
embeddings all route through the same backend, so the embedding-vector
|
||||
column in SQLite stays in one vector space. Don't flip mid-deploy without
|
||||
re-embedding the affected rows — similarity search will collapse.
|
||||
|
||||
- `LLM_BACKEND=ollama`: chat and embeddings use Ollama; vision describe
|
||||
uses Ollama's multimodal model.
|
||||
- `LLM_BACKEND=llamacpp`: chat hits llama-swap's `chat` slot (which is
|
||||
treated as text-only — images are pre-described via the `vision` slot
|
||||
and inlined), embeddings hit the `embed` slot, vision describe hits the
|
||||
`vision` slot. Requires `LLAMA_SWAP_URL`.
|
||||
|
||||
The per-request `backend=hybrid` override is orthogonal: it always sends
|
||||
chat to OpenRouter, but the describe pass still routes through whichever
|
||||
`LLM_BACKEND` is configured.
|
||||
|
||||
`GET /insights/models` returns the local-backend models with capabilities
|
||||
in the same envelope shape regardless of `LLM_BACKEND`: Ollama servers
|
||||
when `ollama`, llama-swap slots (from `LLAMA_SWAP_ALLOWED_MODELS`) when
|
||||
`llamacpp`. There is no separate `/insights/llamacpp/models` route — the picker reads a single
|
||||
endpoint.
|
||||
|
||||
**Hybrid Backend (OpenRouter):**
|
||||
- Per-request opt-in via `backend=hybrid` on `POST /insights/generate/agentic`.
|
||||
- Vision describe happens before the agentic loop; the description is inlined
|
||||
into the chat prompt and the agentic loop runs on OpenRouter. By default
|
||||
vision uses local Ollama, but `HYBRID_VISION_BACKEND=llamacpp` flips it to
|
||||
llama-swap's vision slot (useful when you want chat on a frontier model and
|
||||
vision on a local-but-not-Ollama path).
|
||||
into the chat prompt and the agentic loop runs on OpenRouter. Vision
|
||||
routes through whichever `LLM_BACKEND` is configured.
|
||||
- `request.model` (if provided) overrides `OPENROUTER_DEFAULT_MODEL` for that
|
||||
call. The mobile picker reads from `OPENROUTER_ALLOWED_MODELS`.
|
||||
- No live capability precheck — the operator-curated allowlist is trusted.
|
||||
@@ -682,29 +709,14 @@ This allows runtime verification of model availability before generating insight
|
||||
- `GET /insights/openrouter/models` returns `{ models, default_model, configured }`
|
||||
for client picker UIs.
|
||||
|
||||
**Llamacpp Backend (llama-swap):**
|
||||
- Per-request opt-in via `backend=llamacpp` on `POST /insights/generate/agentic`.
|
||||
- Sibling to Ollama: a local OpenAI-compatible proxy (mostlygeek/llama-swap)
|
||||
fronting one or more `llama-server` processes. The chat slot is text-only
|
||||
by default; vision and embeddings have their own slots (`LLAMA_SWAP_VISION_MODEL`,
|
||||
`LLAMA_SWAP_EMBEDDING_MODEL`) that llama-swap routes to by model id. The
|
||||
bundled `docker-compose.yml` + `llama-swap/config.yaml` in the opencode root
|
||||
is the reference deploy.
|
||||
- Operates in the same describe-then-inline shape as hybrid: the chat model
|
||||
never sees raw images. Vision describe routes through llama-swap's vision
|
||||
slot (`describe_image` on `LlamaCppClient`).
|
||||
- `request.model` (if provided) overrides `LLAMA_SWAP_PRIMARY_MODEL` for that
|
||||
call (must match a slot id in llama-swap's `config.yaml`). The mobile picker
|
||||
reads from `LLAMA_SWAP_ALLOWED_MODELS`.
|
||||
- No live capability precheck — slot ids are trusted. Tool calling is assumed
|
||||
for every slot (llama-swap entries typically launch with `--jinja`).
|
||||
- `GET /insights/llamacpp/models` returns `{ models, default_model, configured }`.
|
||||
- Cross-replay matrix (chat continuation): `local ↔ llamacpp` allowed (the
|
||||
LlamaCppClient passes images through to the chat slot — you're responsible
|
||||
for a vision-capable slot if the stored transcript carries images);
|
||||
`hybrid ↔ llamacpp` allowed (both transcripts are text-only); `local →
|
||||
hybrid` and `llamacpp → hybrid` rejected (mid-conversation description
|
||||
source change isn't supported).
|
||||
**Cross-replay matrix (chat continuation):**
|
||||
- `local → local` allowed (whether served by Ollama or llama-swap; that's
|
||||
a deploy-time decision, not a request-time one).
|
||||
- `hybrid → hybrid` allowed.
|
||||
- `hybrid → local` allowed (the inlined description replays as text).
|
||||
- `local → hybrid` rejected — the stored transcript has raw images in the
|
||||
first user message and OpenRouter providers don't accept that shape
|
||||
consistently. Regenerate the insight in hybrid mode instead.
|
||||
|
||||
**Insight Chat Continuation:**
|
||||
|
||||
|
||||
@@ -470,7 +470,16 @@ pub async fn generate_agentic_insight_handler(
|
||||
}
|
||||
}
|
||||
|
||||
/// GET /insights/models - List available models from both servers with capabilities
|
||||
/// GET /insights/models - Local-backend models with capabilities. Returns
|
||||
/// Ollama servers when `LLM_BACKEND=ollama` (default), or llama-swap slots
|
||||
/// when `LLM_BACKEND=llamacpp`. Same envelope shape either way so the
|
||||
/// client picker doesn't have to branch on backend kind.
|
||||
///
|
||||
/// For llama-swap: `models` comes verbatim from `LLAMA_SWAP_ALLOWED_MODELS`
|
||||
/// (no live `/v1/models` probe), `has_vision` is true only for the
|
||||
/// configured `LLAMA_SWAP_VISION_MODEL` slot id, and `has_tool_calling` is
|
||||
/// reported as true for every slot (llama-server is launched with `--jinja`
|
||||
/// by convention — a misconfigured slot surfaces as a chat-call error).
|
||||
#[get("/insights/models")]
|
||||
pub async fn get_available_models_handler(
|
||||
_claims: Claims,
|
||||
@@ -478,6 +487,29 @@ pub async fn get_available_models_handler(
|
||||
) -> impl Responder {
|
||||
log::debug!("Fetching available models with capabilities");
|
||||
|
||||
if crate::ai::local_backend_is_llamacpp()
|
||||
&& let Some(lc) = app_state.llamacpp.as_ref()
|
||||
{
|
||||
let models: Vec<ModelCapabilities> = app_state
|
||||
.llamacpp_allowed_models
|
||||
.iter()
|
||||
.map(|name| ModelCapabilities {
|
||||
name: name.clone(),
|
||||
has_vision: name == &lc.vision_model,
|
||||
has_tool_calling: true,
|
||||
})
|
||||
.collect();
|
||||
let primary = ServerModels {
|
||||
url: lc.base_url.clone(),
|
||||
models,
|
||||
default_model: lc.primary_model.clone(),
|
||||
};
|
||||
return HttpResponse::Ok().json(AvailableModelsResponse {
|
||||
primary,
|
||||
fallback: None,
|
||||
});
|
||||
}
|
||||
|
||||
let ollama_client = &app_state.ollama;
|
||||
|
||||
// Fetch models with capabilities from primary server
|
||||
@@ -549,36 +581,6 @@ pub async fn get_openrouter_models_handler(
|
||||
HttpResponse::Ok().json(response)
|
||||
}
|
||||
|
||||
#[derive(serde::Serialize)]
|
||||
pub struct LlamaCppModelsResponse {
|
||||
pub models: Vec<String>,
|
||||
pub default_model: Option<String>,
|
||||
pub configured: bool,
|
||||
}
|
||||
|
||||
/// GET /insights/llamacpp/models - Curated llama-swap model ids exposed
|
||||
/// to clients for the llamacpp backend. Returned verbatim from
|
||||
/// `LLAMA_SWAP_ALLOWED_MODELS`; no live call to llama-swap. Use
|
||||
/// `LLAMA_SWAP_URL` plus `LLAMA_SWAP_PRIMARY_MODEL` on the server side to
|
||||
/// pick the actual chat slot.
|
||||
#[get("/insights/llamacpp/models")]
|
||||
pub async fn get_llamacpp_models_handler(
|
||||
_claims: Claims,
|
||||
app_state: web::Data<crate::state::AppState>,
|
||||
) -> impl Responder {
|
||||
let configured = app_state.llamacpp.is_some();
|
||||
let default_model = app_state
|
||||
.llamacpp
|
||||
.as_ref()
|
||||
.map(|c| c.primary_model.clone());
|
||||
let response = LlamaCppModelsResponse {
|
||||
models: app_state.llamacpp_allowed_models.clone(),
|
||||
default_model,
|
||||
configured,
|
||||
};
|
||||
HttpResponse::Ok().json(response)
|
||||
}
|
||||
|
||||
/// POST /insights/rate - Rate an insight (thumbs up/down for training data)
|
||||
#[post("/insights/rate")]
|
||||
pub async fn rate_insight_handler(
|
||||
|
||||
@@ -309,14 +309,15 @@ impl InsightChatService {
|
||||
.unwrap_or_else(|| stored_backend.clone());
|
||||
validate_cross_replay(&stored_backend, &effective_backend)?;
|
||||
let is_hybrid = effective_backend == "hybrid";
|
||||
let is_llamacpp = effective_backend == "llamacpp";
|
||||
let describes_then_inlines = is_hybrid || is_llamacpp;
|
||||
let local_via_llamacpp =
|
||||
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
|
||||
let describes_then_inlines = is_hybrid || local_via_llamacpp;
|
||||
span.set_attribute(KeyValue::new("backend", effective_backend.clone()));
|
||||
|
||||
// 4. Build the chat backend client. Ollama in local mode, a freshly
|
||||
// cloned OpenRouter client in hybrid mode, a freshly cloned
|
||||
// LlamaCppClient in llamacpp mode (clone so per-request
|
||||
// sampling/model overrides don't leak into shared state).
|
||||
// 4. Build the chat backend client. Hybrid → OpenRouter; local with
|
||||
// `LLM_BACKEND=llamacpp` → llama-swap; otherwise Ollama. Clones
|
||||
// so per-request sampling/model overrides don't leak into shared
|
||||
// state.
|
||||
let max_iterations = req
|
||||
.max_iterations
|
||||
.unwrap_or(DEFAULT_MAX_ITERATIONS)
|
||||
@@ -353,9 +354,9 @@ impl InsightChatService {
|
||||
c.set_num_ctx(Some(ctx));
|
||||
}
|
||||
openrouter_client = Some(c);
|
||||
} else if is_llamacpp {
|
||||
} else if local_via_llamacpp {
|
||||
let arc = self.llamacpp.as_ref().ok_or_else(|| {
|
||||
anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
|
||||
anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured")
|
||||
})?;
|
||||
let mut c: LlamaCppClient = (**arc).clone();
|
||||
if let Some(ref m) = custom_model {
|
||||
@@ -373,8 +374,8 @@ impl InsightChatService {
|
||||
}
|
||||
llamacpp_client = Some(c);
|
||||
} else {
|
||||
// Local-mode model swap. Build a new client when the chat model
|
||||
// differs from the configured one (mirrors the agentic pattern).
|
||||
// Pure local (Ollama): model swap. Build a new client when the
|
||||
// chat model differs from the configured one.
|
||||
if let Some(ref m) = custom_model
|
||||
&& m != &self.ollama.primary_model
|
||||
{
|
||||
@@ -820,8 +821,9 @@ impl InsightChatService {
|
||||
.unwrap_or_else(|| stored_backend.clone());
|
||||
validate_cross_replay(&stored_backend, &effective_backend)?;
|
||||
let is_hybrid = effective_backend == "hybrid";
|
||||
let is_llamacpp = effective_backend == "llamacpp";
|
||||
let describes_then_inlines = is_hybrid || is_llamacpp;
|
||||
let local_via_llamacpp =
|
||||
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
|
||||
let describes_then_inlines = is_hybrid || local_via_llamacpp;
|
||||
|
||||
let max_iterations = req
|
||||
.max_iterations
|
||||
@@ -841,9 +843,9 @@ impl InsightChatService {
|
||||
let model_used = chat_backend.primary_model().to_string();
|
||||
|
||||
// Tool set — local mode + first user turn carries an image →
|
||||
// offer describe_photo. Describe-then-inline modes (hybrid /
|
||||
// llamacpp): visual description was inlined when the insight was
|
||||
// bootstrapped, no describe tool needed.
|
||||
// offer describe_photo. Describe-then-inline modes (hybrid OR
|
||||
// local_via_llamacpp): visual description was inlined when the
|
||||
// insight was bootstrapped, no describe tool needed.
|
||||
let local_first_user_has_image = messages
|
||||
.iter()
|
||||
.find(|m| m.role == "user")
|
||||
@@ -987,8 +989,9 @@ impl InsightChatService {
|
||||
.unwrap_or_else(|| "default".to_string());
|
||||
let effective_backend = resolve_bootstrap_backend(req.backend.as_deref())?;
|
||||
let is_hybrid = effective_backend == "hybrid";
|
||||
let is_llamacpp = effective_backend == "llamacpp";
|
||||
let describes_then_inlines = is_hybrid || is_llamacpp;
|
||||
let local_via_llamacpp =
|
||||
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
|
||||
let describes_then_inlines = is_hybrid || local_via_llamacpp;
|
||||
|
||||
let max_iterations = req
|
||||
.max_iterations
|
||||
@@ -1020,35 +1023,19 @@ impl InsightChatService {
|
||||
_ => None,
|
||||
});
|
||||
|
||||
// Describe-then-inline backends (hybrid, llamacpp): pre-describe the
|
||||
// image so a text-only chat model gets the visual description inline.
|
||||
// Vision source: llamacpp's vision slot in llamacpp mode; in hybrid
|
||||
// mode Ollama by default, llamacpp via `HYBRID_VISION_BACKEND=llamacpp`.
|
||||
// Describe-then-inline (hybrid OR local_via_llamacpp): pre-describe
|
||||
// the image so a text-only chat model gets the visual description
|
||||
// inline. Vision source follows `LLM_BACKEND`: llama-swap when
|
||||
// `local_via_llamacpp`, else Ollama.
|
||||
let visual_block = if describes_then_inlines {
|
||||
match image_base64.as_deref() {
|
||||
Some(b64) => {
|
||||
let use_llamacpp_vision = if is_llamacpp {
|
||||
true
|
||||
} else {
|
||||
matches!(
|
||||
std::env::var("HYBRID_VISION_BACKEND")
|
||||
.ok()
|
||||
.as_deref()
|
||||
.map(|s| s.trim().to_lowercase())
|
||||
.as_deref(),
|
||||
Some("llamacpp")
|
||||
)
|
||||
};
|
||||
let described = if use_llamacpp_vision {
|
||||
match self.llamacpp.as_ref() {
|
||||
Some(c) => c.describe_image(b64).await,
|
||||
None => {
|
||||
log::warn!(
|
||||
"bootstrap: requested llamacpp vision but LLAMA_SWAP_URL unset; falling back to Ollama"
|
||||
);
|
||||
self.ollama.describe_image(b64).await
|
||||
}
|
||||
}
|
||||
let described = if local_via_llamacpp {
|
||||
self.llamacpp
|
||||
.as_ref()
|
||||
.expect("local_via_llamacpp guarantees Some")
|
||||
.describe_image(b64)
|
||||
.await
|
||||
} else {
|
||||
self.ollama.describe_image(b64).await
|
||||
};
|
||||
@@ -1175,8 +1162,11 @@ impl InsightChatService {
|
||||
/// (boxed because each backend has a different concrete type) and the
|
||||
/// Ollama client used for describe-image / local tool calls.
|
||||
///
|
||||
/// `effective_backend` must be one of `"local"`, `"hybrid"`, `"llamacpp"`
|
||||
/// (validated upstream).
|
||||
/// `effective_backend` must be one of `"local"` or `"hybrid"` (validated
|
||||
/// upstream). Hybrid → OpenRouter; local with `LLM_BACKEND=llamacpp` →
|
||||
/// llama-swap; pure local → Ollama. Returns the dispatched chat client
|
||||
/// plus the (possibly per-request) Ollama client that the caller uses
|
||||
/// for non-chat helpers (image describe in non-llamacpp mode, tool ops).
|
||||
fn build_chat_clients(
|
||||
&self,
|
||||
effective_backend: &str,
|
||||
@@ -1206,10 +1196,10 @@ impl InsightChatService {
|
||||
return Ok((Box::new(c), ollama_client));
|
||||
}
|
||||
|
||||
if effective_backend == "llamacpp" {
|
||||
let arc = self.llamacpp.as_ref().ok_or_else(|| {
|
||||
anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
|
||||
})?;
|
||||
// Local mode — env switch decides between Ollama and llama-swap.
|
||||
if crate::ai::local_backend_is_llamacpp()
|
||||
&& let Some(arc) = self.llamacpp.as_ref()
|
||||
{
|
||||
let mut c: LlamaCppClient = (**arc).clone();
|
||||
if let Some(m) = custom_model {
|
||||
c.primary_model = m.to_string();
|
||||
@@ -1525,41 +1515,26 @@ fn resolve_date_taken_for_context(
|
||||
|
||||
/// Validate a stored→effective backend transition for a chat continuation.
|
||||
/// Continuation runs against a transcript that was generated with a specific
|
||||
/// backend; some transitions break the conversation shape:
|
||||
/// backend; the only blocked transition is `local → hybrid`, because the
|
||||
/// stored transcript has images embedded in the first user message and the
|
||||
/// hybrid path (OpenRouter chat with describe-then-inline) can't replay
|
||||
/// raw image bytes through OpenRouter consistently across providers.
|
||||
/// `hybrid → local` is allowed (the inlined description replays verbatim
|
||||
/// as text).
|
||||
///
|
||||
/// - `local → hybrid` — the stored transcript has images embedded in the
|
||||
/// first user message; the openrouter chat client surfaces them through
|
||||
/// the wire, but vision-only models routed via the hybrid path may not
|
||||
/// accept that shape consistently across providers. Reject to keep the
|
||||
/// `regenerate-in-hybrid-mode` workflow as the supported answer.
|
||||
/// - `llamacpp → hybrid` — the stored transcript already has an inlined
|
||||
/// visual description produced by llama-swap's vision slot. Switching
|
||||
/// to hybrid mid-conversation would mix description sources across
|
||||
/// subsequent turns (any new image in the chat continuation would be
|
||||
/// described by ollama-vision while the original was described by
|
||||
/// llama-vision). Reject for consistency.
|
||||
///
|
||||
/// All other transitions are allowed. `local ↔ llamacpp` works because
|
||||
/// LlamaCppClient passes image content-parts through to the chat slot —
|
||||
/// the user is responsible for picking a vision-capable chat model in
|
||||
/// that case. `hybrid ↔ llamacpp` works because both transcripts are
|
||||
/// text-only (visual description inlined at bootstrap).
|
||||
/// Whether "local" routes through Ollama or llama-swap is decided at
|
||||
/// startup by `LLM_BACKEND`; both share the same transcript shape from
|
||||
/// the chat-replay perspective.
|
||||
fn validate_cross_replay(stored: &str, effective: &str) -> Result<()> {
|
||||
if !matches!(effective, "local" | "hybrid" | "llamacpp") {
|
||||
if !matches!(effective, "local" | "hybrid") {
|
||||
bail!(
|
||||
"unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
|
||||
"unknown backend '{}'; expected 'local' or 'hybrid'",
|
||||
effective
|
||||
);
|
||||
}
|
||||
if stored == "local" && effective == "hybrid" {
|
||||
bail!(
|
||||
"switching from local to hybrid mid-chat isn't supported yet; \
|
||||
regenerate the insight in hybrid mode if you want OpenRouter chat"
|
||||
);
|
||||
}
|
||||
if stored == "llamacpp" && effective == "hybrid" {
|
||||
bail!(
|
||||
"switching from llamacpp to hybrid mid-chat isn't supported yet; \
|
||||
"switching from local to hybrid mid-chat isn't supported; \
|
||||
regenerate the insight in hybrid mode if you want OpenRouter chat"
|
||||
);
|
||||
}
|
||||
@@ -1576,9 +1551,9 @@ fn resolve_bootstrap_backend(supplied: Option<&str>) -> Result<String> {
|
||||
.map(|s| s.trim().to_lowercase())
|
||||
.filter(|s| !s.is_empty())
|
||||
.unwrap_or_else(|| "local".to_string());
|
||||
if !matches!(lower.as_str(), "local" | "hybrid" | "llamacpp") {
|
||||
if !matches!(lower.as_str(), "local" | "hybrid") {
|
||||
bail!(
|
||||
"unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
|
||||
"unknown backend '{}'; expected 'local' or 'hybrid'",
|
||||
lower
|
||||
);
|
||||
}
|
||||
@@ -2184,10 +2159,6 @@ mod tests {
|
||||
fn bootstrap_backend_accepts_local_and_hybrid_case_insensitively() {
|
||||
assert_eq!(resolve_bootstrap_backend(Some("LOCAL")).unwrap(), "local");
|
||||
assert_eq!(resolve_bootstrap_backend(Some("Hybrid")).unwrap(), "hybrid");
|
||||
assert_eq!(
|
||||
resolve_bootstrap_backend(Some("Llamacpp")).unwrap(),
|
||||
"llamacpp"
|
||||
);
|
||||
assert_eq!(
|
||||
resolve_bootstrap_backend(Some(" local ")).unwrap(),
|
||||
"local"
|
||||
@@ -2196,10 +2167,13 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn bootstrap_backend_rejects_unknown_label() {
|
||||
let err = resolve_bootstrap_backend(Some("openrouter")).unwrap_err();
|
||||
let msg = format!("{}", err);
|
||||
assert!(msg.contains("unknown backend"));
|
||||
assert!(msg.contains("openrouter"));
|
||||
// `llamacpp` is no longer a per-request backend value — it's chosen
|
||||
// at deploy time via `LLM_BACKEND`.
|
||||
for label in &["openrouter", "llamacpp", "ollama"] {
|
||||
let err = resolve_bootstrap_backend(Some(label)).unwrap_err();
|
||||
let msg = format!("{}", err);
|
||||
assert!(msg.contains("unknown backend"), "label={}", label);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -2209,29 +2183,20 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cross_replay_rejects_llamacpp_to_hybrid() {
|
||||
let err = validate_cross_replay("llamacpp", "hybrid").unwrap_err();
|
||||
assert!(format!("{}", err).contains("llamacpp to hybrid"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cross_replay_allows_local_llamacpp_and_hybrid_llamacpp_transitions() {
|
||||
// Local ↔ llamacpp: user is responsible for picking a vision-capable
|
||||
// chat slot when the transcript has images.
|
||||
assert!(validate_cross_replay("local", "llamacpp").is_ok());
|
||||
assert!(validate_cross_replay("llamacpp", "local").is_ok());
|
||||
// Hybrid ↔ llamacpp: both transcripts are text-only.
|
||||
assert!(validate_cross_replay("hybrid", "llamacpp").is_ok());
|
||||
// Same-backend replays are always fine.
|
||||
fn cross_replay_allows_supported_transitions() {
|
||||
assert!(validate_cross_replay("local", "local").is_ok());
|
||||
assert!(validate_cross_replay("hybrid", "hybrid").is_ok());
|
||||
assert!(validate_cross_replay("llamacpp", "llamacpp").is_ok());
|
||||
// Hybrid → local replays the inlined description as plain text.
|
||||
assert!(validate_cross_replay("hybrid", "local").is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cross_replay_rejects_unknown_effective() {
|
||||
let err = validate_cross_replay("local", "openrouter").unwrap_err();
|
||||
assert!(format!("{}", err).contains("unknown backend"));
|
||||
// Both "openrouter" and the former "llamacpp" value are unknown now.
|
||||
for label in &["openrouter", "llamacpp"] {
|
||||
let err = validate_cross_replay("local", label).unwrap_err();
|
||||
assert!(format!("{}", err).contains("unknown backend"), "label={}", label);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
@@ -471,8 +471,11 @@ impl InsightGenerator {
|
||||
log::info!("RAG QUERY: {}", query);
|
||||
log::info!("========================================");
|
||||
|
||||
// Generate embedding for the query
|
||||
let query_embedding = self.ollama.generate_embedding(&query).await?;
|
||||
// Generate embedding for the query via the configured local backend
|
||||
// (`LLM_BACKEND` switch). Must match the backend that populated the
|
||||
// daily-summary embeddings; otherwise the vectors live in different spaces and similarity scores are meaningless.
|
||||
let query_embedding =
|
||||
crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &query).await?;
|
||||
|
||||
// Search for similar daily summaries with time-based weighting
|
||||
// This prioritizes summaries temporally close to the query date
|
||||
@@ -563,7 +566,7 @@ impl InsightGenerator {
|
||||
let calendar_cx = parent_cx.with_span(span);
|
||||
|
||||
let query_embedding = if let Some(loc) = location {
|
||||
match self.ollama.generate_embedding(loc).await {
|
||||
match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), loc).await {
|
||||
Ok(emb) => Some(emb),
|
||||
Err(e) => {
|
||||
log::warn!("Failed to generate embedding for location '{}': {}", loc, e);
|
||||
@@ -734,16 +737,17 @@ impl InsightGenerator {
|
||||
)
|
||||
};
|
||||
|
||||
let query_embedding = match self.ollama.generate_embedding(&query_text).await {
|
||||
Ok(emb) => emb,
|
||||
Err(e) => {
|
||||
log::warn!("Failed to generate search embedding: {}", e);
|
||||
search_cx.span().set_status(Status::Error {
|
||||
description: e.to_string().into(),
|
||||
});
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
let query_embedding =
|
||||
match crate::ai::embed_one(&self.ollama, self.llamacpp.as_deref(), &query_text).await {
|
||||
Ok(emb) => emb,
|
||||
Err(e) => {
|
||||
log::warn!("Failed to generate search embedding: {}", e);
|
||||
search_cx.span().set_status(Status::Error {
|
||||
description: e.to_string().into(),
|
||||
});
|
||||
return Ok(None);
|
||||
}
|
||||
};
|
||||
|
||||
let searches = {
|
||||
let mut dao = self
|
||||
@@ -2608,11 +2612,13 @@ Return ONLY the summary, nothing else."#,
|
||||
}
|
||||
}
|
||||
|
||||
/// Tool: store_entity — upsert an entity into the knowledge memory
|
||||
/// Tool: store_entity — upsert an entity into the knowledge memory.
|
||||
/// Embeddings go through the configured local backend (`LLM_BACKEND`),
|
||||
/// independent of the per-request chat backend in the caller.
|
||||
async fn tool_store_entity(
|
||||
&self,
|
||||
args: &serde_json::Value,
|
||||
ollama: &OllamaClient,
|
||||
_ollama: &OllamaClient,
|
||||
cx: &opentelemetry::Context,
|
||||
) -> String {
|
||||
use crate::database::models::InsertEntity;
|
||||
@@ -2672,9 +2678,16 @@ Return ONLY the summary, nothing else."#,
|
||||
.collect()
|
||||
};
|
||||
|
||||
// Generate embedding for name + description (best-effort)
|
||||
// Generate embedding for name + description (best-effort) via the
|
||||
// configured local backend.
|
||||
let embed_text = format!("{} {}", name, description);
|
||||
let embedding: Option<Vec<u8>> = match ollama.generate_embedding(&embed_text).await {
|
||||
let embedding: Option<Vec<u8>> = match crate::ai::embed_one(
|
||||
&self.ollama,
|
||||
self.llamacpp.as_deref(),
|
||||
&embed_text,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(vec) => {
|
||||
let bytes: Vec<u8> = vec.iter().flat_map(|f| f.to_le_bytes()).collect();
|
||||
Some(bytes)
|
||||
@@ -3580,20 +3593,24 @@ Return ONLY the summary, nothing else."#,
|
||||
.map(|s| s.trim().to_lowercase())
|
||||
.filter(|s| !s.is_empty())
|
||||
.unwrap_or_else(|| "local".to_string());
|
||||
if !matches!(backend_label.as_str(), "local" | "hybrid" | "llamacpp") {
|
||||
if !matches!(backend_label.as_str(), "local" | "hybrid") {
|
||||
return Err(anyhow::anyhow!(
|
||||
"unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
|
||||
"unknown backend '{}'; expected 'local' or 'hybrid'",
|
||||
backend_label
|
||||
));
|
||||
}
|
||||
span.set_attribute(KeyValue::new("backend", backend_label.clone()));
|
||||
let is_hybrid = backend_label == "hybrid";
|
||||
let is_llamacpp = backend_label == "llamacpp";
|
||||
// In hybrid + llamacpp modes the chat model never sees the image
|
||||
// directly; we describe-then-inline locally before the agentic loop
|
||||
// starts. Tracked as a single flag so vision/tool-gate logic doesn't
|
||||
// have to branch twice.
|
||||
let describes_then_inlines = is_hybrid || is_llamacpp;
|
||||
// `LLM_BACKEND=llamacpp` swaps Ollama out for llama-swap as the
|
||||
// "local" stack — chat + vision describe + embeddings all route
|
||||
// through llama-swap. In hybrid mode this still applies to vision
|
||||
// describe (chat continues to go to OpenRouter). The chat slot is
|
||||
// text-only in either case, so we describe-then-inline.
|
||||
let local_via_llamacpp =
|
||||
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
|
||||
// Describe-then-inline: hybrid (chat is OpenRouter, text-only) or
|
||||
// any path where chat goes through llama-swap (chat slot is text-only).
|
||||
let describes_then_inlines = is_hybrid || local_via_llamacpp;
|
||||
|
||||
// 1b. Always build an Ollama client. In local mode it owns the chat
|
||||
// loop; in hybrid/llamacpp mode it still handles tool-local calls
|
||||
@@ -3688,13 +3705,14 @@ Return ONLY the summary, nothing else."#,
|
||||
None
|
||||
};
|
||||
|
||||
// 1d. In llamacpp mode, clone the configured LlamaCpp client and
|
||||
// apply per-request overrides. Same shape as the openrouter
|
||||
// branch above; describe_image will route through the vision
|
||||
// slot configured on the client.
|
||||
let llamacpp_client: Option<LlamaCppClient> = if is_llamacpp {
|
||||
// 1d. When `LLM_BACKEND=llamacpp` and we're in local mode (not
|
||||
// hybrid — hybrid keeps chat on OpenRouter), clone the llamacpp
|
||||
// client and apply per-request overrides. Same shape as the
|
||||
// openrouter branch above; describe_image will route through
|
||||
// the vision slot configured on the client.
|
||||
let llamacpp_client: Option<LlamaCppClient> = if local_via_llamacpp && !is_hybrid {
|
||||
let arc = self.llamacpp.as_ref().ok_or_else(|| {
|
||||
anyhow::anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
|
||||
anyhow::anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured")
|
||||
})?;
|
||||
let mut c: LlamaCppClient = (**arc).clone();
|
||||
if let Some(ref m) = custom_model {
|
||||
@@ -3917,38 +3935,19 @@ Return ONLY the summary, nothing else."#,
|
||||
None
|
||||
};
|
||||
|
||||
// describe-then-inline path. In hybrid mode the vision backend
|
||||
// defaults to Ollama but can be flipped to llamacpp via
|
||||
// `HYBRID_VISION_BACKEND=llamacpp` (so chat goes to OpenRouter while
|
||||
// vision/audio routes through llama-swap). In llamacpp mode we always
|
||||
// use the llamacpp client's configured vision slot.
|
||||
// describe-then-inline path. Vision describe routes through whichever
|
||||
// `LLM_BACKEND` is configured — llama-swap when `local_via_llamacpp`
|
||||
// is set (even in hybrid mode, since chat is OpenRouter but vision
|
||||
// stays on the local stack), otherwise Ollama.
|
||||
let inlined_visual_description: Option<String> = if describes_then_inlines {
|
||||
match image_base64.as_deref() {
|
||||
Some(b64) => {
|
||||
let use_llamacpp_vision = if is_llamacpp {
|
||||
true
|
||||
} else {
|
||||
// is_hybrid branch — consult env switch
|
||||
matches!(
|
||||
std::env::var("HYBRID_VISION_BACKEND")
|
||||
.ok()
|
||||
.as_deref()
|
||||
.map(|s| s.trim().to_lowercase())
|
||||
.as_deref(),
|
||||
Some("llamacpp")
|
||||
)
|
||||
};
|
||||
|
||||
let described = if use_llamacpp_vision {
|
||||
match self.llamacpp.as_ref() {
|
||||
Some(c) => c.describe_image(b64).await,
|
||||
None => {
|
||||
log::warn!(
|
||||
"describe-then-inline: requested llamacpp vision but LLAMA_SWAP_URL is unset, falling back to Ollama"
|
||||
);
|
||||
self.ollama.describe_image(b64).await
|
||||
}
|
||||
}
|
||||
let described = if local_via_llamacpp {
|
||||
self.llamacpp
|
||||
.as_ref()
|
||||
.expect("local_via_llamacpp guarantees Some")
|
||||
.describe_image(b64)
|
||||
.await
|
||||
} else {
|
||||
self.ollama.describe_image(b64).await
|
||||
};
|
||||
@@ -4044,10 +4043,10 @@ Return ONLY the summary, nothing else."#,
|
||||
);
|
||||
|
||||
// 10. Define tools. Gate flags computed from current data presence;
|
||||
// describe-then-inline modes (hybrid, llamacpp) omit describe_photo
|
||||
// since the chat model receives the visual description inline (so
|
||||
// we pass `false` for has_vision in those modes regardless of the
|
||||
// model's actual capability).
|
||||
// describe-then-inline modes (hybrid OR local_via_llamacpp) omit
|
||||
// describe_photo since the chat model receives the visual
|
||||
// description inline (so we pass `false` for has_vision in
|
||||
// those modes regardless of the model's actual capability).
|
||||
let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines);
|
||||
let tools = Self::build_tool_definitions(gate_opts);
|
||||
|
||||
|
||||
@@ -11,10 +11,10 @@
|
||||
// `model` field, which is how llama-swap selects which backend process to
|
||||
// run.
|
||||
// - `/v1/models` returns only the configured slot ids — capabilities aren't
|
||||
// reported by the API, so `vision_models` is a config-time allowlist (env
|
||||
// `LLAMA_SWAP_VISION_MODELS`) used to set `has_vision` on responses.
|
||||
// `has_tool_calling` is assumed true for every slot, since llama-swap entries
|
||||
// default to launching llama-server with `--jinja`.
|
||||
// reported by the API, so we infer `has_vision` from a single config field
|
||||
// (`vision_model`, defaulting to `"vision"`) and assume `has_tool_calling`
|
||||
// is true for every slot, since llama-swap entries default to launching
|
||||
// llama-server with `--jinja`.
|
||||
//
|
||||
// First consumer is the `LLM_BACKEND`-driven local-stack dispatch in
|
||||
// insight_generator / insight_chat.
|
||||
@@ -50,16 +50,10 @@ pub struct LlamaCppClient {
|
||||
/// Embedding model slot id (e.g. `"embed"`). Used for
|
||||
/// `generate_embeddings`.
|
||||
pub embedding_model: String,
|
||||
/// Vision model slot id (e.g. `"vision"`). Used for `describe_image` and
|
||||
/// included in `vision_models` automatically so capability lookups for
|
||||
/// the default vision slot report `has_vision = true` even when the env
|
||||
/// allowlist is empty.
|
||||
/// Vision model slot id (e.g. `"vision"`). Used for `describe_image`,
|
||||
/// and the only slot that reports `has_vision = true` in capability
|
||||
/// lookups (llama-swap's `/v1/models` doesn't surface modality).
|
||||
pub vision_model: String,
|
||||
/// Operator-curated set of slot ids known to be multimodal. Drives the
|
||||
/// `has_vision` field in `list_models` / `model_capabilities`, since
|
||||
/// llama-swap's `/v1/models` doesn't report modality. Empty allowlist
|
||||
/// still marks `vision_model` as vision-capable.
|
||||
pub vision_models: Vec<String>,
|
||||
num_ctx: Option<i32>,
|
||||
temperature: Option<f32>,
|
||||
top_p: Option<f32>,
|
||||
@@ -83,7 +77,6 @@ impl LlamaCppClient {
|
||||
primary_model: primary_model.unwrap_or_else(|| DEFAULT_PRIMARY_MODEL.to_string()),
|
||||
embedding_model: DEFAULT_EMBEDDING_MODEL.to_string(),
|
||||
vision_model: DEFAULT_VISION_MODEL.to_string(),
|
||||
vision_models: Vec::new(),
|
||||
num_ctx: None,
|
||||
temperature: None,
|
||||
top_p: None,
|
||||
@@ -100,10 +93,6 @@ impl LlamaCppClient {
|
||||
self.vision_model = model;
|
||||
}
|
||||
|
||||
pub fn set_vision_models(&mut self, models: Vec<String>) {
|
||||
self.vision_models = models;
|
||||
}
|
||||
|
||||
pub fn set_num_ctx(&mut self, num_ctx: Option<i32>) {
|
||||
self.num_ctx = num_ctx;
|
||||
}
|
||||
@@ -692,7 +681,7 @@ impl LlamaCppClient {
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or_default()
|
||||
.to_string();
|
||||
let has_vision = name == self.vision_model || self.vision_models.iter().any(|v| v == &name);
|
||||
let has_vision = name == self.vision_model;
|
||||
// Tool calling is the default for llama-swap entries we configure
|
||||
// (--jinja flag); no negative-list mechanism yet, so report true.
|
||||
ModelCapabilities {
|
||||
@@ -954,25 +943,21 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn capability_inference_uses_vision_model_and_allowlist() {
|
||||
fn capability_inference_marks_only_vision_slot() {
|
||||
let mut c = LlamaCppClient::new(None, Some("chat".into()));
|
||||
c.set_vision_model("vision".into());
|
||||
c.set_vision_models(vec!["qwen-vl".into()]);
|
||||
|
||||
let m_chat = json!({ "id": "chat" });
|
||||
let m_vision = json!({ "id": "vision" });
|
||||
let m_qwen = json!({ "id": "qwen-vl" });
|
||||
let m_other = json!({ "id": "embed" });
|
||||
|
||||
let chat = c.parse_model_capabilities(&m_chat);
|
||||
let vision = c.parse_model_capabilities(&m_vision);
|
||||
let qwen = c.parse_model_capabilities(&m_qwen);
|
||||
let other = c.parse_model_capabilities(&m_other);
|
||||
|
||||
assert!(!chat.has_vision);
|
||||
assert!(chat.has_tool_calling);
|
||||
assert!(vision.has_vision);
|
||||
assert!(qwen.has_vision);
|
||||
assert!(!other.has_vision);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,14 +21,14 @@ pub use handlers::{
|
||||
chat_history_handler, chat_rewind_handler, chat_stream_handler, chat_turn_handler,
|
||||
delete_insight_handler, export_training_data_handler, generate_agentic_insight_handler,
|
||||
generate_insight_handler, get_all_insights_handler, get_available_models_handler,
|
||||
get_insight_handler, get_llamacpp_models_handler, get_openrouter_models_handler,
|
||||
rate_insight_handler,
|
||||
get_insight_handler, get_openrouter_models_handler, rate_insight_handler,
|
||||
};
|
||||
pub use insight_generator::InsightGenerator;
|
||||
#[allow(unused_imports)]
|
||||
pub use llm_client::{
|
||||
ChatMessage, LlmClient, ModelCapabilities, Tool, ToolCall, ToolCallFunction, ToolFunction,
|
||||
};
|
||||
pub use llamacpp::LlamaCppClient;
|
||||
pub use ollama::{EMBEDDING_MODEL, OllamaClient};
|
||||
pub use sms_client::{SmsApiClient, SmsMessage};
|
||||
|
||||
@@ -40,3 +40,87 @@ pub use sms_client::{SmsApiClient, SmsMessage};
|
||||
pub fn user_display_name() -> String {
    // Any lookup failure (variable unset, or its value is not valid
    // Unicode) falls back to the literal "Me". An empty-but-set value is
    // returned as-is.
    match std::env::var("USER_NAME") {
        Ok(configured) => configured,
        Err(_) => String::from("Me"),
    }
}
|
||||
|
||||
/// Reports whether the deployment selected llama-swap as its "local" LLM
/// stack.
///
/// Returns `true` only when the `LLM_BACKEND` environment variable, after
/// trimming surrounding whitespace and lowercasing, equals `"llamacpp"`.
/// Every other state — unset, not valid Unicode, or any other value —
/// yields `false`, which callers treat as "use Ollama".
///
/// The switch is deliberately process-wide rather than per-request:
/// embeddings have to come from one source, otherwise similarity search
/// over the index would compare vectors from different spaces (and
/// possibly different dimensions). The per-request `backend=hybrid`
/// override is orthogonal — it sends chat to OpenRouter and still relies
/// on this switch for the describe-then-inline vision pass.
pub fn local_backend_is_llamacpp() -> bool {
    match std::env::var("LLM_BACKEND") {
        Ok(raw) => raw.trim().to_lowercase() == "llamacpp",
        Err(_) => false,
    }
}
|
||||
|
||||
/// Embed one string via the configured local backend. Routes through
|
||||
/// llama-swap when `LLM_BACKEND=llamacpp` (and a client is configured),
|
||||
/// else Ollama. Returns the single embedding vector. See
|
||||
/// [`local_backend_is_llamacpp`] for the rationale on consistency.
|
||||
pub async fn embed_one(
|
||||
ollama: &OllamaClient,
|
||||
llamacpp: Option<&LlamaCppClient>,
|
||||
text: &str,
|
||||
) -> anyhow::Result<Vec<f32>> {
|
||||
if local_backend_is_llamacpp() {
|
||||
if let Some(lc) = llamacpp {
|
||||
let mut vecs = <LlamaCppClient as LlmClient>::generate_embeddings(lc, &[text]).await?;
|
||||
return vecs
|
||||
.pop()
|
||||
.ok_or_else(|| anyhow::anyhow!("llama-swap returned no embeddings"));
|
||||
}
|
||||
log::warn!(
|
||||
"LLM_BACKEND=llamacpp but LlamaCppClient is unconfigured; falling back to Ollama embeddings"
|
||||
);
|
||||
}
|
||||
ollama.generate_embedding(text).await
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod env_dispatch_tests {
    use super::*;
    use std::ffi::OsString;
    use std::sync::{Mutex, PoisonError};

    /// Serializes every test in this module that touches the process
    /// environment. The test harness runs `#[test]` functions on parallel
    /// threads by default, and all tests here mutate the same `LLM_BACKEND`
    /// variable, so without this lock they race and fail intermittently.
    static ENV_LOCK: Mutex<()> = Mutex::new(());

    /// Puts an environment variable back to its captured state on drop, so
    /// the restore also happens when the test closure panics.
    struct EnvRestore<'k> {
        key: &'k str,
        previous: Option<OsString>,
    }

    impl Drop for EnvRestore<'_> {
        fn drop(&mut self) {
            match self.previous.take() {
                // SAFETY: only reached while `with_env` holds `ENV_LOCK`,
                // which serializes environment mutation across this
                // module's tests.
                Some(v) => unsafe { std::env::set_var(self.key, v) },
                // SAFETY: as above.
                None => unsafe { std::env::remove_var(self.key) },
            }
        }
    }

    /// Runs `f` with environment variable `key` set to `val` (or removed
    /// when `val` is `None`), then restores whatever value it had before.
    ///
    /// Calls are mutually exclusive across threads, and the previous value
    /// is restored even if `f` panics. The previous value is captured with
    /// `var_os`, so a non-UTF-8 value survives the round trip.
    fn with_env<F: FnOnce()>(key: &str, val: Option<&str>, f: F) {
        // A poisoned lock only means an earlier test failed an assertion;
        // the guard restored the environment, so it is fine to continue.
        let _serial = ENV_LOCK.lock().unwrap_or_else(PoisonError::into_inner);
        // Declared after the lock guard so it drops first: the variable is
        // restored before the lock is released.
        let _restore = EnvRestore {
            key,
            previous: std::env::var_os(key),
        };
        match val {
            // SAFETY: `ENV_LOCK` is held, serializing environment mutation
            // across this module's tests.
            Some(v) => unsafe { std::env::set_var(key, v) },
            // SAFETY: as above.
            None => unsafe { std::env::remove_var(key) },
        }
        f();
    }

    #[test]
    fn llm_backend_defaults_to_ollama() {
        with_env("LLM_BACKEND", None, || {
            assert!(!local_backend_is_llamacpp());
        });
    }

    #[test]
    fn llm_backend_llamacpp_case_insensitive() {
        with_env("LLM_BACKEND", Some("LlamaCpp"), || {
            assert!(local_backend_is_llamacpp());
        });
        with_env("LLM_BACKEND", Some("  llamacpp  "), || {
            assert!(local_backend_is_llamacpp());
        });
    }

    #[test]
    fn llm_backend_unknown_value_is_ollama() {
        with_env("LLM_BACKEND", Some("vllm"), || {
            assert!(!local_backend_is_llamacpp());
        });
    }
}
|
||||
|
||||
@@ -313,7 +313,6 @@ fn main() -> std::io::Result<()> {
|
||||
.service(ai::get_all_insights_handler)
|
||||
.service(ai::get_available_models_handler)
|
||||
.service(ai::get_openrouter_models_handler)
|
||||
.service(ai::get_llamacpp_models_handler)
|
||||
.service(ai::chat_turn_handler)
|
||||
.service(ai::chat_stream_handler)
|
||||
.service(ai::chat_history_handler)
|
||||
|
||||
27
src/state.rs
27
src/state.rs
@@ -358,10 +358,11 @@ fn parse_openrouter_allowed_models() -> Vec<String> {
|
||||
}
|
||||
|
||||
/// Build a `LlamaCppClient` from environment variables. Returns `None` when
|
||||
/// `LLAMA_SWAP_URL` is unset (the llamacpp backend is then unavailable and
|
||||
/// requests for it return a clear error). The slot ids default to the
|
||||
/// names the bundled `llama-swap/config.yaml` uses — `chat` / `vision` /
|
||||
/// `embed` — so a minimal deploy only needs to set `LLAMA_SWAP_URL`.
|
||||
/// `LLAMA_SWAP_URL` is unset. The client is constructed unconditionally
|
||||
/// when the URL is set (so it's available even under `LLM_BACKEND=ollama`
|
||||
/// for ad-hoc tooling), but the agentic / chat paths only route through it
|
||||
/// when `LLM_BACKEND=llamacpp`. Slot ids default to the names the bundled
|
||||
/// `llama-swap/config.yaml` uses — `chat` / `vision` / `embed`.
|
||||
fn build_llamacpp_from_env() -> Option<Arc<LlamaCppClient>> {
|
||||
let base_url = env::var("LLAMA_SWAP_URL").ok()?;
|
||||
let primary_model = env::var("LLAMA_SWAP_PRIMARY_MODEL").ok();
|
||||
@@ -372,12 +373,12 @@ fn build_llamacpp_from_env() -> Option<Arc<LlamaCppClient>> {
|
||||
if let Ok(model) = env::var("LLAMA_SWAP_VISION_MODEL") {
|
||||
client.set_vision_model(model);
|
||||
}
|
||||
client.set_vision_models(parse_llamacpp_vision_models());
|
||||
Some(Arc::new(client))
|
||||
}
|
||||
|
||||
/// Parse `LLAMA_SWAP_ALLOWED_MODELS` (comma-separated) into a vec. Used to
|
||||
/// drive `/insights/llamacpp/models`; empty when unset.
|
||||
/// populate the model picker when `LLM_BACKEND=llamacpp` — `/insights/models`
|
||||
/// surfaces these slots with capabilities. Empty when unset.
|
||||
fn parse_llamacpp_allowed_models() -> Vec<String> {
|
||||
env::var("LLAMA_SWAP_ALLOWED_MODELS")
|
||||
.unwrap_or_default()
|
||||
@@ -387,20 +388,6 @@ fn parse_llamacpp_allowed_models() -> Vec<String> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
/// Read the comma-separated `LLAMA_SWAP_VISION_MODELS` environment variable
/// into a list of slot ids.
///
/// Each entry is trimmed and blank entries are dropped; an unset (or
/// unreadable) variable produces an empty list. The list names slots that
/// should report `has_vision = true` in capability lookups. The configured
/// `vision_model` (default `vision`) counts as vision-capable regardless of
/// this list, so deployments using only the default slot can leave the
/// variable unset.
fn parse_llamacpp_vision_models() -> Vec<String> {
    let raw = match env::var("LLAMA_SWAP_VISION_MODELS") {
        Ok(value) => value,
        Err(_) => String::new(),
    };

    let mut slot_ids = Vec::new();
    for piece in raw.split(',') {
        let id = piece.trim();
        if !id.is_empty() {
            slot_ids.push(id.to_string());
        }
    }
    slot_ids
}
|
||||
|
||||
#[cfg(test)]
|
||||
impl AppState {
|
||||
/// Creates an AppState instance for testing with temporary directories
|
||||
|
||||
Reference in New Issue
Block a user