ai: collapse llamacpp into LLM_BACKEND env switch
Reverts the per-request backend="llamacpp" value. The chat/vision/embedding backend is now a deploy-time decision (LLM_BACKEND=ollama|llamacpp), applied globally across chat, vision describe, and embeddings — so embedding vectors stay in one space across the index.

- The per-request backend whitelist is back to "local"|"hybrid". A request arriving with backend="llamacpp" is rejected.
- LLM_BACKEND=llamacpp swaps the entire local stack to llama-swap: chat hits the chat slot, describe hits the vision slot, embeddings hit the embed slot. Hybrid mode still routes chat to OpenRouter but uses LLM_BACKEND for the describe pass.
- Drops env vars HYBRID_VISION_BACKEND, LLAMA_SWAP_VISION_MODELS, EMBEDDING_BACKEND (the last never shipped). Drops the LlamaCppClient.vision_models allowlist — capability inference now reports has_vision only for the configured vision_model slot.
- Drops the /insights/llamacpp/models handler. /insights/models is the single endpoint; it returns Ollama servers under LLM_BACKEND=ollama and llama-swap slots (from LLAMA_SWAP_ALLOWED_MODELS) under LLM_BACKEND=llamacpp. Same envelope shape either way.
- New ai::embed_one helper routes embeddings through llama-swap when LLM_BACKEND=llamacpp (else Ollama). Wires it into the four insight_generator embedding sites.
- The cross-replay matrix simplifies to its pre-llamacpp shape (local↔local, hybrid↔hybrid, hybrid→local allowed; local→hybrid rejected).
This commit is contained in:
+68
-103
@@ -309,14 +309,15 @@ impl InsightChatService {
|
||||
.unwrap_or_else(|| stored_backend.clone());
|
||||
validate_cross_replay(&stored_backend, &effective_backend)?;
|
||||
let is_hybrid = effective_backend == "hybrid";
|
||||
let is_llamacpp = effective_backend == "llamacpp";
|
||||
let describes_then_inlines = is_hybrid || is_llamacpp;
|
||||
let local_via_llamacpp =
|
||||
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
|
||||
let describes_then_inlines = is_hybrid || local_via_llamacpp;
|
||||
span.set_attribute(KeyValue::new("backend", effective_backend.clone()));
|
||||
|
||||
// 4. Build the chat backend client. Ollama in local mode, a freshly
|
||||
// cloned OpenRouter client in hybrid mode, a freshly cloned
|
||||
// LlamaCppClient in llamacpp mode (clone so per-request
|
||||
// sampling/model overrides don't leak into shared state).
|
||||
// 4. Build the chat backend client. Hybrid → OpenRouter; local with
|
||||
// `LLM_BACKEND=llamacpp` → llama-swap; otherwise Ollama. Clones
|
||||
// so per-request sampling/model overrides don't leak into shared
|
||||
// state.
|
||||
let max_iterations = req
|
||||
.max_iterations
|
||||
.unwrap_or(DEFAULT_MAX_ITERATIONS)
|
||||
@@ -353,9 +354,9 @@ impl InsightChatService {
|
||||
c.set_num_ctx(Some(ctx));
|
||||
}
|
||||
openrouter_client = Some(c);
|
||||
} else if is_llamacpp {
|
||||
} else if local_via_llamacpp {
|
||||
let arc = self.llamacpp.as_ref().ok_or_else(|| {
|
||||
anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
|
||||
anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured")
|
||||
})?;
|
||||
let mut c: LlamaCppClient = (**arc).clone();
|
||||
if let Some(ref m) = custom_model {
|
||||
@@ -373,8 +374,8 @@ impl InsightChatService {
|
||||
}
|
||||
llamacpp_client = Some(c);
|
||||
} else {
|
||||
// Local-mode model swap. Build a new client when the chat model
|
||||
// differs from the configured one (mirrors the agentic pattern).
|
||||
// Pure local (Ollama): model swap. Build a new client when the
|
||||
// chat model differs from the configured one.
|
||||
if let Some(ref m) = custom_model
|
||||
&& m != &self.ollama.primary_model
|
||||
{
|
||||
@@ -820,8 +821,9 @@ impl InsightChatService {
|
||||
.unwrap_or_else(|| stored_backend.clone());
|
||||
validate_cross_replay(&stored_backend, &effective_backend)?;
|
||||
let is_hybrid = effective_backend == "hybrid";
|
||||
let is_llamacpp = effective_backend == "llamacpp";
|
||||
let describes_then_inlines = is_hybrid || is_llamacpp;
|
||||
let local_via_llamacpp =
|
||||
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
|
||||
let describes_then_inlines = is_hybrid || local_via_llamacpp;
|
||||
|
||||
let max_iterations = req
|
||||
.max_iterations
|
||||
@@ -841,9 +843,9 @@ impl InsightChatService {
|
||||
let model_used = chat_backend.primary_model().to_string();
|
||||
|
||||
// Tool set — local mode + first user turn carries an image →
|
||||
// offer describe_photo. Describe-then-inline modes (hybrid /
|
||||
// llamacpp): visual description was inlined when the insight was
|
||||
// bootstrapped, no describe tool needed.
|
||||
// offer describe_photo. Describe-then-inline modes (hybrid OR
|
||||
// local_via_llamacpp): visual description was inlined when the
|
||||
// insight was bootstrapped, no describe tool needed.
|
||||
let local_first_user_has_image = messages
|
||||
.iter()
|
||||
.find(|m| m.role == "user")
|
||||
@@ -987,8 +989,9 @@ impl InsightChatService {
|
||||
.unwrap_or_else(|| "default".to_string());
|
||||
let effective_backend = resolve_bootstrap_backend(req.backend.as_deref())?;
|
||||
let is_hybrid = effective_backend == "hybrid";
|
||||
let is_llamacpp = effective_backend == "llamacpp";
|
||||
let describes_then_inlines = is_hybrid || is_llamacpp;
|
||||
let local_via_llamacpp =
|
||||
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
|
||||
let describes_then_inlines = is_hybrid || local_via_llamacpp;
|
||||
|
||||
let max_iterations = req
|
||||
.max_iterations
|
||||
@@ -1020,35 +1023,19 @@ impl InsightChatService {
|
||||
_ => None,
|
||||
});
|
||||
|
||||
// Describe-then-inline backends (hybrid, llamacpp): pre-describe the
|
||||
// image so a text-only chat model gets the visual description inline.
|
||||
// Vision source: llamacpp's vision slot in llamacpp mode; in hybrid
|
||||
// mode Ollama by default, llamacpp via `HYBRID_VISION_BACKEND=llamacpp`.
|
||||
// Describe-then-inline (hybrid OR local_via_llamacpp): pre-describe
|
||||
// the image so a text-only chat model gets the visual description
|
||||
// inline. Vision source follows `LLM_BACKEND`: llama-swap when
|
||||
// `local_via_llamacpp`, else Ollama.
|
||||
let visual_block = if describes_then_inlines {
|
||||
match image_base64.as_deref() {
|
||||
Some(b64) => {
|
||||
let use_llamacpp_vision = if is_llamacpp {
|
||||
true
|
||||
} else {
|
||||
matches!(
|
||||
std::env::var("HYBRID_VISION_BACKEND")
|
||||
.ok()
|
||||
.as_deref()
|
||||
.map(|s| s.trim().to_lowercase())
|
||||
.as_deref(),
|
||||
Some("llamacpp")
|
||||
)
|
||||
};
|
||||
let described = if use_llamacpp_vision {
|
||||
match self.llamacpp.as_ref() {
|
||||
Some(c) => c.describe_image(b64).await,
|
||||
None => {
|
||||
log::warn!(
|
||||
"bootstrap: requested llamacpp vision but LLAMA_SWAP_URL unset; falling back to Ollama"
|
||||
);
|
||||
self.ollama.describe_image(b64).await
|
||||
}
|
||||
}
|
||||
let described = if local_via_llamacpp {
|
||||
self.llamacpp
|
||||
.as_ref()
|
||||
.expect("local_via_llamacpp guarantees Some")
|
||||
.describe_image(b64)
|
||||
.await
|
||||
} else {
|
||||
self.ollama.describe_image(b64).await
|
||||
};
|
||||
@@ -1175,8 +1162,11 @@ impl InsightChatService {
|
||||
/// (boxed because each backend has a different concrete type) and the
|
||||
/// Ollama client used for describe-image / local tool calls.
|
||||
///
|
||||
/// `effective_backend` must be one of `"local"`, `"hybrid"`, `"llamacpp"`
|
||||
/// (validated upstream).
|
||||
/// `effective_backend` must be one of `"local"` or `"hybrid"` (validated
|
||||
/// upstream). Hybrid → OpenRouter; local with `LLM_BACKEND=llamacpp` →
|
||||
/// llama-swap; pure local → Ollama. Returns the dispatched chat client
|
||||
/// plus the (possibly per-request) Ollama client that the caller uses
|
||||
/// for non-chat helpers (image describe in non-llamacpp mode, tool ops).
|
||||
fn build_chat_clients(
|
||||
&self,
|
||||
effective_backend: &str,
|
||||
@@ -1206,10 +1196,10 @@ impl InsightChatService {
|
||||
return Ok((Box::new(c), ollama_client));
|
||||
}
|
||||
|
||||
if effective_backend == "llamacpp" {
|
||||
let arc = self.llamacpp.as_ref().ok_or_else(|| {
|
||||
anyhow!("llamacpp backend unavailable: LLAMA_SWAP_URL not configured")
|
||||
})?;
|
||||
// Local mode — env switch decides between Ollama and llama-swap.
|
||||
if crate::ai::local_backend_is_llamacpp()
|
||||
&& let Some(arc) = self.llamacpp.as_ref()
|
||||
{
|
||||
let mut c: LlamaCppClient = (**arc).clone();
|
||||
if let Some(m) = custom_model {
|
||||
c.primary_model = m.to_string();
|
||||
@@ -1525,41 +1515,26 @@ fn resolve_date_taken_for_context(
|
||||
|
||||
/// Validate a stored→effective backend transition for a chat continuation.
|
||||
/// Continuation runs against a transcript that was generated with a specific
|
||||
/// backend; some transitions break the conversation shape:
|
||||
/// backend; the only blocked transition is `local → hybrid`, because the
|
||||
/// stored transcript has images embedded in the first user message and the
|
||||
/// hybrid path (OpenRouter chat with describe-then-inline) can't replay
|
||||
/// raw image bytes through OpenRouter consistently across providers.
|
||||
/// `hybrid → local` is allowed (the inlined description replays verbatim
|
||||
/// as text).
|
||||
///
|
||||
/// - `local → hybrid` — the stored transcript has images embedded in the
|
||||
/// first user message; the openrouter chat client surfaces them through
|
||||
/// the wire, but vision-only models routed via the hybrid path may not
|
||||
/// accept that shape consistently across providers. Reject to keep the
|
||||
/// `regenerate-in-hybrid-mode` workflow as the supported answer.
|
||||
/// - `llamacpp → hybrid` — the stored transcript already has an inlined
|
||||
/// visual description produced by llama-swap's vision slot. Switching
|
||||
/// to hybrid mid-conversation would mix description sources across
|
||||
/// subsequent turns (any new image in the chat continuation would be
|
||||
/// described by ollama-vision while the original was described by
|
||||
/// llama-vision). Reject for consistency.
|
||||
///
|
||||
/// All other transitions are allowed. `local ↔ llamacpp` works because
|
||||
/// LlamaCppClient passes image content-parts through to the chat slot —
|
||||
/// the user is responsible for picking a vision-capable chat model in
|
||||
/// that case. `hybrid ↔ llamacpp` works because both transcripts are
|
||||
/// text-only (visual description inlined at bootstrap).
|
||||
/// Whether "local" routes through Ollama or llama-swap is decided at
|
||||
/// startup by `LLM_BACKEND`; both share the same transcript shape from
|
||||
/// the chat-replay perspective.
|
||||
fn validate_cross_replay(stored: &str, effective: &str) -> Result<()> {
|
||||
if !matches!(effective, "local" | "hybrid" | "llamacpp") {
|
||||
if !matches!(effective, "local" | "hybrid") {
|
||||
bail!(
|
||||
"unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
|
||||
"unknown backend '{}'; expected 'local' or 'hybrid'",
|
||||
effective
|
||||
);
|
||||
}
|
||||
if stored == "local" && effective == "hybrid" {
|
||||
bail!(
|
||||
"switching from local to hybrid mid-chat isn't supported yet; \
|
||||
regenerate the insight in hybrid mode if you want OpenRouter chat"
|
||||
);
|
||||
}
|
||||
if stored == "llamacpp" && effective == "hybrid" {
|
||||
bail!(
|
||||
"switching from llamacpp to hybrid mid-chat isn't supported yet; \
|
||||
"switching from local to hybrid mid-chat isn't supported; \
|
||||
regenerate the insight in hybrid mode if you want OpenRouter chat"
|
||||
);
|
||||
}
|
||||
@@ -1576,9 +1551,9 @@ fn resolve_bootstrap_backend(supplied: Option<&str>) -> Result<String> {
|
||||
.map(|s| s.trim().to_lowercase())
|
||||
.filter(|s| !s.is_empty())
|
||||
.unwrap_or_else(|| "local".to_string());
|
||||
if !matches!(lower.as_str(), "local" | "hybrid" | "llamacpp") {
|
||||
if !matches!(lower.as_str(), "local" | "hybrid") {
|
||||
bail!(
|
||||
"unknown backend '{}'; expected 'local', 'hybrid', or 'llamacpp'",
|
||||
"unknown backend '{}'; expected 'local' or 'hybrid'",
|
||||
lower
|
||||
);
|
||||
}
|
||||
@@ -2184,10 +2159,6 @@ mod tests {
|
||||
fn bootstrap_backend_accepts_local_and_hybrid_case_insensitively() {
|
||||
assert_eq!(resolve_bootstrap_backend(Some("LOCAL")).unwrap(), "local");
|
||||
assert_eq!(resolve_bootstrap_backend(Some("Hybrid")).unwrap(), "hybrid");
|
||||
assert_eq!(
|
||||
resolve_bootstrap_backend(Some("Llamacpp")).unwrap(),
|
||||
"llamacpp"
|
||||
);
|
||||
assert_eq!(
|
||||
resolve_bootstrap_backend(Some(" local ")).unwrap(),
|
||||
"local"
|
||||
@@ -2196,10 +2167,13 @@ mod tests {
|
||||
|
||||
#[test]
|
||||
fn bootstrap_backend_rejects_unknown_label() {
|
||||
let err = resolve_bootstrap_backend(Some("openrouter")).unwrap_err();
|
||||
let msg = format!("{}", err);
|
||||
assert!(msg.contains("unknown backend"));
|
||||
assert!(msg.contains("openrouter"));
|
||||
// `llamacpp` is no longer a per-request backend value — it's chosen
|
||||
// at deploy time via `LLM_BACKEND`.
|
||||
for label in &["openrouter", "llamacpp", "ollama"] {
|
||||
let err = resolve_bootstrap_backend(Some(label)).unwrap_err();
|
||||
let msg = format!("{}", err);
|
||||
assert!(msg.contains("unknown backend"), "label={}", label);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
@@ -2209,29 +2183,20 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cross_replay_rejects_llamacpp_to_hybrid() {
|
||||
let err = validate_cross_replay("llamacpp", "hybrid").unwrap_err();
|
||||
assert!(format!("{}", err).contains("llamacpp to hybrid"));
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cross_replay_allows_local_llamacpp_and_hybrid_llamacpp_transitions() {
|
||||
// Local ↔ llamacpp: user is responsible for picking a vision-capable
|
||||
// chat slot when the transcript has images.
|
||||
assert!(validate_cross_replay("local", "llamacpp").is_ok());
|
||||
assert!(validate_cross_replay("llamacpp", "local").is_ok());
|
||||
// Hybrid ↔ llamacpp: both transcripts are text-only.
|
||||
assert!(validate_cross_replay("hybrid", "llamacpp").is_ok());
|
||||
// Same-backend replays are always fine.
|
||||
fn cross_replay_allows_supported_transitions() {
|
||||
assert!(validate_cross_replay("local", "local").is_ok());
|
||||
assert!(validate_cross_replay("hybrid", "hybrid").is_ok());
|
||||
assert!(validate_cross_replay("llamacpp", "llamacpp").is_ok());
|
||||
// Hybrid → local replays the inlined description as plain text.
|
||||
assert!(validate_cross_replay("hybrid", "local").is_ok());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn cross_replay_rejects_unknown_effective() {
|
||||
let err = validate_cross_replay("local", "openrouter").unwrap_err();
|
||||
assert!(format!("{}", err).contains("unknown backend"));
|
||||
// Both "openrouter" and the former "llamacpp" value are unknown now.
|
||||
for label in &["openrouter", "llamacpp"] {
|
||||
let err = validate_cross_replay("local", label).unwrap_err();
|
||||
assert!(format!("{}", err).contains("unknown backend"), "label={}", label);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
|
||||
Reference in New Issue
Block a user