feature/llamacpp-backend #101
118
src/ai/backend.rs
Normal file
118
src/ai/backend.rs
Normal file
@@ -0,0 +1,118 @@
|
||||
use anyhow::{Result, anyhow};
|
||||
|
||||
use crate::ai::llm_client::LlmClient;
|
||||
|
||||
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
||||
pub enum BackendKind {
|
||||
Local,
|
||||
Hybrid,
|
||||
}
|
||||
|
||||
impl BackendKind {
|
||||
pub fn parse(s: &str) -> Result<Self> {
|
||||
match s.trim().to_lowercase().as_str() {
|
||||
"local" | "" => Ok(Self::Local),
|
||||
"hybrid" => Ok(Self::Hybrid),
|
||||
other => Err(anyhow!("unknown backend '{}'; expected 'local' or 'hybrid'", other)),
|
||||
}
|
||||
}
|
||||
|
||||
pub fn as_str(&self) -> &'static str {
|
||||
match self {
|
||||
Self::Local => "local",
|
||||
Self::Hybrid => "hybrid",
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl std::fmt::Display for BackendKind {
|
||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||
f.write_str(self.as_str())
|
||||
}
|
||||
}
|
||||
|
||||
pub struct SamplingOverrides {
|
||||
pub model: Option<String>,
|
||||
pub num_ctx: Option<i32>,
|
||||
pub temperature: Option<f32>,
|
||||
pub top_p: Option<f32>,
|
||||
pub top_k: Option<i32>,
|
||||
pub min_p: Option<f32>,
|
||||
}
|
||||
|
||||
impl SamplingOverrides {
|
||||
pub fn has_sampling(&self) -> bool {
|
||||
self.temperature.is_some()
|
||||
|| self.top_p.is_some()
|
||||
|| self.top_k.is_some()
|
||||
|| self.min_p.is_some()
|
||||
}
|
||||
}
|
||||
|
||||
pub struct ResolvedBackend {
|
||||
chat: Box<dyn LlmClient>,
|
||||
local: Box<dyn LlmClient>,
|
||||
pub kind: BackendKind,
|
||||
/// `true` when the chat model receives images directly (Ollama with
|
||||
/// vision, or llamacpp). `false` for hybrid where we describe-then-inline.
|
||||
pub images_inline: bool,
|
||||
}
|
||||
|
||||
impl ResolvedBackend {
|
||||
pub fn new(
|
||||
chat: Box<dyn LlmClient>,
|
||||
local: Box<dyn LlmClient>,
|
||||
kind: BackendKind,
|
||||
images_inline: bool,
|
||||
) -> Self {
|
||||
Self { chat, local, kind, images_inline }
|
||||
}
|
||||
|
||||
pub fn chat(&self) -> &dyn LlmClient {
|
||||
self.chat.as_ref()
|
||||
}
|
||||
|
||||
pub fn local(&self) -> &dyn LlmClient {
|
||||
self.local.as_ref()
|
||||
}
|
||||
|
||||
pub fn model(&self) -> &str {
|
||||
self.chat.primary_model()
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
fn parse_backend_kind() {
|
||||
assert_eq!(BackendKind::parse("local").unwrap(), BackendKind::Local);
|
||||
assert_eq!(BackendKind::parse("hybrid").unwrap(), BackendKind::Hybrid);
|
||||
assert_eq!(BackendKind::parse(" Local ").unwrap(), BackendKind::Local);
|
||||
assert_eq!(BackendKind::parse("HYBRID").unwrap(), BackendKind::Hybrid);
|
||||
assert_eq!(BackendKind::parse("").unwrap(), BackendKind::Local);
|
||||
assert!(BackendKind::parse("vllm").is_err());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn backend_kind_as_str_roundtrips() {
|
||||
assert_eq!(BackendKind::parse(BackendKind::Local.as_str()).unwrap(), BackendKind::Local);
|
||||
assert_eq!(BackendKind::parse(BackendKind::Hybrid.as_str()).unwrap(), BackendKind::Hybrid);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn sampling_overrides_has_sampling() {
|
||||
let empty = SamplingOverrides {
|
||||
model: None, num_ctx: None, temperature: None,
|
||||
top_p: None, top_k: None, min_p: None,
|
||||
};
|
||||
assert!(!empty.has_sampling());
|
||||
|
||||
let with_temp = SamplingOverrides {
|
||||
model: None, num_ctx: Some(4096), temperature: Some(0.7),
|
||||
top_p: None, top_k: None, min_p: None,
|
||||
};
|
||||
assert!(with_temp.has_sampling());
|
||||
}
|
||||
}
|
||||
@@ -495,7 +495,7 @@ pub async fn get_available_models_handler(
|
||||
.iter()
|
||||
.map(|name| ModelCapabilities {
|
||||
name: name.clone(),
|
||||
has_vision: name == &lc.vision_model,
|
||||
has_vision: true,
|
||||
has_tool_calling: true,
|
||||
})
|
||||
.collect();
|
||||
|
||||
@@ -311,7 +311,7 @@ impl InsightChatService {
|
||||
let is_hybrid = effective_backend == "hybrid";
|
||||
let local_via_llamacpp =
|
||||
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
|
||||
let describes_then_inlines = is_hybrid || local_via_llamacpp;
|
||||
let describes_then_inlines = is_hybrid;
|
||||
span.set_attribute(KeyValue::new("backend", effective_backend.clone()));
|
||||
|
||||
// 4. Build the chat backend client. Hybrid → OpenRouter; local with
|
||||
@@ -408,12 +408,11 @@ impl InsightChatService {
|
||||
let model_used = chat_backend.primary_model().to_string();
|
||||
span.set_attribute(KeyValue::new("model", model_used.clone()));
|
||||
|
||||
// 5. Decide vision + tool set. In describe-then-inline modes
|
||||
// (hybrid, llamacpp) we always omit `describe_photo` (matches the
|
||||
// original generation flow). In local we trust the stored
|
||||
// history's first-user shape: if it carries `images`, the
|
||||
// original model was vision-capable, and we keep `describe_photo`
|
||||
// available.
|
||||
// 5. Decide vision + tool set. In describe-then-inline mode
|
||||
// (hybrid only) we omit `describe_photo`. In local and llamacpp
|
||||
// we trust the stored history's first-user shape: if it carries
|
||||
// `images`, the original model was vision-capable, and we keep
|
||||
// `describe_photo` available.
|
||||
let local_first_user_has_image = messages
|
||||
.iter()
|
||||
.find(|m| m.role == "user")
|
||||
@@ -821,9 +820,7 @@ impl InsightChatService {
|
||||
.unwrap_or_else(|| stored_backend.clone());
|
||||
validate_cross_replay(&stored_backend, &effective_backend)?;
|
||||
let is_hybrid = effective_backend == "hybrid";
|
||||
let local_via_llamacpp =
|
||||
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
|
||||
let describes_then_inlines = is_hybrid || local_via_llamacpp;
|
||||
let describes_then_inlines = is_hybrid;
|
||||
|
||||
let max_iterations = req
|
||||
.max_iterations
|
||||
@@ -842,10 +839,9 @@ impl InsightChatService {
|
||||
let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
|
||||
let model_used = chat_backend.primary_model().to_string();
|
||||
|
||||
// Tool set — local mode + first user turn carries an image →
|
||||
// offer describe_photo. Describe-then-inline modes (hybrid OR
|
||||
// local_via_llamacpp): visual description was inlined when the
|
||||
// insight was bootstrapped, no describe tool needed.
|
||||
// Tool set — local/llamacpp mode + first user turn carries an image →
|
||||
// offer describe_photo. Describe-then-inline mode (hybrid only):
|
||||
// visual description was inlined at bootstrap, no describe tool needed.
|
||||
let local_first_user_has_image = messages
|
||||
.iter()
|
||||
.find(|m| m.role == "user")
|
||||
@@ -991,7 +987,7 @@ impl InsightChatService {
|
||||
let is_hybrid = effective_backend == "hybrid";
|
||||
let local_via_llamacpp =
|
||||
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
|
||||
let describes_then_inlines = is_hybrid || local_via_llamacpp;
|
||||
let describes_then_inlines = is_hybrid;
|
||||
|
||||
let max_iterations = req
|
||||
.max_iterations
|
||||
@@ -1023,10 +1019,9 @@ impl InsightChatService {
|
||||
_ => None,
|
||||
});
|
||||
|
||||
// Describe-then-inline (hybrid OR local_via_llamacpp): pre-describe
|
||||
// the image so a text-only chat model gets the visual description
|
||||
// inline. Vision source follows `LLM_BACKEND`: llama-swap when
|
||||
// `local_via_llamacpp`, else Ollama.
|
||||
// Describe-then-inline (hybrid only): pre-describe the image so a
|
||||
// text-only chat model gets the visual description inline. llamacpp
|
||||
// sends images directly to the chat model.
|
||||
let visual_block = if describes_then_inlines {
|
||||
match image_base64.as_deref() {
|
||||
Some(b64) => {
|
||||
|
||||
@@ -10,6 +10,7 @@ use std::io::Cursor;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use crate::ai::apollo_client::{ApolloClient, ApolloPlace};
|
||||
use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides};
|
||||
use crate::ai::llm_client::LlmClient;
|
||||
use crate::ai::ollama::{ChatMessage, OllamaClient, Tool};
|
||||
use crate::ai::llamacpp::LlamaCppClient;
|
||||
@@ -1781,14 +1782,18 @@ Return ONLY the summary, nothing else."#,
|
||||
);
|
||||
|
||||
let started = std::time::Instant::now();
|
||||
let response = ollama
|
||||
.generate_no_think(
|
||||
&prompt,
|
||||
Some(
|
||||
let system = Some(
|
||||
"You are a terse relevance ranker. You output only numbers separated by commas.",
|
||||
),
|
||||
)
|
||||
.await?;
|
||||
);
|
||||
let response = if crate::ai::local_backend_is_llamacpp() {
|
||||
if let Some(ref lc) = self.llamacpp {
|
||||
lc.generate(&prompt, system, None).await?
|
||||
} else {
|
||||
ollama.generate_no_think(&prompt, system).await?
|
||||
}
|
||||
} else {
|
||||
ollama.generate_no_think(&prompt, system).await?
|
||||
};
|
||||
log::info!(
|
||||
"rerank: finished in {} ms (prompt={} chars)",
|
||||
started.elapsed().as_millis(),
|
||||
@@ -2360,7 +2365,8 @@ Return ONLY the summary, nothing else."#,
|
||||
out
|
||||
}
|
||||
|
||||
/// Tool: describe_photo — generate a visual description of the photo
|
||||
/// Tool: describe_photo — generate a visual description of the photo.
|
||||
/// Routes through llama-swap when `LLM_BACKEND=llamacpp`, Ollama otherwise.
|
||||
async fn tool_describe_photo(
|
||||
&self,
|
||||
ollama: &OllamaClient,
|
||||
@@ -2369,10 +2375,21 @@ Return ONLY the summary, nothing else."#,
|
||||
log::info!("tool_describe_photo: generating visual description");
|
||||
|
||||
match image_base64 {
|
||||
Some(img) => match ollama.generate_photo_description(img).await {
|
||||
Some(img) => {
|
||||
let result = if crate::ai::local_backend_is_llamacpp() {
|
||||
if let Some(ref lc) = self.llamacpp {
|
||||
lc.describe_image(img).await
|
||||
} else {
|
||||
ollama.generate_photo_description(img).await
|
||||
}
|
||||
} else {
|
||||
ollama.generate_photo_description(img).await
|
||||
};
|
||||
match result {
|
||||
Ok(desc) => desc,
|
||||
Err(e) => format!("Error describing photo: {}", e),
|
||||
},
|
||||
}
|
||||
}
|
||||
None => "No image available for description.".to_string(),
|
||||
}
|
||||
}
|
||||
@@ -3560,6 +3577,177 @@ Return ONLY the summary, nothing else."#,
|
||||
out
|
||||
}
|
||||
|
||||
/// Consolidate client construction for the agentic insight loop.
|
||||
///
|
||||
/// Returns a [`ResolvedBackend`] containing the **chat** client (the model
|
||||
/// that drives the agent loop), the **local** client (always the configured
|
||||
/// local backend — Ollama or llama-swap — for utility calls like
|
||||
/// describe_image, rerank, embeddings), the backend kind, and whether the
|
||||
/// chat model receives images inline.
|
||||
pub async fn resolve_backend(
|
||||
&self,
|
||||
kind: BackendKind,
|
||||
overrides: &SamplingOverrides,
|
||||
) -> Result<ResolvedBackend> {
|
||||
let local_via_llamacpp =
|
||||
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
|
||||
let is_hybrid = kind == BackendKind::Hybrid;
|
||||
|
||||
// ── chat client ────────────────────────────────────────────────
|
||||
let chat: Box<dyn LlmClient> = if is_hybrid {
|
||||
// Hybrid: chat through OpenRouter.
|
||||
let arc = self.openrouter.as_ref().ok_or_else(|| {
|
||||
anyhow::anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
|
||||
})?;
|
||||
let mut c: OpenRouterClient = (**arc).clone();
|
||||
if let Some(ref m) = overrides.model {
|
||||
c.primary_model = m.clone();
|
||||
}
|
||||
if overrides.has_sampling() {
|
||||
c.set_sampling_params(
|
||||
overrides.temperature,
|
||||
overrides.top_p,
|
||||
overrides.top_k,
|
||||
overrides.min_p,
|
||||
);
|
||||
}
|
||||
if let Some(ctx) = overrides.num_ctx {
|
||||
c.set_num_ctx(Some(ctx));
|
||||
}
|
||||
Box::new(c)
|
||||
} else if local_via_llamacpp {
|
||||
// Local via llama-swap.
|
||||
let arc = self.llamacpp.as_ref().ok_or_else(|| {
|
||||
anyhow::anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured")
|
||||
})?;
|
||||
let mut c: LlamaCppClient = (**arc).clone();
|
||||
if let Some(ref m) = overrides.model {
|
||||
c.primary_model = m.clone();
|
||||
}
|
||||
if overrides.has_sampling() {
|
||||
c.set_sampling_params(
|
||||
overrides.temperature,
|
||||
overrides.top_p,
|
||||
overrides.top_k,
|
||||
overrides.min_p,
|
||||
);
|
||||
}
|
||||
if let Some(ctx) = overrides.num_ctx {
|
||||
c.set_num_ctx(Some(ctx));
|
||||
}
|
||||
Box::new(c)
|
||||
} else {
|
||||
// Pure Ollama local.
|
||||
let mut ollama_client = if let Some(ref model) = overrides.model {
|
||||
OllamaClient::new(
|
||||
self.ollama.primary_url.clone(),
|
||||
self.ollama.fallback_url.clone(),
|
||||
model.clone(),
|
||||
Some(model.clone()),
|
||||
)
|
||||
} else {
|
||||
self.ollama.clone()
|
||||
};
|
||||
if overrides.has_sampling() {
|
||||
ollama_client.set_sampling_params(
|
||||
overrides.temperature,
|
||||
overrides.top_p,
|
||||
overrides.top_k,
|
||||
overrides.min_p,
|
||||
);
|
||||
}
|
||||
if let Some(ctx) = overrides.num_ctx {
|
||||
ollama_client.set_num_ctx(Some(ctx));
|
||||
}
|
||||
Box::new(ollama_client)
|
||||
};
|
||||
|
||||
// ── local client (utility calls: rerank, describe_image, etc.) ─
|
||||
let local: Box<dyn LlmClient> = if local_via_llamacpp {
|
||||
Box::new(self.llamacpp.as_ref().unwrap().as_ref().clone())
|
||||
} else {
|
||||
Box::new(self.ollama.clone())
|
||||
};
|
||||
|
||||
// ── images_inline ──────────────────────────────────────────────
|
||||
let images_inline = if is_hybrid {
|
||||
// Hybrid: chat model never sees images — describe-then-inject.
|
||||
false
|
||||
} else if local_via_llamacpp {
|
||||
// llama-swap models receive images directly via OpenAI content
|
||||
// parts. Capability probing isn't available (no `/api/show`),
|
||||
// so assume vision support; a misconfigured model surfaces as
|
||||
// a chat-call error.
|
||||
true
|
||||
} else {
|
||||
// Pure Ollama: probe model capabilities.
|
||||
let ollama_for_caps = if let Some(ref model) = overrides.model {
|
||||
// Verify custom model is available on at least one server.
|
||||
let available_on_primary =
|
||||
OllamaClient::is_model_available(&self.ollama.primary_url, model)
|
||||
.await
|
||||
.unwrap_or(false);
|
||||
|
||||
let available_on_fallback =
|
||||
if let Some(ref fallback_url) = self.ollama.fallback_url {
|
||||
OllamaClient::is_model_available(fallback_url, model)
|
||||
.await
|
||||
.unwrap_or(false)
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
if !available_on_primary && !available_on_fallback {
|
||||
anyhow::bail!(
|
||||
"model not available: '{}' not found on any configured server",
|
||||
model
|
||||
);
|
||||
}
|
||||
model.as_str()
|
||||
} else {
|
||||
self.ollama.primary_model.as_str()
|
||||
};
|
||||
|
||||
let capabilities = match OllamaClient::check_model_capabilities(
|
||||
&self.ollama.primary_url,
|
||||
ollama_for_caps,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(caps) => caps,
|
||||
Err(_) => {
|
||||
let fallback_url =
|
||||
self.ollama.fallback_url.as_deref().ok_or_else(|| {
|
||||
anyhow::anyhow!(
|
||||
"Failed to check model capabilities for '{}': model not found on primary server and no fallback configured",
|
||||
ollama_for_caps
|
||||
)
|
||||
})?;
|
||||
OllamaClient::check_model_capabilities(fallback_url, ollama_for_caps)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
anyhow::anyhow!(
|
||||
"Failed to check model capabilities for '{}': {}",
|
||||
ollama_for_caps,
|
||||
e
|
||||
)
|
||||
})?
|
||||
}
|
||||
};
|
||||
|
||||
if !capabilities.has_tool_calling {
|
||||
anyhow::bail!(
|
||||
"tool calling not supported by model '{}'",
|
||||
ollama_for_caps
|
||||
);
|
||||
}
|
||||
|
||||
capabilities.has_vision
|
||||
};
|
||||
|
||||
Ok(ResolvedBackend::new(chat, local, kind, images_inline))
|
||||
}
|
||||
|
||||
pub async fn generate_agentic_insight_for_photo(
|
||||
&self,
|
||||
file_path: &str,
|
||||
@@ -3602,26 +3790,22 @@ Return ONLY the summary, nothing else."#,
|
||||
span.set_attribute(KeyValue::new("backend", backend_label.clone()));
|
||||
let is_hybrid = backend_label == "hybrid";
|
||||
// `LLM_BACKEND=llamacpp` swaps Ollama out for llama-swap as the
|
||||
// "local" stack — chat + vision describe + embeddings all route
|
||||
// through llama-swap. In hybrid mode this still applies to vision
|
||||
// describe (chat continues to go to OpenRouter). The chat slot is
|
||||
// text-only in either case, so we describe-then-inline.
|
||||
// "local" stack — chat + embeddings route through llama-swap.
|
||||
// llamacpp models receive images directly (vision-capable); only
|
||||
// hybrid mode (OpenRouter chat) uses describe-then-inline.
|
||||
let local_via_llamacpp =
|
||||
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
|
||||
// Describe-then-inline: hybrid (chat is OpenRouter, text-only) or
|
||||
// any path where chat goes through llama-swap (chat slot is text-only).
|
||||
let describes_then_inlines = is_hybrid || local_via_llamacpp;
|
||||
let describes_then_inlines = is_hybrid;
|
||||
let ollama_is_chat = !is_hybrid && !local_via_llamacpp;
|
||||
|
||||
// 1b. Always build an Ollama client. In local mode it owns the chat
|
||||
// loop; in hybrid/llamacpp mode it still handles tool-local calls
|
||||
// (e.g. future embedding-backed tools). The chat backend is
|
||||
// selected separately below.
|
||||
// Sampling overrides only apply in local mode — in
|
||||
// hybrid/llamacpp the user's params belong to the alternate chat
|
||||
// client.
|
||||
let apply_sampling_to_ollama = !describes_then_inlines;
|
||||
// Sampling overrides only apply when Ollama is the chat backend.
|
||||
let apply_sampling_to_ollama = ollama_is_chat;
|
||||
let mut ollama_client = if let Some(ref model) = custom_model
|
||||
&& !describes_then_inlines
|
||||
&& ollama_is_chat
|
||||
{
|
||||
log::info!("Using custom model for agentic: {}", model);
|
||||
span.set_attribute(KeyValue::new("custom_model", model.clone()));
|
||||
@@ -3632,7 +3816,7 @@ Return ONLY the summary, nothing else."#,
|
||||
Some(model.clone()),
|
||||
)
|
||||
} else {
|
||||
if !describes_then_inlines {
|
||||
if ollama_is_chat {
|
||||
span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone()));
|
||||
}
|
||||
self.ollama.clone()
|
||||
@@ -3752,10 +3936,13 @@ Return ONLY the summary, nothing else."#,
|
||||
// (OPENROUTER_ALLOWED_MODELS) — no live precheck. A bad model id
|
||||
// surfaces as a chat-call error on the next step.
|
||||
let has_vision = if describes_then_inlines {
|
||||
// In hybrid + llamacpp modes the chat model never sees images
|
||||
// directly — we describe-then-inject, so `has_vision` drives only
|
||||
// whether we bother loading the image to describe it, which we
|
||||
// always do.
|
||||
// Hybrid: chat model never sees images — describe-then-inject.
|
||||
true
|
||||
} else if local_via_llamacpp {
|
||||
// llama-swap models receive images directly via OpenAI content
|
||||
// parts. Capability probing isn't available (no `/api/show`),
|
||||
// so assume vision support; a misconfigured model surfaces as
|
||||
// a chat-call error.
|
||||
true
|
||||
} else {
|
||||
if let Some(ref model_name) = custom_model {
|
||||
@@ -3935,10 +4122,9 @@ Return ONLY the summary, nothing else."#,
|
||||
None
|
||||
};
|
||||
|
||||
// describe-then-inline path. Vision describe routes through whichever
|
||||
// `LLM_BACKEND` is configured — llama-swap when `local_via_llamacpp`
|
||||
// is set (even in hybrid mode, since chat is OpenRouter but vision
|
||||
// stays on the local stack), otherwise Ollama.
|
||||
// describe-then-inline path (hybrid only). Vision describe routes
|
||||
// through whichever local backend is configured — llama-swap when
|
||||
// `local_via_llamacpp`, otherwise Ollama.
|
||||
let inlined_visual_description: Option<String> = if describes_then_inlines {
|
||||
match image_base64.as_deref() {
|
||||
Some(b64) => {
|
||||
@@ -4043,10 +4229,10 @@ Return ONLY the summary, nothing else."#,
|
||||
);
|
||||
|
||||
// 10. Define tools. Gate flags computed from current data presence;
|
||||
// describe-then-inline modes (hybrid OR local_via_llamacpp) omit
|
||||
// describe_photo since the chat model receives the visual
|
||||
// description inline (so we pass `false` for has_vision in
|
||||
// those modes regardless of the model's actual capability).
|
||||
// hybrid mode omits describe_photo since the chat model receives
|
||||
// the visual description inline (so we pass `false` for
|
||||
// has_vision in that mode regardless of the model's actual
|
||||
// capability).
|
||||
let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines);
|
||||
let tools = Self::build_tool_definitions(gate_opts);
|
||||
|
||||
|
||||
@@ -50,9 +50,9 @@ pub struct LlamaCppClient {
|
||||
/// Embedding model slot id (e.g. `"embed"`). Used for
|
||||
/// `generate_embeddings`.
|
||||
pub embedding_model: String,
|
||||
/// Vision model slot id (e.g. `"vision"`). Used for `describe_image`,
|
||||
/// and the only slot that reports `has_vision = true` in capability
|
||||
/// lookups (llama-swap's `/v1/models` doesn't surface modality).
|
||||
/// Vision model slot id. Used for `describe_image` routing. Defaults
|
||||
/// to `primary_model` so describe_image works out of the box; override
|
||||
/// via `LLAMA_SWAP_VISION_MODEL` for a dedicated vision slot.
|
||||
pub vision_model: String,
|
||||
num_ctx: Option<i32>,
|
||||
temperature: Option<f32>,
|
||||
@@ -67,6 +67,7 @@ impl LlamaCppClient {
|
||||
.ok()
|
||||
.and_then(|v| v.parse::<u64>().ok())
|
||||
.unwrap_or(DEFAULT_REQUEST_TIMEOUT_SECS);
|
||||
let pm = primary_model.unwrap_or_else(|| DEFAULT_PRIMARY_MODEL.to_string());
|
||||
Self {
|
||||
client: Client::builder()
|
||||
.connect_timeout(Duration::from_secs(10))
|
||||
@@ -74,9 +75,9 @@ impl LlamaCppClient {
|
||||
.build()
|
||||
.unwrap_or_else(|_| Client::new()),
|
||||
base_url: base_url.unwrap_or_else(|| DEFAULT_BASE_URL.to_string()),
|
||||
primary_model: primary_model.unwrap_or_else(|| DEFAULT_PRIMARY_MODEL.to_string()),
|
||||
primary_model: pm.clone(),
|
||||
embedding_model: DEFAULT_EMBEDDING_MODEL.to_string(),
|
||||
vision_model: DEFAULT_VISION_MODEL.to_string(),
|
||||
vision_model: pm,
|
||||
num_ctx: None,
|
||||
temperature: None,
|
||||
top_p: None,
|
||||
@@ -124,6 +125,13 @@ impl LlamaCppClient {
|
||||
let mut obj = serde_json::Map::new();
|
||||
obj.insert("role".into(), Value::String(msg.role.clone()));
|
||||
|
||||
// Assistant messages with tool_calls must emit `content: null`
|
||||
// (not `""`) — some Jinja templates (Mistral-family) treat
|
||||
// empty-string content as a regular message rather than a
|
||||
// tool-calling turn, breaking role-alternation validation.
|
||||
let has_tool_calls = msg.role == "assistant"
|
||||
&& msg.tool_calls.as_ref().is_some_and(|tcs| !tcs.is_empty());
|
||||
|
||||
match &msg.images {
|
||||
Some(images) if !images.is_empty() => {
|
||||
let mut parts: Vec<Value> = Vec::new();
|
||||
@@ -139,6 +147,9 @@ impl LlamaCppClient {
|
||||
}
|
||||
obj.insert("content".into(), Value::Array(parts));
|
||||
}
|
||||
_ if has_tool_calls && msg.content.is_empty() => {
|
||||
obj.insert("content".into(), Value::Null);
|
||||
}
|
||||
_ => {
|
||||
obj.insert("content".into(), Value::String(msg.content.clone()));
|
||||
}
|
||||
@@ -276,6 +287,13 @@ impl LlamaCppClient {
|
||||
tools: Vec<Tool>,
|
||||
) -> Result<(ChatMessage, Option<i32>, Option<i32>)> {
|
||||
let url = format!("{}/chat/completions", self.base_url);
|
||||
let roles: Vec<&str> = messages.iter().map(|m| m.role.as_str()).collect();
|
||||
log::debug!(
|
||||
"llama-swap chat_completion: model={} roles={:?} tools={}",
|
||||
model,
|
||||
roles,
|
||||
tools.len()
|
||||
);
|
||||
let mut body = serde_json::Map::new();
|
||||
body.insert("model".into(), Value::String(model.to_string()));
|
||||
body.insert(
|
||||
@@ -381,6 +399,13 @@ impl LlmClient for LlamaCppClient {
|
||||
tools: Vec<Tool>,
|
||||
) -> Result<BoxStream<'static, Result<LlmStreamEvent>>> {
|
||||
let url = format!("{}/chat/completions", self.base_url);
|
||||
let roles: Vec<&str> = messages.iter().map(|m| m.role.as_str()).collect();
|
||||
log::debug!(
|
||||
"llama-swap stream: model={} roles={:?} tools={}",
|
||||
self.primary_model,
|
||||
roles,
|
||||
tools.len()
|
||||
);
|
||||
let mut body = serde_json::Map::new();
|
||||
body.insert(
|
||||
"model".into(),
|
||||
@@ -681,12 +706,12 @@ impl LlamaCppClient {
|
||||
.and_then(|v| v.as_str())
|
||||
.unwrap_or_default()
|
||||
.to_string();
|
||||
let has_vision = name == self.vision_model;
|
||||
// Tool calling is the default for llama-swap entries we configure
|
||||
// (--jinja flag); no negative-list mechanism yet, so report true.
|
||||
// llama-swap doesn't expose per-model modality flags. Assume all
|
||||
// configured models support vision and tool calling; a model
|
||||
// without multimodal support surfaces as a chat-call error.
|
||||
ModelCapabilities {
|
||||
name,
|
||||
has_vision,
|
||||
has_vision: true,
|
||||
has_tool_calling: true,
|
||||
}
|
||||
}
|
||||
@@ -943,7 +968,7 @@ mod tests {
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn capability_inference_marks_only_vision_slot() {
|
||||
fn all_models_report_vision_capable() {
|
||||
let mut c = LlamaCppClient::new(None, Some("chat".into()));
|
||||
c.set_vision_model("vision".into());
|
||||
|
||||
@@ -955,9 +980,9 @@ mod tests {
|
||||
let vision = c.parse_model_capabilities(&m_vision);
|
||||
let other = c.parse_model_capabilities(&m_other);
|
||||
|
||||
assert!(!chat.has_vision);
|
||||
assert!(chat.has_vision);
|
||||
assert!(chat.has_tool_calling);
|
||||
assert!(vision.has_vision);
|
||||
assert!(!other.has_vision);
|
||||
assert!(other.has_vision);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
pub mod apollo_client;
|
||||
pub mod backend;
|
||||
pub mod clip_client;
|
||||
pub mod daily_summary_job;
|
||||
pub mod face_client;
|
||||
|
||||
Reference in New Issue
Block a user