feature/llamacpp-backend #101

Merged
cameron merged 11 commits from feature/llamacpp-backend into master 2026-05-26 18:58:48 +00:00
6 changed files with 395 additions and 70 deletions
Showing only changes of commit 0631820fbf - Show all commits

118
src/ai/backend.rs Normal file
View File

@@ -0,0 +1,118 @@
use anyhow::{Result, anyhow};
use crate::ai::llm_client::LlmClient;
/// Which stack drives the chat side of a request.
///
/// The textual names are `"local"` and `"hybrid"`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum BackendKind {
    /// Chat runs on the locally configured stack.
    Local,
    /// Chat runs remotely while utility calls stay on the local stack.
    Hybrid,
}
impl BackendKind {
    /// Parses a backend name, ignoring surrounding whitespace and letter case.
    ///
    /// An empty (or whitespace-only) string selects [`BackendKind::Local`].
    ///
    /// # Errors
    ///
    /// Returns an error quoting the trimmed, lowercased input when it is
    /// neither empty, `"local"`, nor `"hybrid"`.
    pub fn parse(s: &str) -> Result<Self> {
        let normalized = s.trim().to_lowercase();
        if normalized.is_empty() || normalized == "local" {
            return Ok(Self::Local);
        }
        if normalized == "hybrid" {
            return Ok(Self::Hybrid);
        }
        Err(anyhow!(
            "unknown backend '{}'; expected 'local' or 'hybrid'",
            normalized
        ))
    }

    /// Returns the canonical lowercase name, which `parse` accepts back.
    pub fn as_str(&self) -> &'static str {
        match *self {
            Self::Hybrid => "hybrid",
            Self::Local => "local",
        }
    }
}
/// Renders the kind as the same lowercase name that `as_str` returns.
impl std::fmt::Display for BackendKind {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.as_str())
    }
}
/// Optional per-request overrides for model selection and sampling.
///
/// Every field is `None` by default (see the derived `Default`), meaning
/// "no override requested" for that setting.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct SamplingOverrides {
    /// Model name to use instead of the configured one.
    pub model: Option<String>,
    /// Context-window size override. Not counted by [`Self::has_sampling`].
    pub num_ctx: Option<i32>,
    /// Sampling temperature override.
    pub temperature: Option<f32>,
    /// Nucleus-sampling (`top_p`) override.
    pub top_p: Option<f32>,
    /// `top_k` override.
    pub top_k: Option<i32>,
    /// `min_p` override.
    pub min_p: Option<f32>,
}

impl SamplingOverrides {
    /// Returns `true` when at least one of `temperature`, `top_p`, `top_k`
    /// or `min_p` is set.
    ///
    /// `model` and `num_ctx` are deliberately excluded: they are not sampler
    /// knobs, so setting only those leaves this `false`.
    pub fn has_sampling(&self) -> bool {
        self.temperature.is_some()
            || self.top_p.is_some()
            || self.top_k.is_some()
            || self.min_p.is_some()
    }
}
/// A pair of LLM clients selected for one request, plus how images reach
/// the chat model.
pub struct ResolvedBackend {
    /// Client handed out by [`ResolvedBackend::chat`].
    chat: Box<dyn LlmClient>,
    /// Client handed out by [`ResolvedBackend::local`].
    local: Box<dyn LlmClient>,
    /// Backend kind this value was built for.
    pub kind: BackendKind,
    /// `true` when the chat model receives images directly (Ollama with
    /// vision, or llamacpp). `false` for hybrid where we describe-then-inline.
    pub images_inline: bool,
}

impl ResolvedBackend {
    /// Bundles the two clients with the backend kind and the image-delivery
    /// flag; nothing is validated or derived here.
    pub fn new(
        chat: Box<dyn LlmClient>,
        local: Box<dyn LlmClient>,
        kind: BackendKind,
        images_inline: bool,
    ) -> Self {
        ResolvedBackend {
            chat,
            local,
            kind,
            images_inline,
        }
    }

    /// Borrows the chat client.
    pub fn chat(&self) -> &dyn LlmClient {
        &*self.chat
    }

    /// Borrows the local client.
    pub fn local(&self) -> &dyn LlmClient {
        &*self.local
    }

    /// Name of the chat client's primary model.
    pub fn model(&self) -> &str {
        self.chat().primary_model()
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Accepted spellings map to the right variant; unknown names fail.
    #[test]
    fn parse_backend_kind() {
        let accepted = [
            ("local", BackendKind::Local),
            ("hybrid", BackendKind::Hybrid),
            (" Local ", BackendKind::Local),
            ("HYBRID", BackendKind::Hybrid),
            ("", BackendKind::Local),
        ];
        for (input, expected) in accepted {
            assert_eq!(BackendKind::parse(input).unwrap(), expected);
        }
        assert!(BackendKind::parse("vllm").is_err());
    }

    /// `as_str` output must be accepted by `parse` and yield the same variant.
    #[test]
    fn backend_kind_as_str_roundtrips() {
        for kind in [BackendKind::Local, BackendKind::Hybrid] {
            assert_eq!(BackendKind::parse(kind.as_str()).unwrap(), kind);
        }
    }

    /// `num_ctx` alone is not sampling; a temperature is.
    #[test]
    fn sampling_overrides_has_sampling() {
        let empty = SamplingOverrides {
            model: None,
            num_ctx: None,
            temperature: None,
            top_p: None,
            top_k: None,
            min_p: None,
        };
        assert!(!empty.has_sampling());

        let with_temp = SamplingOverrides {
            num_ctx: Some(4096),
            temperature: Some(0.7),
            ..empty
        };
        assert!(with_temp.has_sampling());
    }
}

View File

@@ -495,7 +495,7 @@ pub async fn get_available_models_handler(
.iter()
.map(|name| ModelCapabilities {
name: name.clone(),
has_vision: name == &lc.vision_model,
has_vision: true,
has_tool_calling: true,
})
.collect();

View File

@@ -311,7 +311,7 @@ impl InsightChatService {
let is_hybrid = effective_backend == "hybrid";
let local_via_llamacpp =
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
let describes_then_inlines = is_hybrid || local_via_llamacpp;
let describes_then_inlines = is_hybrid;
span.set_attribute(KeyValue::new("backend", effective_backend.clone()));
// 4. Build the chat backend client. Hybrid → OpenRouter; local with
@@ -408,12 +408,11 @@ impl InsightChatService {
let model_used = chat_backend.primary_model().to_string();
span.set_attribute(KeyValue::new("model", model_used.clone()));
// 5. Decide vision + tool set. In describe-then-inline modes
// (hybrid, llamacpp) we always omit `describe_photo` (matches the
// original generation flow). In local we trust the stored
// history's first-user shape: if it carries `images`, the
// original model was vision-capable, and we keep `describe_photo`
// available.
// 5. Decide vision + tool set. In describe-then-inline mode
// (hybrid only) we omit `describe_photo`. In local and llamacpp
// we trust the stored history's first-user shape: if it carries
// `images`, the original model was vision-capable, and we keep
// `describe_photo` available.
let local_first_user_has_image = messages
.iter()
.find(|m| m.role == "user")
@@ -821,9 +820,7 @@ impl InsightChatService {
.unwrap_or_else(|| stored_backend.clone());
validate_cross_replay(&stored_backend, &effective_backend)?;
let is_hybrid = effective_backend == "hybrid";
let local_via_llamacpp =
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
let describes_then_inlines = is_hybrid || local_via_llamacpp;
let describes_then_inlines = is_hybrid;
let max_iterations = req
.max_iterations
@@ -842,10 +839,9 @@ impl InsightChatService {
let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref();
let model_used = chat_backend.primary_model().to_string();
// Tool set — local mode + first user turn carries an image →
// offer describe_photo. Describe-then-inline modes (hybrid OR
// local_via_llamacpp): visual description was inlined when the
// insight was bootstrapped, no describe tool needed.
// Tool set — local/llamacpp mode + first user turn carries an image →
// offer describe_photo. Describe-then-inline mode (hybrid only):
// visual description was inlined at bootstrap, no describe tool needed.
let local_first_user_has_image = messages
.iter()
.find(|m| m.role == "user")
@@ -991,7 +987,7 @@ impl InsightChatService {
let is_hybrid = effective_backend == "hybrid";
let local_via_llamacpp =
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
let describes_then_inlines = is_hybrid || local_via_llamacpp;
let describes_then_inlines = is_hybrid;
let max_iterations = req
.max_iterations
@@ -1023,10 +1019,9 @@ impl InsightChatService {
_ => None,
});
// Describe-then-inline (hybrid OR local_via_llamacpp): pre-describe
// the image so a text-only chat model gets the visual description
// inline. Vision source follows `LLM_BACKEND`: llama-swap when
// `local_via_llamacpp`, else Ollama.
// Describe-then-inline (hybrid only): pre-describe the image so a
// text-only chat model gets the visual description inline. llamacpp
// sends images directly to the chat model.
let visual_block = if describes_then_inlines {
match image_base64.as_deref() {
Some(b64) => {

View File

@@ -10,6 +10,7 @@ use std::io::Cursor;
use std::sync::{Arc, Mutex};
use crate::ai::apollo_client::{ApolloClient, ApolloPlace};
use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides};
use crate::ai::llm_client::LlmClient;
use crate::ai::ollama::{ChatMessage, OllamaClient, Tool};
use crate::ai::llamacpp::LlamaCppClient;
@@ -1781,14 +1782,18 @@ Return ONLY the summary, nothing else."#,
);
let started = std::time::Instant::now();
let response = ollama
.generate_no_think(
&prompt,
Some(
let system = Some(
"You are a terse relevance ranker. You output only numbers separated by commas.",
),
)
.await?;
);
let response = if crate::ai::local_backend_is_llamacpp() {
if let Some(ref lc) = self.llamacpp {
lc.generate(&prompt, system, None).await?
} else {
ollama.generate_no_think(&prompt, system).await?
}
} else {
ollama.generate_no_think(&prompt, system).await?
};
log::info!(
"rerank: finished in {} ms (prompt={} chars)",
started.elapsed().as_millis(),
@@ -2360,7 +2365,8 @@ Return ONLY the summary, nothing else."#,
out
}
/// Tool: describe_photo — generate a visual description of the photo
/// Tool: describe_photo — generate a visual description of the photo.
/// Routes through llama-swap when `LLM_BACKEND=llamacpp`, Ollama otherwise.
async fn tool_describe_photo(
&self,
ollama: &OllamaClient,
@@ -2369,10 +2375,21 @@ Return ONLY the summary, nothing else."#,
log::info!("tool_describe_photo: generating visual description");
match image_base64 {
Some(img) => match ollama.generate_photo_description(img).await {
Some(img) => {
let result = if crate::ai::local_backend_is_llamacpp() {
if let Some(ref lc) = self.llamacpp {
lc.describe_image(img).await
} else {
ollama.generate_photo_description(img).await
}
} else {
ollama.generate_photo_description(img).await
};
match result {
Ok(desc) => desc,
Err(e) => format!("Error describing photo: {}", e),
},
}
}
None => "No image available for description.".to_string(),
}
}
@@ -3560,6 +3577,177 @@ Return ONLY the summary, nothing else."#,
out
}
/// Consolidate client construction for the agentic insight loop.
///
/// Returns a [`ResolvedBackend`] containing the **chat** client (the model
/// that drives the agent loop), the **local** client (always the configured
/// local backend — Ollama or llama-swap — for utility calls like
/// describe_image, rerank, embeddings), the backend kind, and whether the
/// chat model receives images inline.
///
/// llama-swap is used as the local stack only when
/// `crate::ai::local_backend_is_llamacpp()` is true **and** a llama-swap
/// client is configured on `self`; otherwise Ollama is used.
///
/// `overrides.model`, the sampling parameters and `overrides.num_ctx` are
/// applied to the chat client only; the local client is an unmodified clone.
///
/// # Errors
///
/// * Hybrid was requested but no OpenRouter client is configured.
/// * Pure Ollama with `overrides.model` set, and the model is not available
///   on the primary or the fallback server.
/// * Pure Ollama and the capability probe fails on the primary server and
///   either no fallback is configured or the fallback probe fails too.
/// * Pure Ollama and the model does not report tool-calling support.
pub async fn resolve_backend(
    &self,
    kind: BackendKind,
    overrides: &SamplingOverrides,
) -> Result<ResolvedBackend> {
    // Bind the llama-swap client once, and only when it is both selected and
    // configured. Later branches pattern-match on this instead of re-deriving
    // the invariant through `unwrap()` or an unreachable error path.
    let llamacpp = if crate::ai::local_backend_is_llamacpp() {
        self.llamacpp.as_ref()
    } else {
        None
    };
    let is_hybrid = kind == BackendKind::Hybrid;

    // ── chat client ────────────────────────────────────────────────
    let chat: Box<dyn LlmClient> = if is_hybrid {
        // Hybrid: chat through OpenRouter.
        let arc = self.openrouter.as_ref().ok_or_else(|| {
            anyhow::anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
        })?;
        let mut c: OpenRouterClient = (**arc).clone();
        if let Some(ref m) = overrides.model {
            c.primary_model = m.clone();
        }
        if overrides.has_sampling() {
            c.set_sampling_params(
                overrides.temperature,
                overrides.top_p,
                overrides.top_k,
                overrides.min_p,
            );
        }
        if let Some(ctx) = overrides.num_ctx {
            c.set_num_ctx(Some(ctx));
        }
        Box::new(c)
    } else if let Some(arc) = llamacpp {
        // Local via llama-swap.
        let mut c: LlamaCppClient = (**arc).clone();
        if let Some(ref m) = overrides.model {
            c.primary_model = m.clone();
        }
        if overrides.has_sampling() {
            c.set_sampling_params(
                overrides.temperature,
                overrides.top_p,
                overrides.top_k,
                overrides.min_p,
            );
        }
        if let Some(ctx) = overrides.num_ctx {
            c.set_num_ctx(Some(ctx));
        }
        Box::new(c)
    } else {
        // Pure Ollama local. A custom model gets a fresh client bound to the
        // same servers; otherwise the configured client is cloned.
        let mut ollama_client = if let Some(ref model) = overrides.model {
            OllamaClient::new(
                self.ollama.primary_url.clone(),
                self.ollama.fallback_url.clone(),
                model.clone(),
                Some(model.clone()),
            )
        } else {
            self.ollama.clone()
        };
        if overrides.has_sampling() {
            ollama_client.set_sampling_params(
                overrides.temperature,
                overrides.top_p,
                overrides.top_k,
                overrides.min_p,
            );
        }
        if let Some(ctx) = overrides.num_ctx {
            ollama_client.set_num_ctx(Some(ctx));
        }
        Box::new(ollama_client)
    };

    // ── local client (utility calls: rerank, describe_image, etc.) ─
    let local: Box<dyn LlmClient> = if let Some(arc) = llamacpp {
        Box::new((**arc).clone())
    } else {
        Box::new(self.ollama.clone())
    };

    // ── images_inline ──────────────────────────────────────────────
    let images_inline = if is_hybrid {
        // Hybrid: chat model never sees images — describe-then-inject.
        false
    } else if llamacpp.is_some() {
        // llama-swap models receive images directly via OpenAI content
        // parts. Capability probing isn't available (no `/api/show`),
        // so assume vision support; a misconfigured model surfaces as
        // a chat-call error.
        true
    } else {
        // Pure Ollama: probe model capabilities.
        let ollama_for_caps = if let Some(ref model) = overrides.model {
            // Verify custom model is available on at least one server.
            // A failed availability request counts as "not available".
            let available_on_primary =
                OllamaClient::is_model_available(&self.ollama.primary_url, model)
                    .await
                    .unwrap_or(false);
            let available_on_fallback =
                if let Some(ref fallback_url) = self.ollama.fallback_url {
                    OllamaClient::is_model_available(fallback_url, model)
                        .await
                        .unwrap_or(false)
                } else {
                    false
                };
            if !available_on_primary && !available_on_fallback {
                anyhow::bail!(
                    "model not available: '{}' not found on any configured server",
                    model
                );
            }
            model.as_str()
        } else {
            self.ollama.primary_model.as_str()
        };

        // Probe the primary server first; on any failure retry against the
        // fallback server, if one is configured.
        let capabilities = match OllamaClient::check_model_capabilities(
            &self.ollama.primary_url,
            ollama_for_caps,
        )
        .await
        {
            Ok(caps) => caps,
            Err(_) => {
                let fallback_url =
                    self.ollama.fallback_url.as_deref().ok_or_else(|| {
                        anyhow::anyhow!(
                            "Failed to check model capabilities for '{}': model not found on primary server and no fallback configured",
                            ollama_for_caps
                        )
                    })?;
                OllamaClient::check_model_capabilities(fallback_url, ollama_for_caps)
                    .await
                    .map_err(|e| {
                        anyhow::anyhow!(
                            "Failed to check model capabilities for '{}': {}",
                            ollama_for_caps,
                            e
                        )
                    })?
            }
        };

        // The agent loop depends on tool calls, so refuse models without them.
        if !capabilities.has_tool_calling {
            anyhow::bail!(
                "tool calling not supported by model '{}'",
                ollama_for_caps
            );
        }
        capabilities.has_vision
    };

    Ok(ResolvedBackend::new(chat, local, kind, images_inline))
}
pub async fn generate_agentic_insight_for_photo(
&self,
file_path: &str,
@@ -3602,26 +3790,22 @@ Return ONLY the summary, nothing else."#,
span.set_attribute(KeyValue::new("backend", backend_label.clone()));
let is_hybrid = backend_label == "hybrid";
// `LLM_BACKEND=llamacpp` swaps Ollama out for llama-swap as the
// "local" stack — chat + vision describe + embeddings all route
// through llama-swap. In hybrid mode this still applies to vision
// describe (chat continues to go to OpenRouter). The chat slot is
// text-only in either case, so we describe-then-inline.
// "local" stack — chat + embeddings route through llama-swap.
// llamacpp models receive images directly (vision-capable); only
// hybrid mode (OpenRouter chat) uses describe-then-inline.
let local_via_llamacpp =
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
// Describe-then-inline: hybrid (chat is OpenRouter, text-only) or
// any path where chat goes through llama-swap (chat slot is text-only).
let describes_then_inlines = is_hybrid || local_via_llamacpp;
let describes_then_inlines = is_hybrid;
let ollama_is_chat = !is_hybrid && !local_via_llamacpp;
// 1b. Always build an Ollama client. In local mode it owns the chat
// loop; in hybrid/llamacpp mode it still handles tool-local calls
// (e.g. future embedding-backed tools). The chat backend is
// selected separately below.
// Sampling overrides only apply in local mode — in
// hybrid/llamacpp the user's params belong to the alternate chat
// client.
let apply_sampling_to_ollama = !describes_then_inlines;
// Sampling overrides only apply when Ollama is the chat backend.
let apply_sampling_to_ollama = ollama_is_chat;
let mut ollama_client = if let Some(ref model) = custom_model
&& !describes_then_inlines
&& ollama_is_chat
{
log::info!("Using custom model for agentic: {}", model);
span.set_attribute(KeyValue::new("custom_model", model.clone()));
@@ -3632,7 +3816,7 @@ Return ONLY the summary, nothing else."#,
Some(model.clone()),
)
} else {
if !describes_then_inlines {
if ollama_is_chat {
span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone()));
}
self.ollama.clone()
@@ -3752,10 +3936,13 @@ Return ONLY the summary, nothing else."#,
// (OPENROUTER_ALLOWED_MODELS) — no live precheck. A bad model id
// surfaces as a chat-call error on the next step.
let has_vision = if describes_then_inlines {
// In hybrid + llamacpp modes the chat model never sees images
// directly — we describe-then-inject, so `has_vision` drives only
// whether we bother loading the image to describe it, which we
// always do.
// Hybrid: chat model never sees images — describe-then-inject.
true
} else if local_via_llamacpp {
// llama-swap models receive images directly via OpenAI content
// parts. Capability probing isn't available (no `/api/show`),
// so assume vision support; a misconfigured model surfaces as
// a chat-call error.
true
} else {
if let Some(ref model_name) = custom_model {
@@ -3935,10 +4122,9 @@ Return ONLY the summary, nothing else."#,
None
};
// describe-then-inline path. Vision describe routes through whichever
// `LLM_BACKEND` is configured — llama-swap when `local_via_llamacpp`
// is set (even in hybrid mode, since chat is OpenRouter but vision
// stays on the local stack), otherwise Ollama.
// describe-then-inline path (hybrid only). Vision describe routes
// through whichever local backend is configured — llama-swap when
// `local_via_llamacpp`, otherwise Ollama.
let inlined_visual_description: Option<String> = if describes_then_inlines {
match image_base64.as_deref() {
Some(b64) => {
@@ -4043,10 +4229,10 @@ Return ONLY the summary, nothing else."#,
);
// 10. Define tools. Gate flags computed from current data presence;
// describe-then-inline modes (hybrid OR local_via_llamacpp) omit
// describe_photo since the chat model receives the visual
// description inline (so we pass `false` for has_vision in
// those modes regardless of the model's actual capability).
// hybrid mode omits describe_photo since the chat model receives
// the visual description inline (so we pass `false` for
// has_vision in that mode regardless of the model's actual
// capability).
let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines);
let tools = Self::build_tool_definitions(gate_opts);

View File

@@ -50,9 +50,9 @@ pub struct LlamaCppClient {
/// Embedding model slot id (e.g. `"embed"`). Used for
/// `generate_embeddings`.
pub embedding_model: String,
/// Vision model slot id (e.g. `"vision"`). Used for `describe_image`,
/// and the only slot that reports `has_vision = true` in capability
/// lookups (llama-swap's `/v1/models` doesn't surface modality).
/// Vision model slot id. Used for `describe_image` routing. Defaults
/// to `primary_model` so describe_image works out of the box; override
/// via `LLAMA_SWAP_VISION_MODEL` for a dedicated vision slot.
pub vision_model: String,
num_ctx: Option<i32>,
temperature: Option<f32>,
@@ -67,6 +67,7 @@ impl LlamaCppClient {
.ok()
.and_then(|v| v.parse::<u64>().ok())
.unwrap_or(DEFAULT_REQUEST_TIMEOUT_SECS);
let pm = primary_model.unwrap_or_else(|| DEFAULT_PRIMARY_MODEL.to_string());
Self {
client: Client::builder()
.connect_timeout(Duration::from_secs(10))
@@ -74,9 +75,9 @@ impl LlamaCppClient {
.build()
.unwrap_or_else(|_| Client::new()),
base_url: base_url.unwrap_or_else(|| DEFAULT_BASE_URL.to_string()),
primary_model: primary_model.unwrap_or_else(|| DEFAULT_PRIMARY_MODEL.to_string()),
primary_model: pm.clone(),
embedding_model: DEFAULT_EMBEDDING_MODEL.to_string(),
vision_model: DEFAULT_VISION_MODEL.to_string(),
vision_model: pm,
num_ctx: None,
temperature: None,
top_p: None,
@@ -124,6 +125,13 @@ impl LlamaCppClient {
let mut obj = serde_json::Map::new();
obj.insert("role".into(), Value::String(msg.role.clone()));
// Assistant messages with tool_calls must emit `content: null`
// (not `""`) — some Jinja templates (Mistral-family) treat
// empty-string content as a regular message rather than a
// tool-calling turn, breaking role-alternation validation.
let has_tool_calls = msg.role == "assistant"
&& msg.tool_calls.as_ref().is_some_and(|tcs| !tcs.is_empty());
match &msg.images {
Some(images) if !images.is_empty() => {
let mut parts: Vec<Value> = Vec::new();
@@ -139,6 +147,9 @@ impl LlamaCppClient {
}
obj.insert("content".into(), Value::Array(parts));
}
_ if has_tool_calls && msg.content.is_empty() => {
obj.insert("content".into(), Value::Null);
}
_ => {
obj.insert("content".into(), Value::String(msg.content.clone()));
}
@@ -276,6 +287,13 @@ impl LlamaCppClient {
tools: Vec<Tool>,
) -> Result<(ChatMessage, Option<i32>, Option<i32>)> {
let url = format!("{}/chat/completions", self.base_url);
let roles: Vec<&str> = messages.iter().map(|m| m.role.as_str()).collect();
log::debug!(
"llama-swap chat_completion: model={} roles={:?} tools={}",
model,
roles,
tools.len()
);
let mut body = serde_json::Map::new();
body.insert("model".into(), Value::String(model.to_string()));
body.insert(
@@ -381,6 +399,13 @@ impl LlmClient for LlamaCppClient {
tools: Vec<Tool>,
) -> Result<BoxStream<'static, Result<LlmStreamEvent>>> {
let url = format!("{}/chat/completions", self.base_url);
let roles: Vec<&str> = messages.iter().map(|m| m.role.as_str()).collect();
log::debug!(
"llama-swap stream: model={} roles={:?} tools={}",
self.primary_model,
roles,
tools.len()
);
let mut body = serde_json::Map::new();
body.insert(
"model".into(),
@@ -681,12 +706,12 @@ impl LlamaCppClient {
.and_then(|v| v.as_str())
.unwrap_or_default()
.to_string();
let has_vision = name == self.vision_model;
// Tool calling is the default for llama-swap entries we configure
// (--jinja flag); no negative-list mechanism yet, so report true.
// llama-swap doesn't expose per-model modality flags. Assume all
// configured models support vision and tool calling; a model
// without multimodal support surfaces as a chat-call error.
ModelCapabilities {
name,
has_vision,
has_vision: true,
has_tool_calling: true,
}
}
@@ -943,7 +968,7 @@ mod tests {
}
#[test]
fn capability_inference_marks_only_vision_slot() {
fn all_models_report_vision_capable() {
let mut c = LlamaCppClient::new(None, Some("chat".into()));
c.set_vision_model("vision".into());
@@ -955,9 +980,9 @@ mod tests {
let vision = c.parse_model_capabilities(&m_vision);
let other = c.parse_model_capabilities(&m_other);
assert!(!chat.has_vision);
assert!(chat.has_vision);
assert!(chat.has_tool_calling);
assert!(vision.has_vision);
assert!(!other.has_vision);
assert!(other.has_vision);
}
}

View File

@@ -1,4 +1,5 @@
pub mod apollo_client;
pub mod backend;
pub mod clip_client;
pub mod daily_summary_job;
pub mod face_client;