ai: send images directly to llamacpp chat models + add ResolvedBackend

llamacpp models now receive images via OpenAI content-parts instead of
the describe-then-inline strategy (hybrid mode unchanged). Fixes
assistant messages with tool_calls emitting content: null instead of ""
to satisfy strict Jinja template role-alternation checks. Adds debug
logging of message role sequences on llamacpp requests.

Introduces BackendKind enum, SamplingOverrides, and ResolvedBackend in
a new backend.rs module. InsightGenerator::resolve_backend centralises
client construction + vision capability detection — next step wires the
existing inline dispatch through it.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Cameron Cordes
2026-05-24 14:00:37 -04:00
parent be51421b38
commit 0631820fbf
6 changed files with 395 additions and 70 deletions

View File

@@ -10,6 +10,7 @@ use std::io::Cursor;
use std::sync::{Arc, Mutex};
use crate::ai::apollo_client::{ApolloClient, ApolloPlace};
use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides};
use crate::ai::llm_client::LlmClient;
use crate::ai::ollama::{ChatMessage, OllamaClient, Tool};
use crate::ai::llamacpp::LlamaCppClient;
@@ -1781,14 +1782,18 @@ Return ONLY the summary, nothing else."#,
);
let started = std::time::Instant::now();
let response = ollama
.generate_no_think(
&prompt,
Some(
"You are a terse relevance ranker. You output only numbers separated by commas.",
),
)
.await?;
let system = Some(
"You are a terse relevance ranker. You output only numbers separated by commas.",
);
let response = if crate::ai::local_backend_is_llamacpp() {
if let Some(ref lc) = self.llamacpp {
lc.generate(&prompt, system, None).await?
} else {
ollama.generate_no_think(&prompt, system).await?
}
} else {
ollama.generate_no_think(&prompt, system).await?
};
log::info!(
"rerank: finished in {} ms (prompt={} chars)",
started.elapsed().as_millis(),
@@ -2360,7 +2365,8 @@ Return ONLY the summary, nothing else."#,
out
}
/// Tool: describe_photo — generate a visual description of the photo
/// Tool: describe_photo — generate a visual description of the photo.
/// Routes through llama-swap when `LLM_BACKEND=llamacpp`, Ollama otherwise.
async fn tool_describe_photo(
&self,
ollama: &OllamaClient,
@@ -2369,10 +2375,21 @@ Return ONLY the summary, nothing else."#,
log::info!("tool_describe_photo: generating visual description");
match image_base64 {
Some(img) => match ollama.generate_photo_description(img).await {
Ok(desc) => desc,
Err(e) => format!("Error describing photo: {}", e),
},
Some(img) => {
let result = if crate::ai::local_backend_is_llamacpp() {
if let Some(ref lc) = self.llamacpp {
lc.describe_image(img).await
} else {
ollama.generate_photo_description(img).await
}
} else {
ollama.generate_photo_description(img).await
};
match result {
Ok(desc) => desc,
Err(e) => format!("Error describing photo: {}", e),
}
}
None => "No image available for description.".to_string(),
}
}
@@ -3560,6 +3577,177 @@ Return ONLY the summary, nothing else."#,
out
}
/// Consolidate client construction for the agentic insight loop.
///
/// Returns a [`ResolvedBackend`] containing the **chat** client (the model
/// that drives the agent loop), the **local** client (always the configured
/// local backend — Ollama or llama-swap — for utility calls like
/// describe_image, rerank, embeddings), the backend kind, and whether the
/// chat model receives images inline.
pub async fn resolve_backend(
&self,
kind: BackendKind,
overrides: &SamplingOverrides,
) -> Result<ResolvedBackend> {
let local_via_llamacpp =
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
let is_hybrid = kind == BackendKind::Hybrid;
// ── chat client ────────────────────────────────────────────────
let chat: Box<dyn LlmClient> = if is_hybrid {
// Hybrid: chat through OpenRouter.
let arc = self.openrouter.as_ref().ok_or_else(|| {
anyhow::anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
})?;
let mut c: OpenRouterClient = (**arc).clone();
if let Some(ref m) = overrides.model {
c.primary_model = m.clone();
}
if overrides.has_sampling() {
c.set_sampling_params(
overrides.temperature,
overrides.top_p,
overrides.top_k,
overrides.min_p,
);
}
if let Some(ctx) = overrides.num_ctx {
c.set_num_ctx(Some(ctx));
}
Box::new(c)
} else if local_via_llamacpp {
// Local via llama-swap.
let arc = self.llamacpp.as_ref().ok_or_else(|| {
anyhow::anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured")
})?;
let mut c: LlamaCppClient = (**arc).clone();
if let Some(ref m) = overrides.model {
c.primary_model = m.clone();
}
if overrides.has_sampling() {
c.set_sampling_params(
overrides.temperature,
overrides.top_p,
overrides.top_k,
overrides.min_p,
);
}
if let Some(ctx) = overrides.num_ctx {
c.set_num_ctx(Some(ctx));
}
Box::new(c)
} else {
// Pure Ollama local.
let mut ollama_client = if let Some(ref model) = overrides.model {
OllamaClient::new(
self.ollama.primary_url.clone(),
self.ollama.fallback_url.clone(),
model.clone(),
Some(model.clone()),
)
} else {
self.ollama.clone()
};
if overrides.has_sampling() {
ollama_client.set_sampling_params(
overrides.temperature,
overrides.top_p,
overrides.top_k,
overrides.min_p,
);
}
if let Some(ctx) = overrides.num_ctx {
ollama_client.set_num_ctx(Some(ctx));
}
Box::new(ollama_client)
};
// ── local client (utility calls: rerank, describe_image, etc.) ─
let local: Box<dyn LlmClient> = if local_via_llamacpp {
Box::new(self.llamacpp.as_ref().unwrap().as_ref().clone())
} else {
Box::new(self.ollama.clone())
};
// ── images_inline ──────────────────────────────────────────────
let images_inline = if is_hybrid {
// Hybrid: chat model never sees images — describe-then-inject.
false
} else if local_via_llamacpp {
// llama-swap models receive images directly via OpenAI content
// parts. Capability probing isn't available (no `/api/show`),
// so assume vision support; a misconfigured model surfaces as
// a chat-call error.
true
} else {
// Pure Ollama: probe model capabilities.
let ollama_for_caps = if let Some(ref model) = overrides.model {
// Verify custom model is available on at least one server.
let available_on_primary =
OllamaClient::is_model_available(&self.ollama.primary_url, model)
.await
.unwrap_or(false);
let available_on_fallback =
if let Some(ref fallback_url) = self.ollama.fallback_url {
OllamaClient::is_model_available(fallback_url, model)
.await
.unwrap_or(false)
} else {
false
};
if !available_on_primary && !available_on_fallback {
anyhow::bail!(
"model not available: '{}' not found on any configured server",
model
);
}
model.as_str()
} else {
self.ollama.primary_model.as_str()
};
let capabilities = match OllamaClient::check_model_capabilities(
&self.ollama.primary_url,
ollama_for_caps,
)
.await
{
Ok(caps) => caps,
Err(_) => {
let fallback_url =
self.ollama.fallback_url.as_deref().ok_or_else(|| {
anyhow::anyhow!(
"Failed to check model capabilities for '{}': model not found on primary server and no fallback configured",
ollama_for_caps
)
})?;
OllamaClient::check_model_capabilities(fallback_url, ollama_for_caps)
.await
.map_err(|e| {
anyhow::anyhow!(
"Failed to check model capabilities for '{}': {}",
ollama_for_caps,
e
)
})?
}
};
if !capabilities.has_tool_calling {
anyhow::bail!(
"tool calling not supported by model '{}'",
ollama_for_caps
);
}
capabilities.has_vision
};
Ok(ResolvedBackend::new(chat, local, kind, images_inline))
}
pub async fn generate_agentic_insight_for_photo(
&self,
file_path: &str,
@@ -3602,26 +3790,22 @@ Return ONLY the summary, nothing else."#,
span.set_attribute(KeyValue::new("backend", backend_label.clone()));
let is_hybrid = backend_label == "hybrid";
// `LLM_BACKEND=llamacpp` swaps Ollama out for llama-swap as the
// "local" stack — chat + vision describe + embeddings all route
// through llama-swap. In hybrid mode this still applies to vision
// describe (chat continues to go to OpenRouter). The chat slot is
// text-only in either case, so we describe-then-inline.
// "local" stack — chat + embeddings route through llama-swap.
// llamacpp models receive images directly (vision-capable); only
// hybrid mode (OpenRouter chat) uses describe-then-inline.
let local_via_llamacpp =
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
// Describe-then-inline: hybrid (chat is OpenRouter, text-only) or
// any path where chat goes through llama-swap (chat slot is text-only).
let describes_then_inlines = is_hybrid || local_via_llamacpp;
let describes_then_inlines = is_hybrid;
let ollama_is_chat = !is_hybrid && !local_via_llamacpp;
// 1b. Always build an Ollama client. In local mode it owns the chat
// loop; in hybrid/llamacpp mode it still handles tool-local calls
// (e.g. future embedding-backed tools). The chat backend is
// selected separately below.
// Sampling overrides only apply in local mode — in
// hybrid/llamacpp the user's params belong to the alternate chat
// client.
let apply_sampling_to_ollama = !describes_then_inlines;
// Sampling overrides only apply when Ollama is the chat backend.
let apply_sampling_to_ollama = ollama_is_chat;
let mut ollama_client = if let Some(ref model) = custom_model
&& !describes_then_inlines
&& ollama_is_chat
{
log::info!("Using custom model for agentic: {}", model);
span.set_attribute(KeyValue::new("custom_model", model.clone()));
@@ -3632,7 +3816,7 @@ Return ONLY the summary, nothing else."#,
Some(model.clone()),
)
} else {
if !describes_then_inlines {
if ollama_is_chat {
span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone()));
}
self.ollama.clone()
@@ -3752,10 +3936,13 @@ Return ONLY the summary, nothing else."#,
// (OPENROUTER_ALLOWED_MODELS) — no live precheck. A bad model id
// surfaces as a chat-call error on the next step.
let has_vision = if describes_then_inlines {
// In hybrid + llamacpp modes the chat model never sees images
// directly — we describe-then-inject, so `has_vision` drives only
// whether we bother loading the image to describe it, which we
// always do.
// Hybrid: chat model never sees images — describe-then-inject.
true
} else if local_via_llamacpp {
// llama-swap models receive images directly via OpenAI content
// parts. Capability probing isn't available (no `/api/show`),
// so assume vision support; a misconfigured model surfaces as
// a chat-call error.
true
} else {
if let Some(ref model_name) = custom_model {
@@ -3935,10 +4122,9 @@ Return ONLY the summary, nothing else."#,
None
};
// describe-then-inline path. Vision describe routes through whichever
// `LLM_BACKEND` is configured — llama-swap when `local_via_llamacpp`
// is set (even in hybrid mode, since chat is OpenRouter but vision
// stays on the local stack), otherwise Ollama.
// describe-then-inline path (hybrid only). Vision describe routes
// through whichever local backend is configured — llama-swap when
// `local_via_llamacpp`, otherwise Ollama.
let inlined_visual_description: Option<String> = if describes_then_inlines {
match image_base64.as_deref() {
Some(b64) => {
@@ -4043,10 +4229,10 @@ Return ONLY the summary, nothing else."#,
);
// 10. Define tools. Gate flags computed from current data presence;
// describe-then-inline modes (hybrid OR local_via_llamacpp) omit
// describe_photo since the chat model receives the visual
// description inline (so we pass `false` for has_vision in
// those modes regardless of the model's actual capability).
// hybrid mode omits describe_photo since the chat model receives
// the visual description inline (so we pass `false` for
// has_vision in that mode regardless of the model's actual
// capability).
let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines);
let tools = Self::build_tool_definitions(gate_opts);