ai: send images directly to llamacpp chat models + add ResolvedBackend
llamacpp models now receive images via OpenAI content-parts instead of the describe-then-inline strategy (hybrid mode unchanged). Assistant messages with tool_calls now emit content: "" instead of content: null, which satisfies strict Jinja template role-alternation checks. Adds debug logging of message role sequences on llamacpp requests. Introduces the BackendKind enum, SamplingOverrides, and ResolvedBackend in a new backend.rs module. InsightGenerator::resolve_backend centralises client construction and vision capability detection; the next step wires the existing inline dispatch through it. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -10,6 +10,7 @@ use std::io::Cursor;
|
||||
use std::sync::{Arc, Mutex};
|
||||
|
||||
use crate::ai::apollo_client::{ApolloClient, ApolloPlace};
|
||||
use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides};
|
||||
use crate::ai::llm_client::LlmClient;
|
||||
use crate::ai::ollama::{ChatMessage, OllamaClient, Tool};
|
||||
use crate::ai::llamacpp::LlamaCppClient;
|
||||
@@ -1781,14 +1782,18 @@ Return ONLY the summary, nothing else."#,
|
||||
);
|
||||
|
||||
let started = std::time::Instant::now();
|
||||
let response = ollama
|
||||
.generate_no_think(
|
||||
&prompt,
|
||||
Some(
|
||||
"You are a terse relevance ranker. You output only numbers separated by commas.",
|
||||
),
|
||||
)
|
||||
.await?;
|
||||
let system = Some(
|
||||
"You are a terse relevance ranker. You output only numbers separated by commas.",
|
||||
);
|
||||
let response = if crate::ai::local_backend_is_llamacpp() {
|
||||
if let Some(ref lc) = self.llamacpp {
|
||||
lc.generate(&prompt, system, None).await?
|
||||
} else {
|
||||
ollama.generate_no_think(&prompt, system).await?
|
||||
}
|
||||
} else {
|
||||
ollama.generate_no_think(&prompt, system).await?
|
||||
};
|
||||
log::info!(
|
||||
"rerank: finished in {} ms (prompt={} chars)",
|
||||
started.elapsed().as_millis(),
|
||||
@@ -2360,7 +2365,8 @@ Return ONLY the summary, nothing else."#,
|
||||
out
|
||||
}
|
||||
|
||||
/// Tool: describe_photo — generate a visual description of the photo
|
||||
/// Tool: describe_photo — generate a visual description of the photo.
|
||||
/// Routes through llama-swap when `LLM_BACKEND=llamacpp`, Ollama otherwise.
|
||||
async fn tool_describe_photo(
|
||||
&self,
|
||||
ollama: &OllamaClient,
|
||||
@@ -2369,10 +2375,21 @@ Return ONLY the summary, nothing else."#,
|
||||
log::info!("tool_describe_photo: generating visual description");
|
||||
|
||||
match image_base64 {
|
||||
Some(img) => match ollama.generate_photo_description(img).await {
|
||||
Ok(desc) => desc,
|
||||
Err(e) => format!("Error describing photo: {}", e),
|
||||
},
|
||||
Some(img) => {
|
||||
let result = if crate::ai::local_backend_is_llamacpp() {
|
||||
if let Some(ref lc) = self.llamacpp {
|
||||
lc.describe_image(img).await
|
||||
} else {
|
||||
ollama.generate_photo_description(img).await
|
||||
}
|
||||
} else {
|
||||
ollama.generate_photo_description(img).await
|
||||
};
|
||||
match result {
|
||||
Ok(desc) => desc,
|
||||
Err(e) => format!("Error describing photo: {}", e),
|
||||
}
|
||||
}
|
||||
None => "No image available for description.".to_string(),
|
||||
}
|
||||
}
|
||||
@@ -3560,6 +3577,177 @@ Return ONLY the summary, nothing else."#,
|
||||
out
|
||||
}
|
||||
|
||||
/// Consolidate client construction for the agentic insight loop.
|
||||
///
|
||||
/// Returns a [`ResolvedBackend`] containing the **chat** client (the model
|
||||
/// that drives the agent loop), the **local** client (always the configured
|
||||
/// local backend — Ollama or llama-swap — for utility calls like
|
||||
/// describe_image, rerank, embeddings), the backend kind, and whether the
|
||||
/// chat model receives images inline.
|
||||
pub async fn resolve_backend(
|
||||
&self,
|
||||
kind: BackendKind,
|
||||
overrides: &SamplingOverrides,
|
||||
) -> Result<ResolvedBackend> {
|
||||
let local_via_llamacpp =
|
||||
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
|
||||
let is_hybrid = kind == BackendKind::Hybrid;
|
||||
|
||||
// ── chat client ────────────────────────────────────────────────
|
||||
let chat: Box<dyn LlmClient> = if is_hybrid {
|
||||
// Hybrid: chat through OpenRouter.
|
||||
let arc = self.openrouter.as_ref().ok_or_else(|| {
|
||||
anyhow::anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured")
|
||||
})?;
|
||||
let mut c: OpenRouterClient = (**arc).clone();
|
||||
if let Some(ref m) = overrides.model {
|
||||
c.primary_model = m.clone();
|
||||
}
|
||||
if overrides.has_sampling() {
|
||||
c.set_sampling_params(
|
||||
overrides.temperature,
|
||||
overrides.top_p,
|
||||
overrides.top_k,
|
||||
overrides.min_p,
|
||||
);
|
||||
}
|
||||
if let Some(ctx) = overrides.num_ctx {
|
||||
c.set_num_ctx(Some(ctx));
|
||||
}
|
||||
Box::new(c)
|
||||
} else if local_via_llamacpp {
|
||||
// Local via llama-swap.
|
||||
let arc = self.llamacpp.as_ref().ok_or_else(|| {
|
||||
anyhow::anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured")
|
||||
})?;
|
||||
let mut c: LlamaCppClient = (**arc).clone();
|
||||
if let Some(ref m) = overrides.model {
|
||||
c.primary_model = m.clone();
|
||||
}
|
||||
if overrides.has_sampling() {
|
||||
c.set_sampling_params(
|
||||
overrides.temperature,
|
||||
overrides.top_p,
|
||||
overrides.top_k,
|
||||
overrides.min_p,
|
||||
);
|
||||
}
|
||||
if let Some(ctx) = overrides.num_ctx {
|
||||
c.set_num_ctx(Some(ctx));
|
||||
}
|
||||
Box::new(c)
|
||||
} else {
|
||||
// Pure Ollama local.
|
||||
let mut ollama_client = if let Some(ref model) = overrides.model {
|
||||
OllamaClient::new(
|
||||
self.ollama.primary_url.clone(),
|
||||
self.ollama.fallback_url.clone(),
|
||||
model.clone(),
|
||||
Some(model.clone()),
|
||||
)
|
||||
} else {
|
||||
self.ollama.clone()
|
||||
};
|
||||
if overrides.has_sampling() {
|
||||
ollama_client.set_sampling_params(
|
||||
overrides.temperature,
|
||||
overrides.top_p,
|
||||
overrides.top_k,
|
||||
overrides.min_p,
|
||||
);
|
||||
}
|
||||
if let Some(ctx) = overrides.num_ctx {
|
||||
ollama_client.set_num_ctx(Some(ctx));
|
||||
}
|
||||
Box::new(ollama_client)
|
||||
};
|
||||
|
||||
// ── local client (utility calls: rerank, describe_image, etc.) ─
|
||||
let local: Box<dyn LlmClient> = if local_via_llamacpp {
|
||||
Box::new(self.llamacpp.as_ref().unwrap().as_ref().clone())
|
||||
} else {
|
||||
Box::new(self.ollama.clone())
|
||||
};
|
||||
|
||||
// ── images_inline ──────────────────────────────────────────────
|
||||
let images_inline = if is_hybrid {
|
||||
// Hybrid: chat model never sees images — describe-then-inject.
|
||||
false
|
||||
} else if local_via_llamacpp {
|
||||
// llama-swap models receive images directly via OpenAI content
|
||||
// parts. Capability probing isn't available (no `/api/show`),
|
||||
// so assume vision support; a misconfigured model surfaces as
|
||||
// a chat-call error.
|
||||
true
|
||||
} else {
|
||||
// Pure Ollama: probe model capabilities.
|
||||
let ollama_for_caps = if let Some(ref model) = overrides.model {
|
||||
// Verify custom model is available on at least one server.
|
||||
let available_on_primary =
|
||||
OllamaClient::is_model_available(&self.ollama.primary_url, model)
|
||||
.await
|
||||
.unwrap_or(false);
|
||||
|
||||
let available_on_fallback =
|
||||
if let Some(ref fallback_url) = self.ollama.fallback_url {
|
||||
OllamaClient::is_model_available(fallback_url, model)
|
||||
.await
|
||||
.unwrap_or(false)
|
||||
} else {
|
||||
false
|
||||
};
|
||||
|
||||
if !available_on_primary && !available_on_fallback {
|
||||
anyhow::bail!(
|
||||
"model not available: '{}' not found on any configured server",
|
||||
model
|
||||
);
|
||||
}
|
||||
model.as_str()
|
||||
} else {
|
||||
self.ollama.primary_model.as_str()
|
||||
};
|
||||
|
||||
let capabilities = match OllamaClient::check_model_capabilities(
|
||||
&self.ollama.primary_url,
|
||||
ollama_for_caps,
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(caps) => caps,
|
||||
Err(_) => {
|
||||
let fallback_url =
|
||||
self.ollama.fallback_url.as_deref().ok_or_else(|| {
|
||||
anyhow::anyhow!(
|
||||
"Failed to check model capabilities for '{}': model not found on primary server and no fallback configured",
|
||||
ollama_for_caps
|
||||
)
|
||||
})?;
|
||||
OllamaClient::check_model_capabilities(fallback_url, ollama_for_caps)
|
||||
.await
|
||||
.map_err(|e| {
|
||||
anyhow::anyhow!(
|
||||
"Failed to check model capabilities for '{}': {}",
|
||||
ollama_for_caps,
|
||||
e
|
||||
)
|
||||
})?
|
||||
}
|
||||
};
|
||||
|
||||
if !capabilities.has_tool_calling {
|
||||
anyhow::bail!(
|
||||
"tool calling not supported by model '{}'",
|
||||
ollama_for_caps
|
||||
);
|
||||
}
|
||||
|
||||
capabilities.has_vision
|
||||
};
|
||||
|
||||
Ok(ResolvedBackend::new(chat, local, kind, images_inline))
|
||||
}
|
||||
|
||||
pub async fn generate_agentic_insight_for_photo(
|
||||
&self,
|
||||
file_path: &str,
|
||||
@@ -3602,26 +3790,22 @@ Return ONLY the summary, nothing else."#,
|
||||
span.set_attribute(KeyValue::new("backend", backend_label.clone()));
|
||||
let is_hybrid = backend_label == "hybrid";
|
||||
// `LLM_BACKEND=llamacpp` swaps Ollama out for llama-swap as the
|
||||
// "local" stack — chat + vision describe + embeddings all route
|
||||
// through llama-swap. In hybrid mode this still applies to vision
|
||||
// describe (chat continues to go to OpenRouter). The chat slot is
|
||||
// text-only in either case, so we describe-then-inline.
|
||||
// "local" stack — chat + embeddings route through llama-swap.
|
||||
// llamacpp models receive images directly (vision-capable); only
|
||||
// hybrid mode (OpenRouter chat) uses describe-then-inline.
|
||||
let local_via_llamacpp =
|
||||
crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some();
|
||||
// Describe-then-inline: hybrid (chat is OpenRouter, text-only) or
|
||||
// any path where chat goes through llama-swap (chat slot is text-only).
|
||||
let describes_then_inlines = is_hybrid || local_via_llamacpp;
|
||||
let describes_then_inlines = is_hybrid;
|
||||
let ollama_is_chat = !is_hybrid && !local_via_llamacpp;
|
||||
|
||||
// 1b. Always build an Ollama client. In local mode it owns the chat
|
||||
// loop; in hybrid/llamacpp mode it still handles tool-local calls
|
||||
// (e.g. future embedding-backed tools). The chat backend is
|
||||
// selected separately below.
|
||||
// Sampling overrides only apply in local mode — in
|
||||
// hybrid/llamacpp the user's params belong to the alternate chat
|
||||
// client.
|
||||
let apply_sampling_to_ollama = !describes_then_inlines;
|
||||
// Sampling overrides only apply when Ollama is the chat backend.
|
||||
let apply_sampling_to_ollama = ollama_is_chat;
|
||||
let mut ollama_client = if let Some(ref model) = custom_model
|
||||
&& !describes_then_inlines
|
||||
&& ollama_is_chat
|
||||
{
|
||||
log::info!("Using custom model for agentic: {}", model);
|
||||
span.set_attribute(KeyValue::new("custom_model", model.clone()));
|
||||
@@ -3632,7 +3816,7 @@ Return ONLY the summary, nothing else."#,
|
||||
Some(model.clone()),
|
||||
)
|
||||
} else {
|
||||
if !describes_then_inlines {
|
||||
if ollama_is_chat {
|
||||
span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone()));
|
||||
}
|
||||
self.ollama.clone()
|
||||
@@ -3752,10 +3936,13 @@ Return ONLY the summary, nothing else."#,
|
||||
// (OPENROUTER_ALLOWED_MODELS) — no live precheck. A bad model id
|
||||
// surfaces as a chat-call error on the next step.
|
||||
let has_vision = if describes_then_inlines {
|
||||
// In hybrid + llamacpp modes the chat model never sees images
|
||||
// directly — we describe-then-inject, so `has_vision` drives only
|
||||
// whether we bother loading the image to describe it, which we
|
||||
// always do.
|
||||
// Hybrid: chat model never sees images — describe-then-inject.
|
||||
true
|
||||
} else if local_via_llamacpp {
|
||||
// llama-swap models receive images directly via OpenAI content
|
||||
// parts. Capability probing isn't available (no `/api/show`),
|
||||
// so assume vision support; a misconfigured model surfaces as
|
||||
// a chat-call error.
|
||||
true
|
||||
} else {
|
||||
if let Some(ref model_name) = custom_model {
|
||||
@@ -3935,10 +4122,9 @@ Return ONLY the summary, nothing else."#,
|
||||
None
|
||||
};
|
||||
|
||||
// describe-then-inline path. Vision describe routes through whichever
|
||||
// `LLM_BACKEND` is configured — llama-swap when `local_via_llamacpp`
|
||||
// is set (even in hybrid mode, since chat is OpenRouter but vision
|
||||
// stays on the local stack), otherwise Ollama.
|
||||
// describe-then-inline path (hybrid only). Vision describe routes
|
||||
// through whichever local backend is configured — llama-swap when
|
||||
// `local_via_llamacpp`, otherwise Ollama.
|
||||
let inlined_visual_description: Option<String> = if describes_then_inlines {
|
||||
match image_base64.as_deref() {
|
||||
Some(b64) => {
|
||||
@@ -4043,10 +4229,10 @@ Return ONLY the summary, nothing else."#,
|
||||
);
|
||||
|
||||
// 10. Define tools. Gate flags computed from current data presence;
|
||||
// describe-then-inline modes (hybrid OR local_via_llamacpp) omit
|
||||
// describe_photo since the chat model receives the visual
|
||||
// description inline (so we pass `false` for has_vision in
|
||||
// those modes regardless of the model's actual capability).
|
||||
// hybrid mode omits describe_photo since the chat model receives
|
||||
// the visual description inline (so we pass `false` for
|
||||
// has_vision in that mode regardless of the model's actual
|
||||
// capability).
|
||||
let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines);
|
||||
let tools = Self::build_tool_definitions(gate_opts);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user