diff --git a/src/ai/backend.rs b/src/ai/backend.rs new file mode 100644 index 0000000..0fe7747 --- /dev/null +++ b/src/ai/backend.rs @@ -0,0 +1,118 @@ +use anyhow::{Result, anyhow}; + +use crate::ai::llm_client::LlmClient; + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum BackendKind { + Local, + Hybrid, +} + +impl BackendKind { + pub fn parse(s: &str) -> Result { + match s.trim().to_lowercase().as_str() { + "local" | "" => Ok(Self::Local), + "hybrid" => Ok(Self::Hybrid), + other => Err(anyhow!("unknown backend '{}'; expected 'local' or 'hybrid'", other)), + } + } + + pub fn as_str(&self) -> &'static str { + match self { + Self::Local => "local", + Self::Hybrid => "hybrid", + } + } +} + +impl std::fmt::Display for BackendKind { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + f.write_str(self.as_str()) + } +} + +pub struct SamplingOverrides { + pub model: Option, + pub num_ctx: Option, + pub temperature: Option, + pub top_p: Option, + pub top_k: Option, + pub min_p: Option, +} + +impl SamplingOverrides { + pub fn has_sampling(&self) -> bool { + self.temperature.is_some() + || self.top_p.is_some() + || self.top_k.is_some() + || self.min_p.is_some() + } +} + +pub struct ResolvedBackend { + chat: Box, + local: Box, + pub kind: BackendKind, + /// `true` when the chat model receives images directly (Ollama with + /// vision, or llamacpp). `false` for hybrid where we describe-then-inline. + pub images_inline: bool, +} + +impl ResolvedBackend { + pub fn new( + chat: Box, + local: Box, + kind: BackendKind, + images_inline: bool, + ) -> Self { + Self { chat, local, kind, images_inline } + } + + pub fn chat(&self) -> &dyn LlmClient { + self.chat.as_ref() + } + + pub fn local(&self) -> &dyn LlmClient { + self.local.as_ref() + } + + pub fn model(&self) -> &str { + self.chat.primary_model() + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn parse_backend_kind() { + assert_eq!(BackendKind::parse("local").unwrap(), BackendKind::Local); + assert_eq!(BackendKind::parse("hybrid").unwrap(), BackendKind::Hybrid); + assert_eq!(BackendKind::parse(" Local ").unwrap(), BackendKind::Local); + assert_eq!(BackendKind::parse("HYBRID").unwrap(), BackendKind::Hybrid); + assert_eq!(BackendKind::parse("").unwrap(), BackendKind::Local); + assert!(BackendKind::parse("vllm").is_err()); + } + + #[test] + fn backend_kind_as_str_roundtrips() { + assert_eq!(BackendKind::parse(BackendKind::Local.as_str()).unwrap(), BackendKind::Local); + assert_eq!(BackendKind::parse(BackendKind::Hybrid.as_str()).unwrap(), BackendKind::Hybrid); + } + + #[test] + fn sampling_overrides_has_sampling() { + let empty = SamplingOverrides { + model: None, num_ctx: None, temperature: None, + top_p: None, top_k: None, min_p: None, + }; + assert!(!empty.has_sampling()); + + let with_temp = SamplingOverrides { + model: None, num_ctx: Some(4096), temperature: Some(0.7), + top_p: None, top_k: None, min_p: None, + }; + assert!(with_temp.has_sampling()); + } +} diff --git a/src/ai/handlers.rs b/src/ai/handlers.rs index c86a5c8..a7a3720 100644 --- a/src/ai/handlers.rs +++ b/src/ai/handlers.rs @@ -495,7 +495,7 @@ pub async fn get_available_models_handler( .iter() .map(|name| ModelCapabilities { name: name.clone(), - has_vision: name == &lc.vision_model, + has_vision: true, has_tool_calling: true, }) .collect(); diff --git a/src/ai/insight_chat.rs b/src/ai/insight_chat.rs index b549dab..54350df 100644 --- a/src/ai/insight_chat.rs +++ b/src/ai/insight_chat.rs @@ -311,7 +311,7 @@ impl InsightChatService { let is_hybrid = effective_backend == "hybrid"; let local_via_llamacpp = crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); - let describes_then_inlines = is_hybrid || local_via_llamacpp; + let describes_then_inlines = is_hybrid; span.set_attribute(KeyValue::new("backend", effective_backend.clone())); // 4. Build the chat backend client. Hybrid → OpenRouter; local with @@ -408,12 +408,11 @@ impl InsightChatService { let model_used = chat_backend.primary_model().to_string(); span.set_attribute(KeyValue::new("model", model_used.clone())); - // 5. Decide vision + tool set. In describe-then-inline modes - // (hybrid, llamacpp) we always omit `describe_photo` (matches the - // original generation flow). In local we trust the stored - // history's first-user shape: if it carries `images`, the - // original model was vision-capable, and we keep `describe_photo` - // available. + // 5. Decide vision + tool set. In describe-then-inline mode + // (hybrid only) we omit `describe_photo`. In local and llamacpp + // we trust the stored history's first-user shape: if it carries + // `images`, the original model was vision-capable, and we keep + // `describe_photo` available. let local_first_user_has_image = messages .iter() .find(|m| m.role == "user") @@ -821,9 +820,7 @@ impl InsightChatService { .unwrap_or_else(|| stored_backend.clone()); validate_cross_replay(&stored_backend, &effective_backend)?; let is_hybrid = effective_backend == "hybrid"; - let local_via_llamacpp = - crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); - let describes_then_inlines = is_hybrid || local_via_llamacpp; + let describes_then_inlines = is_hybrid; let max_iterations = req .max_iterations @@ -842,10 +839,9 @@ impl InsightChatService { let chat_backend: &dyn LlmClient = chat_backend_holder.as_ref(); let model_used = chat_backend.primary_model().to_string(); - // Tool set — local mode + first user turn carries an image → - // offer describe_photo. Describe-then-inline modes (hybrid OR - // local_via_llamacpp): visual description was inlined when the - // insight was bootstrapped, no describe tool needed. + // Tool set — local/llamacpp mode + first user turn carries an image → + // offer describe_photo. Describe-then-inline mode (hybrid only): + // visual description was inlined at bootstrap, no describe tool needed. let local_first_user_has_image = messages .iter() .find(|m| m.role == "user") @@ -991,7 +987,7 @@ impl InsightChatService { let is_hybrid = effective_backend == "hybrid"; let local_via_llamacpp = crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); - let describes_then_inlines = is_hybrid || local_via_llamacpp; + let describes_then_inlines = is_hybrid; let max_iterations = req .max_iterations @@ -1023,10 +1019,9 @@ impl InsightChatService { _ => None, }); - // Describe-then-inline (hybrid OR local_via_llamacpp): pre-describe - // the image so a text-only chat model gets the visual description - // inline. Vision source follows `LLM_BACKEND`: llama-swap when - // `local_via_llamacpp`, else Ollama. + // Describe-then-inline (hybrid only): pre-describe the image so a + // text-only chat model gets the visual description inline. llamacpp + // sends images directly to the chat model. let visual_block = if describes_then_inlines { match image_base64.as_deref() { Some(b64) => { diff --git a/src/ai/insight_generator.rs b/src/ai/insight_generator.rs index 08fcfef..075001a 100644 --- a/src/ai/insight_generator.rs +++ b/src/ai/insight_generator.rs @@ -10,6 +10,7 @@ use std::io::Cursor; use std::sync::{Arc, Mutex}; use crate::ai::apollo_client::{ApolloClient, ApolloPlace}; +use crate::ai::backend::{BackendKind, ResolvedBackend, SamplingOverrides}; use crate::ai::llm_client::LlmClient; use crate::ai::ollama::{ChatMessage, OllamaClient, Tool}; use crate::ai::llamacpp::LlamaCppClient; @@ -1781,14 +1782,18 @@ Return ONLY the summary, nothing else."#, ); let started = std::time::Instant::now(); - let response = ollama - .generate_no_think( - &prompt, - Some( - "You are a terse relevance ranker. You output only numbers separated by commas.", - ), - ) - .await?; + let system = Some( + "You are a terse relevance ranker. You output only numbers separated by commas.", + ); + let response = if crate::ai::local_backend_is_llamacpp() { + if let Some(ref lc) = self.llamacpp { + lc.generate(&prompt, system, None).await? + } else { + ollama.generate_no_think(&prompt, system).await? + } + } else { + ollama.generate_no_think(&prompt, system).await? + }; log::info!( "rerank: finished in {} ms (prompt={} chars)", started.elapsed().as_millis(), @@ -2360,7 +2365,8 @@ Return ONLY the summary, nothing else."#, out } - /// Tool: describe_photo — generate a visual description of the photo + /// Tool: describe_photo — generate a visual description of the photo. + /// Routes through llama-swap when `LLM_BACKEND=llamacpp`, Ollama otherwise. async fn tool_describe_photo( &self, ollama: &OllamaClient, @@ -2369,10 +2375,21 @@ Return ONLY the summary, nothing else."#, log::info!("tool_describe_photo: generating visual description"); match image_base64 { - Some(img) => match ollama.generate_photo_description(img).await { - Ok(desc) => desc, - Err(e) => format!("Error describing photo: {}", e), - }, + Some(img) => { + let result = if crate::ai::local_backend_is_llamacpp() { + if let Some(ref lc) = self.llamacpp { + lc.describe_image(img).await + } else { + ollama.generate_photo_description(img).await + } + } else { + ollama.generate_photo_description(img).await + }; + match result { + Ok(desc) => desc, + Err(e) => format!("Error describing photo: {}", e), + } + } None => "No image available for description.".to_string(), } } @@ -3560,6 +3577,177 @@ Return ONLY the summary, nothing else."#, out } + /// Consolidate client construction for the agentic insight loop. + /// + /// Returns a [`ResolvedBackend`] containing the **chat** client (the model + /// that drives the agent loop), the **local** client (always the configured + /// local backend — Ollama or llama-swap — for utility calls like + /// describe_image, rerank, embeddings), the backend kind, and whether the + /// chat model receives images inline. + pub async fn resolve_backend( + &self, + kind: BackendKind, + overrides: &SamplingOverrides, + ) -> Result { + let local_via_llamacpp = + crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); + let is_hybrid = kind == BackendKind::Hybrid; + + // ── chat client ──────────────────────────────────────────────── + let chat: Box = if is_hybrid { + // Hybrid: chat through OpenRouter. + let arc = self.openrouter.as_ref().ok_or_else(|| { + anyhow::anyhow!("hybrid backend unavailable: OPENROUTER_API_KEY not configured") + })?; + let mut c: OpenRouterClient = (**arc).clone(); + if let Some(ref m) = overrides.model { + c.primary_model = m.clone(); + } + if overrides.has_sampling() { + c.set_sampling_params( + overrides.temperature, + overrides.top_p, + overrides.top_k, + overrides.min_p, + ); + } + if let Some(ctx) = overrides.num_ctx { + c.set_num_ctx(Some(ctx)); + } + Box::new(c) + } else if local_via_llamacpp { + // Local via llama-swap. + let arc = self.llamacpp.as_ref().ok_or_else(|| { + anyhow::anyhow!("LLM_BACKEND=llamacpp but LLAMA_SWAP_URL not configured") + })?; + let mut c: LlamaCppClient = (**arc).clone(); + if let Some(ref m) = overrides.model { + c.primary_model = m.clone(); + } + if overrides.has_sampling() { + c.set_sampling_params( + overrides.temperature, + overrides.top_p, + overrides.top_k, + overrides.min_p, + ); + } + if let Some(ctx) = overrides.num_ctx { + c.set_num_ctx(Some(ctx)); + } + Box::new(c) + } else { + // Pure Ollama local. + let mut ollama_client = if let Some(ref model) = overrides.model { + OllamaClient::new( + self.ollama.primary_url.clone(), + self.ollama.fallback_url.clone(), + model.clone(), + Some(model.clone()), + ) + } else { + self.ollama.clone() + }; + if overrides.has_sampling() { + ollama_client.set_sampling_params( + overrides.temperature, + overrides.top_p, + overrides.top_k, + overrides.min_p, + ); + } + if let Some(ctx) = overrides.num_ctx { + ollama_client.set_num_ctx(Some(ctx)); + } + Box::new(ollama_client) + }; + + // ── local client (utility calls: rerank, describe_image, etc.) ─ + let local: Box = if local_via_llamacpp { + Box::new(self.llamacpp.as_ref().unwrap().as_ref().clone()) + } else { + Box::new(self.ollama.clone()) + }; + + // ── images_inline ────────────────────────────────────────────── + let images_inline = if is_hybrid { + // Hybrid: chat model never sees images — describe-then-inject. + false + } else if local_via_llamacpp { + // llama-swap models receive images directly via OpenAI content + // parts. Capability probing isn't available (no `/api/show`), + // so assume vision support; a misconfigured model surfaces as + // a chat-call error. + true + } else { + // Pure Ollama: probe model capabilities. + let ollama_for_caps = if let Some(ref model) = overrides.model { + // Verify custom model is available on at least one server. + let available_on_primary = + OllamaClient::is_model_available(&self.ollama.primary_url, model) + .await + .unwrap_or(false); + + let available_on_fallback = + if let Some(ref fallback_url) = self.ollama.fallback_url { + OllamaClient::is_model_available(fallback_url, model) + .await + .unwrap_or(false) + } else { + false + }; + + if !available_on_primary && !available_on_fallback { + anyhow::bail!( + "model not available: '{}' not found on any configured server", + model + ); + } + model.as_str() + } else { + self.ollama.primary_model.as_str() + }; + + let capabilities = match OllamaClient::check_model_capabilities( + &self.ollama.primary_url, + ollama_for_caps, + ) + .await + { + Ok(caps) => caps, + Err(_) => { + let fallback_url = + self.ollama.fallback_url.as_deref().ok_or_else(|| { + anyhow::anyhow!( + "Failed to check model capabilities for '{}': model not found on primary server and no fallback configured", + ollama_for_caps + ) + })?; + OllamaClient::check_model_capabilities(fallback_url, ollama_for_caps) + .await + .map_err(|e| { + anyhow::anyhow!( + "Failed to check model capabilities for '{}': {}", + ollama_for_caps, + e + ) + })? + } + }; + + if !capabilities.has_tool_calling { + anyhow::bail!( + "tool calling not supported by model '{}'", + ollama_for_caps + ); + } + + capabilities.has_vision + }; + + Ok(ResolvedBackend::new(chat, local, kind, images_inline)) + } + pub async fn generate_agentic_insight_for_photo( &self, file_path: &str, @@ -3602,26 +3790,22 @@ Return ONLY the summary, nothing else."#, span.set_attribute(KeyValue::new("backend", backend_label.clone())); let is_hybrid = backend_label == "hybrid"; // `LLM_BACKEND=llamacpp` swaps Ollama out for llama-swap as the - // "local" stack — chat + vision describe + embeddings all route - // through llama-swap. In hybrid mode this still applies to vision - // describe (chat continues to go to OpenRouter). The chat slot is - // text-only in either case, so we describe-then-inline. + // "local" stack — chat + embeddings route through llama-swap. + // llamacpp models receive images directly (vision-capable); only + // hybrid mode (OpenRouter chat) uses describe-then-inline. let local_via_llamacpp = crate::ai::local_backend_is_llamacpp() && self.llamacpp.is_some(); - // Describe-then-inline: hybrid (chat is OpenRouter, text-only) or - // any path where chat goes through llama-swap (chat slot is text-only). - let describes_then_inlines = is_hybrid || local_via_llamacpp; + let describes_then_inlines = is_hybrid; + let ollama_is_chat = !is_hybrid && !local_via_llamacpp; // 1b. Always build an Ollama client. In local mode it owns the chat // loop; in hybrid/llamacpp mode it still handles tool-local calls // (e.g. future embedding-backed tools). The chat backend is // selected separately below. - // Sampling overrides only apply in local mode — in - // hybrid/llamacpp the user's params belong to the alternate chat - // client. - let apply_sampling_to_ollama = !describes_then_inlines; + // Sampling overrides only apply when Ollama is the chat backend. + let apply_sampling_to_ollama = ollama_is_chat; let mut ollama_client = if let Some(ref model) = custom_model - && !describes_then_inlines + && ollama_is_chat { log::info!("Using custom model for agentic: {}", model); span.set_attribute(KeyValue::new("custom_model", model.clone())); @@ -3632,7 +3816,7 @@ Return ONLY the summary, nothing else."#, Some(model.clone()), ) } else { - if !describes_then_inlines { + if ollama_is_chat { span.set_attribute(KeyValue::new("model", self.ollama.primary_model.clone())); } self.ollama.clone() @@ -3752,10 +3936,13 @@ Return ONLY the summary, nothing else."#, // (OPENROUTER_ALLOWED_MODELS) — no live precheck. A bad model id // surfaces as a chat-call error on the next step. let has_vision = if describes_then_inlines { - // In hybrid + llamacpp modes the chat model never sees images - // directly — we describe-then-inject, so `has_vision` drives only - // whether we bother loading the image to describe it, which we - // always do. + // Hybrid: chat model never sees images — describe-then-inject. + true + } else if local_via_llamacpp { + // llama-swap models receive images directly via OpenAI content + // parts. Capability probing isn't available (no `/api/show`), + // so assume vision support; a misconfigured model surfaces as + // a chat-call error. true } else { if let Some(ref model_name) = custom_model { @@ -3935,10 +4122,9 @@ Return ONLY the summary, nothing else."#, None }; - // describe-then-inline path. Vision describe routes through whichever - // `LLM_BACKEND` is configured — llama-swap when `local_via_llamacpp` - // is set (even in hybrid mode, since chat is OpenRouter but vision - // stays on the local stack), otherwise Ollama. + // describe-then-inline path (hybrid only). Vision describe routes + // through whichever local backend is configured — llama-swap when + // `local_via_llamacpp`, otherwise Ollama. let inlined_visual_description: Option = if describes_then_inlines { match image_base64.as_deref() { Some(b64) => { @@ -4043,10 +4229,10 @@ Return ONLY the summary, nothing else."#, ); // 10. Define tools. Gate flags computed from current data presence; - // describe-then-inline modes (hybrid OR local_via_llamacpp) omit - // describe_photo since the chat model receives the visual - // description inline (so we pass `false` for has_vision in - // those modes regardless of the model's actual capability). + // hybrid mode omits describe_photo since the chat model receives + // the visual description inline (so we pass `false` for + // has_vision in that mode regardless of the model's actual + // capability). let gate_opts = self.current_gate_opts(has_vision && !describes_then_inlines); let tools = Self::build_tool_definitions(gate_opts); diff --git a/src/ai/llamacpp.rs b/src/ai/llamacpp.rs index d23fad5..2a03aed 100644 --- a/src/ai/llamacpp.rs +++ b/src/ai/llamacpp.rs @@ -50,9 +50,9 @@ pub struct LlamaCppClient { /// Embedding model slot id (e.g. `"embed"`). Used for /// `generate_embeddings`. pub embedding_model: String, - /// Vision model slot id (e.g. `"vision"`). Used for `describe_image`, - /// and the only slot that reports `has_vision = true` in capability - /// lookups (llama-swap's `/v1/models` doesn't surface modality). + /// Vision model slot id. Used for `describe_image` routing. Defaults + /// to `primary_model` so describe_image works out of the box; override + /// via `LLAMA_SWAP_VISION_MODEL` for a dedicated vision slot. pub vision_model: String, num_ctx: Option, temperature: Option, @@ -67,6 +67,7 @@ impl LlamaCppClient { .ok() .and_then(|v| v.parse::().ok()) .unwrap_or(DEFAULT_REQUEST_TIMEOUT_SECS); + let pm = primary_model.unwrap_or_else(|| DEFAULT_PRIMARY_MODEL.to_string()); Self { client: Client::builder() .connect_timeout(Duration::from_secs(10)) @@ -74,9 +75,9 @@ impl LlamaCppClient { .build() .unwrap_or_else(|_| Client::new()), base_url: base_url.unwrap_or_else(|| DEFAULT_BASE_URL.to_string()), - primary_model: primary_model.unwrap_or_else(|| DEFAULT_PRIMARY_MODEL.to_string()), + primary_model: pm.clone(), embedding_model: DEFAULT_EMBEDDING_MODEL.to_string(), - vision_model: DEFAULT_VISION_MODEL.to_string(), + vision_model: pm, num_ctx: None, temperature: None, top_p: None, @@ -124,6 +125,13 @@ impl LlamaCppClient { let mut obj = serde_json::Map::new(); obj.insert("role".into(), Value::String(msg.role.clone())); + // Assistant messages with tool_calls must emit `content: null` + // (not `""`) — some Jinja templates (Mistral-family) treat + // empty-string content as a regular message rather than a + // tool-calling turn, breaking role-alternation validation. + let has_tool_calls = msg.role == "assistant" + && msg.tool_calls.as_ref().is_some_and(|tcs| !tcs.is_empty()); + match &msg.images { Some(images) if !images.is_empty() => { let mut parts: Vec = Vec::new(); @@ -139,6 +147,9 @@ impl LlamaCppClient { } obj.insert("content".into(), Value::Array(parts)); } + _ if has_tool_calls && msg.content.is_empty() => { + obj.insert("content".into(), Value::Null); + } _ => { obj.insert("content".into(), Value::String(msg.content.clone())); } @@ -276,6 +287,13 @@ impl LlamaCppClient { tools: Vec, ) -> Result<(ChatMessage, Option, Option)> { let url = format!("{}/chat/completions", self.base_url); + let roles: Vec<&str> = messages.iter().map(|m| m.role.as_str()).collect(); + log::debug!( + "llama-swap chat_completion: model={} roles={:?} tools={}", + model, + roles, + tools.len() + ); let mut body = serde_json::Map::new(); body.insert("model".into(), Value::String(model.to_string())); body.insert( @@ -381,6 +399,13 @@ impl LlmClient for LlamaCppClient { tools: Vec, ) -> Result>> { let url = format!("{}/chat/completions", self.base_url); + let roles: Vec<&str> = messages.iter().map(|m| m.role.as_str()).collect(); + log::debug!( + "llama-swap stream: model={} roles={:?} tools={}", + self.primary_model, + roles, + tools.len() + ); let mut body = serde_json::Map::new(); body.insert( "model".into(), @@ -681,12 +706,12 @@ impl LlamaCppClient { .and_then(|v| v.as_str()) .unwrap_or_default() .to_string(); - let has_vision = name == self.vision_model; - // Tool calling is the default for llama-swap entries we configure - // (--jinja flag); no negative-list mechanism yet, so report true. + // llama-swap doesn't expose per-model modality flags. Assume all + // configured models support vision and tool calling; a model + // without multimodal support surfaces as a chat-call error. ModelCapabilities { name, - has_vision, + has_vision: true, has_tool_calling: true, } } @@ -943,7 +968,7 @@ mod tests { } #[test] - fn capability_inference_marks_only_vision_slot() { + fn all_models_report_vision_capable() { let mut c = LlamaCppClient::new(None, Some("chat".into())); c.set_vision_model("vision".into()); @@ -955,9 +980,9 @@ mod tests { let vision = c.parse_model_capabilities(&m_vision); let other = c.parse_model_capabilities(&m_other); - assert!(!chat.has_vision); + assert!(chat.has_vision); assert!(chat.has_tool_calling); assert!(vision.has_vision); - assert!(!other.has_vision); + assert!(other.has_vision); } } diff --git a/src/ai/mod.rs b/src/ai/mod.rs index 8d634fd..09cbfda 100644 --- a/src/ai/mod.rs +++ b/src/ai/mod.rs @@ -1,4 +1,5 @@ pub mod apollo_client; +pub mod backend; pub mod clip_client; pub mod daily_summary_job; pub mod face_client;