// ImageApi/src/ai/llamacpp.rs

// LlamaCppClient — talks to a llama-swap proxy that fronts one or more
// llama-server processes. llama-swap exposes an OpenAI-compatible HTTP
// surface (`/v1/chat/completions`, `/v1/embeddings`, `/v1/models`), so the
// wire translation mirrors `OpenRouterClient` almost exactly.
//
// Differences from OpenRouter:
// - No bearer auth or attribution headers; llama-swap is LAN-only.
// - Three model slots (`primary_model` = chat, `vision_model`, `embedding_model`)
//   each map to a model id in the llama-swap config. `describe_image` and
//   `generate_embeddings` issue requests with the appropriate slot id in the
//   `model` field, which is how llama-swap selects which backend process to
//   run.
// - `/v1/models` returns only the configured slot ids — capabilities aren't
//   reported by the API, so `vision_models` is a config-time allowlist (env
//   `LLAMA_SWAP_VISION_MODELS`) used to set `has_vision` on responses.
//   `has_tool_calling` is assumed true for every slot, since llama-swap entries
//   default to launching llama-server with `--jinja`.
//
// First consumer lands alongside the three-way backend dispatch in
// insight_generator / insight_chat.
#![allow(dead_code)]

use anyhow::{Context, Result, anyhow, bail};
use async_trait::async_trait;
use reqwest::Client;
use serde::Deserialize;
use serde_json::{Value, json};
use std::time::Duration;

use crate::ai::llm_client::{
    ChatMessage, LlmClient, LlmStreamEvent, ModelCapabilities, Tool, ToolCall, ToolCallFunction,
};
use futures::stream::{BoxStream, StreamExt};

/// Base URL used when `LlamaCppClient::new` is given no `base_url`.
const DEFAULT_BASE_URL: &str = "http://localhost:9292/v1";
/// Chat slot id used when `LlamaCppClient::new` is given no `primary_model`.
const DEFAULT_PRIMARY_MODEL: &str = "chat";
/// Initial vision slot id; replaceable via `set_vision_model`.
const DEFAULT_VISION_MODEL: &str = "vision";
/// Initial embedding slot id; replaceable via `set_embedding_model`.
const DEFAULT_EMBEDDING_MODEL: &str = "embed";
/// Whole-request timeout (seconds) used when the
/// `LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS` env var is unset or not a valid `u64`.
const DEFAULT_REQUEST_TIMEOUT_SECS: u64 = 180;

/// OpenAI-compatible client targeting a llama-swap proxy in front of one or
/// more llama-server processes. See the module doc-comment for the slot model.
#[derive(Clone)]
pub struct LlamaCppClient {
    /// Shared HTTP client; `new` configures it with a connect timeout and a
    /// whole-request timeout.
    client: Client,
    /// Prefix every endpoint path is appended to (`{base_url}/chat/completions`,
    /// `{base_url}/embeddings`, `{base_url}/models`).
    pub base_url: String,
    /// Chat model slot id (e.g. `"chat"`). Used for `generate` /
    /// `chat_with_tools` / `chat_with_tools_stream`.
    pub primary_model: String,
    /// Embedding model slot id (e.g. `"embed"`). Used for
    /// `generate_embeddings`.
    pub embedding_model: String,
    /// Vision model slot id (e.g. `"vision"`). Used for `describe_image` and
    /// included in `vision_models` automatically so capability lookups for
    /// the default vision slot report `has_vision = true` even when the env
    /// allowlist is empty.
    pub vision_model: String,
    /// Operator-curated set of slot ids known to be multimodal. Drives the
    /// `has_vision` field in `list_models` / `model_capabilities`, since
    /// llama-swap's `/v1/models` doesn't report modality. Empty allowlist
    /// still marks `vision_model` as vision-capable.
    pub vision_models: Vec<String>,
    /// Context-size override. Stored by `set_num_ctx` but intentionally not
    /// sent on the wire (see `build_options`).
    num_ctx: Option<i32>,
    /// Optional sampling overrides. Each one is added to the request body by
    /// `build_options` only when it is `Some`.
    temperature: Option<f32>,
    top_p: Option<f32>,
    top_k: Option<i32>,
    min_p: Option<f32>,
}

impl LlamaCppClient {
    pub fn new(base_url: Option<String>, primary_model: Option<String>) -> Self {
        let timeout_secs = std::env::var("LLAMA_SWAP_REQUEST_TIMEOUT_SECONDS")
            .ok()
            .and_then(|v| v.parse::<u64>().ok())
            .unwrap_or(DEFAULT_REQUEST_TIMEOUT_SECS);
        Self {
            client: Client::builder()
                .connect_timeout(Duration::from_secs(10))
                .timeout(Duration::from_secs(timeout_secs))
                .build()
                .unwrap_or_else(|_| Client::new()),
            base_url: base_url.unwrap_or_else(|| DEFAULT_BASE_URL.to_string()),
            primary_model: primary_model.unwrap_or_else(|| DEFAULT_PRIMARY_MODEL.to_string()),
            embedding_model: DEFAULT_EMBEDDING_MODEL.to_string(),
            vision_model: DEFAULT_VISION_MODEL.to_string(),
            vision_models: Vec::new(),
            num_ctx: None,
            temperature: None,
            top_p: None,
            top_k: None,
            min_p: None,
        }
    }

    /// Replace the slot id sent as `model` by `generate_embeddings`.
    pub fn set_embedding_model(&mut self, model: String) {
        self.embedding_model = model;
    }

    /// Replace the slot id sent as `model` by `describe_image`. The same id
    /// is treated as vision-capable by `parse_model_capabilities`.
    pub fn set_vision_model(&mut self, model: String) {
        self.vision_model = model;
    }

    /// Replace the allowlist of slot ids reported with `has_vision = true`.
    pub fn set_vision_models(&mut self, models: Vec<String>) {
        self.vision_models = models;
    }

    /// Store a context-size override. The value is kept on the client but
    /// `build_options` deliberately does not forward it in requests.
    pub fn set_num_ctx(&mut self, num_ctx: Option<i32>) {
        self.num_ctx = num_ctx;
    }

    /// Overwrite all four sampling overrides at once. Passing `None` for a
    /// parameter clears it, so it is omitted from subsequent request bodies.
    pub fn set_sampling_params(
        &mut self,
        temperature: Option<f32>,
        top_p: Option<f32>,
        top_k: Option<i32>,
        min_p: Option<f32>,
    ) {
        (self.temperature, self.top_p, self.top_k, self.min_p) =
            (temperature, top_p, top_k, min_p);
    }

    /// Convert canonical chat messages into OpenAI-style JSON message objects.
    ///
    /// For each message:
    /// - `role` is copied verbatim.
    /// - `content` is a plain string, unless the message carries a non-empty
    ///   image list; then it is an array of content parts (a text part when
    ///   the text is non-empty, followed by one `image_url` part per image).
    /// - Assistant `tool_calls` are emitted with JSON-stringified arguments,
    ///   and a `call_<position>` id is synthesized for calls without one.
    /// - `role=tool` messages get a `tool_call_id` taken positionally from the
    ///   most recent assistant turn's call ids, falling back to `"call_0"`
    ///   when there is no id at that position.
    fn messages_to_openai(messages: &[ChatMessage]) -> Vec<Value> {
        // Call ids of the latest assistant turn, plus a cursor over them that
        // advances once per `role=tool` message.
        let mut pending_ids: Vec<String> = Vec::new();
        let mut cursor: usize = 0;
        let mut wire = Vec::with_capacity(messages.len());

        for message in messages {
            let mut entry = serde_json::Map::new();
            entry.insert("role".into(), Value::String(message.role.clone()));

            let attached = message.images.as_deref().unwrap_or(&[]);
            let content = if attached.is_empty() {
                Value::String(message.content.clone())
            } else {
                let text_part = (!message.content.is_empty())
                    .then(|| json!({"type": "text", "text": message.content}));
                let image_parts = attached.iter().map(|img| {
                    json!({
                        "type": "image_url",
                        "image_url": { "url": image_to_data_url(img) }
                    })
                });
                Value::Array(text_part.into_iter().chain(image_parts).collect())
            };
            entry.insert("content".into(), content);

            // Tool calls are only forwarded for assistant turns.
            let assistant_calls = message
                .tool_calls
                .as_ref()
                .filter(|_| message.role == "assistant");
            if let Some(calls) = assistant_calls {
                let mut ids = Vec::with_capacity(calls.len());
                let mut wire_calls = Vec::with_capacity(calls.len());
                for (position, call) in calls.iter().enumerate() {
                    let call_id = match &call.id {
                        Some(existing) => existing.clone(),
                        None => format!("call_{}", position),
                    };
                    let arguments = serde_json::to_string(&call.function.arguments)
                        .unwrap_or_else(|_| "{}".to_string());
                    wire_calls.push(json!({
                        "id": call_id,
                        "type": "function",
                        "function": {
                            "name": call.function.name,
                            "arguments": arguments,
                        }
                    }));
                    ids.push(call_id);
                }
                pending_ids = ids;
                cursor = 0;
                entry.insert("tool_calls".into(), Value::Array(wire_calls));
            }

            if message.role == "tool" {
                let call_id = match pending_ids.get(cursor) {
                    Some(known) => known.clone(),
                    None => "call_0".to_string(),
                };
                entry.insert("tool_call_id".into(), Value::String(call_id));
                cursor += 1;
            }

            wire.push(Value::Object(entry));
        }

        wire
    }

    /// Parse an OpenAI-compatible assistant message back into canonical shape.
    /// llama.cpp emits `reasoning_content` on thinking models; we drop it for
    /// parity with OpenRouter (which also strips upstream reasoning fields).
    fn openai_message_to_chat(msg: &Value) -> Result<ChatMessage> {
        let obj = msg
            .as_object()
            .ok_or_else(|| anyhow!("response message is not an object"))?;
        let role = obj
            .get("role")
            .and_then(|v| v.as_str())
            .unwrap_or("assistant")
            .to_string();
        let content = obj
            .get("content")
            .and_then(|v| v.as_str())
            .unwrap_or("")
            .to_string();

        let tool_calls = if let Some(tcs) = obj.get("tool_calls").and_then(|v| v.as_array()) {
            let mut parsed = Vec::with_capacity(tcs.len());
            for tc in tcs {
                let id = tc.get("id").and_then(|v| v.as_str()).map(String::from);
                let function = tc
                    .get("function")
                    .ok_or_else(|| anyhow!("tool_call missing function field"))?;
                let name = function
                    .get("name")
                    .and_then(|v| v.as_str())
                    .unwrap_or_default()
                    .to_string();
                let args_value = match function.get("arguments") {
                    Some(Value::String(s)) => {
                        serde_json::from_str::<Value>(s).unwrap_or_else(|_| json!({}))
                    }
                    Some(v @ Value::Object(_)) => v.clone(),
                    _ => json!({}),
                };
                parsed.push(ToolCall {
                    id,
                    function: ToolCallFunction {
                        name,
                        arguments: args_value,
                    },
                });
            }
            Some(parsed)
        } else {
            None
        };

        Ok(ChatMessage {
            role,
            content,
            tool_calls,
            images: None,
        })
    }

    /// Collect the sampling overrides that are set, as `(json key, value)`
    /// pairs ready to be merged into a request body. Unset parameters are
    /// omitted. Order is temperature, top_p, top_k, min_p.
    fn build_options(&self) -> Vec<(&'static str, Value)> {
        // num_ctx isn't an OpenAI param; llama-server fixes the context size
        // at launch via -c, so the override is intentionally not forwarded.
        // The config.yaml entry is the source of truth for context size.
        let _ = self.num_ctx;

        let candidates: [(&'static str, Option<Value>); 4] = [
            ("temperature", self.temperature.map(|t| json!(t))),
            ("top_p", self.top_p.map(|p| json!(p))),
            ("top_k", self.top_k.map(|k| json!(k))),
            ("min_p", self.min_p.map(|m| json!(m))),
        ];
        candidates
            .into_iter()
            .filter_map(|(key, value)| value.map(|value| (key, value)))
            .collect()
    }

    /// Issue a non-streaming chat request with an explicit model id. Used by
    /// `describe_image` to route through the vision slot without mutating
    /// `self.primary_model`.
    ///
    /// The request body carries `model`, the translated `messages`,
    /// `stream: false`, `tools` (only when non-empty), and whatever sampling
    /// overrides `build_options` reports.
    ///
    /// Returns the assistant message plus `(prompt_tokens, completion_tokens)`
    /// from the response's `usage` object. A count is `None` when it is
    /// absent, not an integer, or does not fit in an `i32`.
    ///
    /// # Errors
    /// Fails on transport errors, non-2xx status (the response body is
    /// included in the error), an unparseable JSON body, or a body without
    /// `choices[0].message`.
    async fn chat_completion_with_model(
        &self,
        model: &str,
        messages: Vec<ChatMessage>,
        tools: Vec<Tool>,
    ) -> Result<(ChatMessage, Option<i32>, Option<i32>)> {
        let url = format!("{}/chat/completions", self.base_url);
        let mut body = serde_json::Map::new();
        body.insert("model".into(), Value::String(model.to_string()));
        body.insert(
            "messages".into(),
            Value::Array(Self::messages_to_openai(&messages)),
        );
        body.insert("stream".into(), Value::Bool(false));
        if !tools.is_empty() {
            body.insert(
                "tools".into(),
                serde_json::to_value(&tools).context("serializing tools")?,
            );
        }
        for (k, v) in self.build_options() {
            body.insert(k.into(), v);
        }

        let resp = self
            .client
            .post(&url)
            .json(&Value::Object(body))
            .send()
            .await
            .with_context(|| format!("POST {} failed", url))?;

        if !resp.status().is_success() {
            let status = resp.status();
            let body = resp.text().await.unwrap_or_default();
            bail!("llama-swap chat request failed: {} — {}", status, body);
        }

        let parsed: Value = resp.json().await.context("parsing chat response")?;
        let choice = parsed
            .get("choices")
            .and_then(|v| v.as_array())
            .and_then(|a| a.first())
            .ok_or_else(|| {
                anyhow!(
                    "response missing choices[0]: {}",
                    extract_error_detail(&parsed)
                )
            })?;
        let msg = choice.get("message").ok_or_else(|| {
            anyhow!(
                "choices[0] missing message: {}",
                extract_error_detail(&parsed)
            )
        })?;
        let chat_msg = Self::openai_message_to_chat(msg)?;

        // Checked narrowing: an out-of-range count becomes `None` instead of
        // silently wrapping the way `as i32` would.
        let token_count = |field: &str| -> Option<i32> {
            let raw = parsed.get("usage")?.get(field)?.as_i64()?;
            i32::try_from(raw).ok()
        };
        let prompt_tokens = token_count("prompt_tokens");
        let completion_tokens = token_count("completion_tokens");

        Ok((chat_msg, prompt_tokens, completion_tokens))
    }
}

#[async_trait]
impl LlmClient for LlamaCppClient {
    /// One-shot text generation on the primary slot: sends an optional system
    /// message followed by a single user message (with any `images`
    /// attached), without tools, and returns only the reply's text content.
    async fn generate(
        &self,
        prompt: &str,
        system: Option<&str>,
        images: Option<Vec<String>>,
    ) -> Result<String> {
        let user_turn = {
            let mut turn = ChatMessage::user(prompt);
            turn.images = images;
            turn
        };
        let messages: Vec<ChatMessage> = system
            .map(|sys| ChatMessage::system(sys))
            .into_iter()
            .chain(std::iter::once(user_turn))
            .collect();

        let (reply, _prompt_tokens, _completion_tokens) =
            self.chat_with_tools(messages, Vec::new()).await?;
        Ok(reply.content)
    }

    /// Run a non-streaming chat completion on the primary (chat) slot.
    ///
    /// Logs the model id and the message / tool counts, then delegates to
    /// `chat_completion_with_model`. Returns the assistant message plus
    /// optional prompt and completion token counts.
    async fn chat_with_tools(
        &self,
        messages: Vec<ChatMessage>,
        tools: Vec<Tool>,
    ) -> Result<(ChatMessage, Option<i32>, Option<i32>)> {
        log::info!(
            "llama-swap chat_with_tools: model={} messages={} tools={}",
            self.primary_model,
            messages.len(),
            tools.len()
        );
        // `self` is borrowed for the whole call, so the slot id can be passed
        // by reference; no temporary `String` clone is needed.
        self.chat_completion_with_model(&self.primary_model, messages, tools)
            .await
    }

    /// Streaming chat completion on the primary slot.
    ///
    /// Sends the same body as the non-streaming path but with `stream: true`
    /// and `stream_options.include_usage`, then parses the response as
    /// server-sent events. The returned stream yields:
    /// - `LlmStreamEvent::TextDelta` for every non-empty `delta.content`;
    /// - one final `LlmStreamEvent::Done` carrying the accumulated message
    ///   (text plus reassembled tool calls) and the last token counts seen;
    /// - a single `Err` (after which the stream ends) if reading the
    ///   response body fails.
    ///
    /// # Errors
    /// The outer `Result` fails on transport errors or a non-2xx status,
    /// before any stream is returned.
    async fn chat_with_tools_stream(
        &self,
        messages: Vec<ChatMessage>,
        tools: Vec<Tool>,
    ) -> Result<BoxStream<'static, Result<LlmStreamEvent>>> {
        let url = format!("{}/chat/completions", self.base_url);
        let mut body = serde_json::Map::new();
        body.insert(
            "model".into(),
            Value::String(self.primary_model.clone()),
        );
        body.insert(
            "messages".into(),
            Value::Array(Self::messages_to_openai(&messages)),
        );
        body.insert("stream".into(), Value::Bool(true));
        // Ask the server to include a `usage` object in the stream so token
        // counts can be reported in the final `Done` event.
        body.insert(
            "stream_options".into(),
            serde_json::json!({ "include_usage": true }),
        );
        if !tools.is_empty() {
            body.insert(
                "tools".into(),
                serde_json::to_value(&tools).context("serializing tools")?,
            );
        }
        for (k, v) in self.build_options() {
            body.insert(k.into(), v);
        }

        let resp = self
            .client
            .post(&url)
            .json(&Value::Object(body))
            .send()
            .await
            .with_context(|| format!("POST {} failed", url))?;

        if !resp.status().is_success() {
            let status = resp.status();
            let body = resp.text().await.unwrap_or_default();
            bail!("llama-swap stream request failed: {} — {}", status, body);
        }

        let byte_stream = resp.bytes_stream();
        let stream = async_stream::stream! {
            let mut byte_stream = byte_stream;
            // Raw bytes received but not yet split into complete SSE frames.
            let mut buf: Vec<u8> = Vec::new();
            let mut accumulated_content = String::new();
            // Per tool-call index: (id, function name, concatenated argument
            // fragments). BTreeMap keeps the calls ordered by index.
            let mut tool_state: std::collections::BTreeMap<
                usize,
                (Option<String>, Option<String>, String),
            > = std::collections::BTreeMap::new();
            let mut role = "assistant".to_string();
            let mut prompt_tokens: Option<i32> = None;
            let mut completion_tokens: Option<i32> = None;
            let mut done_seen = false;

            while let Some(chunk) = byte_stream.next().await {
                let chunk = match chunk {
                    Ok(b) => b,
                    Err(e) => {
                        // A read failure ends the stream without a `Done` event.
                        yield Err(anyhow!("stream read failed: {}", e));
                        return;
                    }
                };
                buf.extend_from_slice(&chunk);

                // Peel off every complete frame (terminated by a blank line)
                // currently in the buffer; a partial frame stays buffered
                // until more bytes arrive.
                while let Some(sep) = find_double_newline(&buf) {
                    let frame = buf.drain(..sep + 2).collect::<Vec<_>>();
                    let frame_str = match std::str::from_utf8(&frame) {
                        Ok(s) => s,
                        // Frames that are not valid UTF-8 are skipped.
                        Err(_) => continue,
                    };
                    for line in frame_str.lines() {
                        let line = line.trim_end_matches('\r');
                        // Only `data: ` lines are processed; any other line
                        // in the frame is ignored.
                        let payload = match line.strip_prefix("data: ") {
                            Some(p) => p,
                            None => continue,
                        };
                        if payload == "[DONE]" {
                            done_seen = true;
                            break;
                        }
                        let v: Value = match serde_json::from_str(payload) {
                            Ok(v) => v,
                            Err(e) => {
                                log::warn!(
                                    "malformed llama-swap SSE frame: {} ({})",
                                    payload,
                                    e
                                );
                                continue;
                            }
                        };

                        // Token counts are overwritten whenever a payload
                        // has a `usage` key, so the last one seen wins.
                        if let Some(usage) = v.get("usage") {
                            prompt_tokens = usage
                                .get("prompt_tokens")
                                .and_then(|n| n.as_i64())
                                .map(|n| n as i32);
                            completion_tokens = usage
                                .get("completion_tokens")
                                .and_then(|n| n.as_i64())
                                .map(|n| n as i32);
                        }

                        // NOTE(review): payloads without `choices` or
                        // `delta` are skipped silently; if the server can
                        // send an error object mid-stream it would be
                        // dropped here — confirm.
                        let Some(choices) = v.get("choices").and_then(|c| c.as_array())
                        else {
                            continue;
                        };
                        let Some(choice) = choices.first() else { continue };
                        let delta = match choice.get("delta") {
                            Some(d) => d,
                            None => continue,
                        };
                        if let Some(r) = delta.get("role").and_then(|v| v.as_str()) {
                            role = r.to_string();
                        }
                        if let Some(content) =
                            delta.get("content").and_then(|v| v.as_str())
                            && !content.is_empty()
                        {
                            accumulated_content.push_str(content);
                            yield Ok(LlmStreamEvent::TextDelta(content.to_string()));
                        }
                        // Tool-call deltas are merged by `index` (treated as
                        // 0 when absent): id and name are replaced when
                        // present, argument fragments are appended.
                        if let Some(tcs) = delta.get("tool_calls").and_then(|v| v.as_array()) {
                            for tc_delta in tcs {
                                let idx = tc_delta
                                    .get("index")
                                    .and_then(|n| n.as_u64())
                                    .unwrap_or(0) as usize;
                                let entry = tool_state
                                    .entry(idx)
                                    .or_insert((None, None, String::new()));
                                if let Some(id) =
                                    tc_delta.get("id").and_then(|v| v.as_str())
                                {
                                    entry.0 = Some(id.to_string());
                                }
                                if let Some(func) = tc_delta.get("function") {
                                    if let Some(name) =
                                        func.get("name").and_then(|v| v.as_str())
                                    {
                                        entry.1 = Some(name.to_string());
                                    }
                                    if let Some(args) =
                                        func.get("arguments").and_then(|v| v.as_str())
                                    {
                                        entry.2.push_str(args);
                                    }
                                }
                            }
                        }
                    }
                    // `[DONE]` only broke out of the line loop; propagate the
                    // stop through the frame loop and the chunk loop.
                    if done_seen {
                        break;
                    }
                }
                if done_seen {
                    break;
                }
            }

            // Reached on `[DONE]` or when the byte stream ends on its own;
            // either way the collected state is turned into a final message.
            let tool_calls: Option<Vec<ToolCall>> = if tool_state.is_empty() {
                None
            } else {
                let mut v = Vec::with_capacity(tool_state.len());
                for (_idx, (id, name, args)) in tool_state {
                    // Blank or unparseable argument text becomes an empty object.
                    let arguments: Value = if args.trim().is_empty() {
                        Value::Object(Default::default())
                    } else {
                        serde_json::from_str(&args).unwrap_or_else(|_| {
                            Value::Object(Default::default())
                        })
                    };
                    v.push(ToolCall {
                        id,
                        function: ToolCallFunction {
                            name: name.unwrap_or_default(),
                            arguments,
                        },
                    });
                }
                Some(v)
            };

            let message = ChatMessage {
                role,
                content: accumulated_content,
                tool_calls,
                images: None,
            };
            yield Ok(LlmStreamEvent::Done {
                message,
                prompt_eval_count: prompt_tokens,
                eval_count: completion_tokens,
            });
        };

        Ok(Box::pin(stream))
    }

    async fn generate_embeddings(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>> {
        let url = format!("{}/embeddings", self.base_url);
        let body = json!({
            "model": self.embedding_model,
            "input": texts,
        });

        let resp = self
            .client
            .post(&url)
            .json(&body)
            .send()
            .await
            .with_context(|| format!("POST {} failed", url))?;

        if !resp.status().is_success() {
            let status = resp.status();
            let body = resp.text().await.unwrap_or_default();
            bail!("llama-swap embedding request failed: {} — {}", status, body);
        }

        #[derive(Deserialize)]
        struct EmbedResponse {
            data: Vec<EmbedItem>,
        }
        #[derive(Deserialize)]
        struct EmbedItem {
            embedding: Vec<f32>,
        }

        let parsed: EmbedResponse = resp.json().await.context("parsing embed response")?;
        Ok(parsed.data.into_iter().map(|i| i.embedding).collect())
    }

    /// Ask the vision slot for a 1-2 sentence description of one image.
    ///
    /// `image_base64` is attached to the user message as-is; it is turned
    /// into a data URL during wire translation. The request goes to
    /// `self.vision_model` rather than the primary slot and carries no
    /// tools. Returns the reply's text content.
    async fn describe_image(&self, image_base64: &str) -> Result<String> {
        let prompt = "Briefly describe what you see in this image in 1-2 sentences. \
                      Focus on the people, location, and activity.";
        let system = "You are a scene description assistant. Be concise and factual.";

        let messages = vec![
            ChatMessage::system(system),
            ChatMessage {
                role: "user".to_string(),
                content: prompt.to_string(),
                tool_calls: None,
                images: Some(vec![image_base64.to_string()]),
            },
        ];

        // `self` is borrowed for the whole call, so the slot id can be passed
        // by reference; no temporary `String` clone is needed.
        let (reply, _, _) = self
            .chat_completion_with_model(&self.vision_model, messages, Vec::new())
            .await?;
        Ok(reply.content)
    }

    async fn list_models(&self) -> Result<Vec<ModelCapabilities>> {
        let url = format!("{}/models", self.base_url);
        let resp = self
            .client
            .get(&url)
            .send()
            .await
            .with_context(|| format!("GET {} failed", url))?;

        if !resp.status().is_success() {
            let status = resp.status();
            let body = resp.text().await.unwrap_or_default();
            bail!("llama-swap list_models failed: {} — {}", status, body);
        }

        let parsed: Value = resp.json().await.context("parsing models response")?;
        let data = parsed
            .get("data")
            .and_then(|v| v.as_array())
            .ok_or_else(|| anyhow!("models response missing data[]"))?;

        let caps: Vec<ModelCapabilities> = data
            .iter()
            .map(|m| self.parse_model_capabilities(m))
            .collect();

        Ok(caps)
    }

    async fn model_capabilities(&self, model: &str) -> Result<ModelCapabilities> {
        let all = self.list_models().await?;
        all.into_iter()
            .find(|m| m.name == model)
            .ok_or_else(|| anyhow!("model '{}' not found on llama-swap", model))
    }

    /// Slot id used for chat requests.
    fn primary_model(&self) -> &str {
        self.primary_model.as_str()
    }
}

impl LlamaCppClient {
    fn parse_model_capabilities(&self, m: &Value) -> ModelCapabilities {
        let name = m
            .get("id")
            .and_then(|v| v.as_str())
            .unwrap_or_default()
            .to_string();
        let has_vision = name == self.vision_model || self.vision_models.iter().any(|v| v == &name);
        // Tool calling is the default for llama-swap entries we configure
        // (--jinja flag); no negative-list mechanism yet, so report true.
        ModelCapabilities {
            name,
            has_vision,
            has_tool_calling: true,
        }
    }
}

/// Extract a diagnostic fragment from a llama-swap / llama-server response
/// that doesn't match the expected `{choices: [...]}` shape. llama-server
/// returns errors as `{"error": {"message": "...", "code": N, "type": "..."}}`;
/// llama-swap itself sometimes wraps subprocess failures with its own
/// `{"error": "..."}` flat shape. Surface either when present, otherwise fall
/// back to a truncated raw-JSON view.
fn extract_error_detail(parsed: &Value) -> String {
    if let Some(err) = parsed.get("error") {
        match err {
            Value::Object(_) => {
                let message = err
                    .get("message")
                    .and_then(|v| v.as_str())
                    .unwrap_or("(no message)");
                let code = err
                    .get("code")
                    .map(|v| match v {
                        Value::String(s) => s.clone(),
                        other => other.to_string(),
                    })
                    .unwrap_or_else(|| "?".to_string());
                let short_message: String = message.chars().take(240).collect();
                return format!("error code={} message=\"{}\"", code, short_message);
            }
            Value::String(s) => {
                let short: String = s.chars().take(240).collect();
                return format!("error=\"{}\"", short);
            }
            _ => {}
        }
    }
    let raw = parsed.to_string();
    raw.chars().take(300).collect()
}

/// Locate the first blank-line frame terminator in `buf`.
///
/// Returns an index `sep` such that `buf[..sep + 2]` is the first complete
/// frame including its whole terminator, or `None` when no terminator is
/// present yet:
/// - for `\n\n`, `sep` is the index of the first `\n`;
/// - for `\r\n\r\n`, `sep` is the index of the second `\r`, so `sep + 2`
///   lands just past the final `\n` and no stray `\n` is left behind to
///   prefix the next frame.
fn find_double_newline(buf: &[u8]) -> Option<usize> {
    for i in 0..buf.len().saturating_sub(1) {
        let rest = &buf[i..];
        if rest.starts_with(b"\n\n") {
            return Some(i);
        }
        if rest.starts_with(b"\r\n\r\n") {
            return Some(i + 2);
        }
    }
    None
}

/// Turn an image string into a `data:` URL.
///
/// Input that already starts with `data:` is returned unchanged. Anything
/// else is treated as a bare base64 payload and prefixed with
/// `data:image/jpeg;base64,`. No validation or MIME sniffing is performed.
fn image_to_data_url(img: &str) -> String {
    const JPEG_PREFIX: &str = "data:image/jpeg;base64,";

    if img.starts_with("data:") {
        return img.to_owned();
    }

    let mut url = String::with_capacity(JPEG_PREFIX.len() + img.len());
    url.push_str(JPEG_PREFIX);
    url.push_str(img);
    url
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Fixture: a `ToolCall` with the given optional id, function name and
    /// arguments.
    fn tool_call(id: Option<&str>, name: &str, arguments: Value) -> ToolCall {
        ToolCall {
            id: id.map(Into::into),
            function: ToolCallFunction {
                name: name.into(),
                arguments,
            },
        }
    }

    /// Fixture: an assistant message with empty content, no images, and the
    /// given tool calls attached.
    fn assistant_with_calls(calls: Vec<ToolCall>) -> ChatMessage {
        ChatMessage {
            role: "assistant".into(),
            content: String::new(),
            tool_calls: Some(calls),
            images: None,
        }
    }

    /// Outgoing tool-call arguments are serialized as a JSON string.
    #[test]
    fn tool_call_arguments_stringified_on_send() {
        let outgoing = assistant_with_calls(vec![tool_call(
            Some("call_abc"),
            "search_sms",
            json!({"query": "hello", "limit": 5}),
        )]);

        let wire = LlamaCppClient::messages_to_openai(&[outgoing]);
        let calls = wire[0]["tool_calls"]
            .as_array()
            .expect("tool_calls present");
        let raw_args = calls[0]["function"]["arguments"]
            .as_str()
            .expect("arguments stringified");
        let decoded: Value = serde_json::from_str(raw_args).unwrap();
        assert_eq!(decoded["query"], "hello");
        assert_eq!(decoded["limit"], 5);
    }

    /// Incoming string-encoded arguments are parsed back into structured JSON.
    #[test]
    fn tool_call_arguments_parsed_on_receive() {
        let incoming = json!({
            "role": "assistant",
            "content": "",
            "tool_calls": [{
                "id": "call_xyz",
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "arguments": "{\"city\":\"Boston\",\"units\":\"celsius\"}"
                }
            }]
        });

        let message = LlamaCppClient::openai_message_to_chat(&incoming).unwrap();
        let calls = message.tool_calls.unwrap();
        assert_eq!(calls.len(), 1);

        let call = &calls[0];
        assert_eq!(call.function.name, "get_weather");
        assert_eq!(call.function.arguments["city"], "Boston");
        assert_eq!(call.function.arguments["units"], "celsius");
        assert_eq!(call.id.as_deref(), Some("call_xyz"));
    }

    /// Arguments that arrive as a native JSON object are accepted as well.
    #[test]
    fn tool_call_arguments_accept_native_json_on_receive() {
        // Some llama.cpp builds emit arguments as a JSON object directly when
        // jinja's tool-output strict-string rule isn't applied — accept both.
        let incoming = json!({
            "role": "assistant",
            "content": "",
            "tool_calls": [{
                "id": "call_1",
                "type": "function",
                "function": {
                    "name": "foo",
                    "arguments": {"nested": {"k": 1}}
                }
            }]
        });

        let message = LlamaCppClient::openai_message_to_chat(&incoming).unwrap();
        let call = &message.tool_calls.unwrap()[0];
        assert_eq!(call.function.arguments["nested"]["k"], 1);
    }

    /// A message with text and an image becomes a two-part content array.
    #[test]
    fn images_become_content_parts() {
        let mut question = ChatMessage::user("What is in this photo?");
        question.images = Some(vec!["BASE64DATA".into()]);

        let wire = LlamaCppClient::messages_to_openai(&[question]);
        let parts = wire[0]["content"].as_array().unwrap();
        assert_eq!(parts.len(), 2);

        let (text_part, image_part) = (&parts[0], &parts[1]);
        assert_eq!(text_part["type"], "text");
        assert_eq!(text_part["text"], "What is in this photo?");
        assert_eq!(image_part["type"], "image_url");
        assert_eq!(
            image_part["image_url"]["url"],
            "data:image/jpeg;base64,BASE64DATA"
        );
    }

    /// Images already given as `data:` URLs are forwarded verbatim, and an
    /// empty text body yields no text part.
    #[test]
    fn data_url_images_pass_through_unchanged() {
        let mut image_only = ChatMessage::user("");
        image_only.images = Some(vec!["data:image/png;base64,ABCDEF".into()]);

        let wire = LlamaCppClient::messages_to_openai(&[image_only]);
        let parts = wire[0]["content"].as_array().unwrap();
        assert_eq!(parts.len(), 1);
        assert_eq!(
            parts[0]["image_url"]["url"],
            "data:image/png;base64,ABCDEF"
        );
    }

    /// Without images, `content` stays a plain JSON string.
    #[test]
    fn text_only_message_stays_string() {
        let wire = LlamaCppClient::messages_to_openai(&[ChatMessage::user("hello")]);
        let content = &wire[0]["content"];
        assert_eq!(*content, "hello");
        assert!(content.as_str().is_some());
    }

    /// A tool result picks up the id of the preceding assistant tool call.
    #[test]
    fn tool_result_inherits_tool_call_id_from_prior_assistant() {
        let conversation = [
            assistant_with_calls(vec![tool_call(Some("call_42"), "lookup", json!({}))]),
            ChatMessage::tool_result("found it"),
        ];

        let wire = LlamaCppClient::messages_to_openai(&conversation);
        assert_eq!(wire[1]["role"], "tool");
        assert_eq!(wire[1]["tool_call_id"], "call_42");
    }

    /// Consecutive tool results are matched to the assistant's calls in order.
    #[test]
    fn multiple_tool_results_map_to_sequential_call_ids() {
        let conversation = [
            assistant_with_calls(vec![
                tool_call(Some("call_A"), "a", json!({})),
                tool_call(Some("call_B"), "b", json!({})),
            ]),
            ChatMessage::tool_result("a result"),
            ChatMessage::tool_result("b result"),
        ];

        let wire = LlamaCppClient::messages_to_openai(&conversation);
        assert_eq!(wire[1]["tool_call_id"], "call_A");
        assert_eq!(wire[2]["tool_call_id"], "call_B");
    }

    /// A tool call without an id is sent with a synthetic `call_<index>` id.
    #[test]
    fn missing_tool_call_id_gets_synthetic_fallback() {
        let outgoing = assistant_with_calls(vec![tool_call(None, "noid", json!({}))]);

        let wire = LlamaCppClient::messages_to_openai(&[outgoing]);
        let calls = wire[0]["tool_calls"].as_array().unwrap();
        assert_eq!(calls[0]["id"], "call_0");
    }

    /// `has_vision` is set for the configured vision slot and for allowlisted
    /// ids only; the chat slot still reports tool calling.
    #[test]
    fn capability_inference_uses_vision_model_and_allowlist() {
        let mut client = LlamaCppClient::new(None, Some("chat".into()));
        client.set_vision_model("vision".into());
        client.set_vision_models(vec!["qwen-vl".into()]);

        let chat_entry = json!({ "id": "chat" });
        let vision_entry = json!({ "id": "vision" });
        let allowlisted_entry = json!({ "id": "qwen-vl" });
        let embed_entry = json!({ "id": "embed" });

        let chat = client.parse_model_capabilities(&chat_entry);
        assert!(!chat.has_vision);
        assert!(chat.has_tool_calling);

        assert!(client.parse_model_capabilities(&vision_entry).has_vision);
        assert!(client.parse_model_capabilities(&allowlisted_entry).has_vision);
        assert!(!client.parse_model_capabilities(&embed_entry).has_vision);
    }
}