ImageApi/src/ai/local_llm.rs

//! Bundle of the local LLM pair (Ollama + optional llama-swap) with the
//! `LLM_BACKEND` dispatch baked in.
//!
//! Exists because passing the pair around as loose values invited the same
//! bug three times: import/backfill tooling embedded corpora via
//! `OllamaClient` directly while the query side dispatched through
//! `embed_one`, so flipping `LLM_BACKEND=llamacpp` silently split queries
//! and corpus into different vector spaces. Anything that writes or reads
//! embeddings should go through this type (or `embed_one`/`embed_many`),
//! never a concrete client.
//!
//! Deliberately knows nothing about chat policy — hybrid/OpenRouter routing
//! is request-scoped and stays in `ResolvedBackend`. This is only the
//! local stack: embeddings and offline single-shot generation.

// Constructed by binaries, not the server — dead code from main.rs's view.
#![allow(dead_code)]

use std::sync::Arc;

use anyhow::Result;

use super::llamacpp::LlamaCppClient;
use super::llm_client::LlmClient;
use super::ollama::{EMBEDDING_MODEL, OllamaClient};

/// The local LLM pair: an Ollama client plus an optional llama-swap
/// (llama.cpp) client.
///
/// Methods on this type consult `super::local_backend_is_llamacpp()` to
/// decide which of the two clients serves text generation and which label
/// describes the active embedding model; embedding calls are forwarded to
/// the `super::embed_query` / `super::embed_document` helpers with both
/// clients.
///
/// The llama-swap client is held behind an `Arc`, so clones of this bundle
/// share that one client instance.
#[derive(Clone)]
pub struct LocalLlm {
    /// Ollama client; always present.
    ollama: OllamaClient,
    /// llama-swap client; `None` when it has not been configured.
    llamacpp: Option<Arc<LlamaCppClient>>,
}

impl LocalLlm {
    pub fn new(ollama: OllamaClient, llamacpp: Option<Arc<LlamaCppClient>>) -> Self {
        Self { ollama, llamacpp }
    }

    /// Construct from the canonical env wiring shared with `AppState`.
    pub fn from_env() -> Self {
        Self::new(
            crate::state::build_ollama_from_env(),
            crate::state::build_llamacpp_from_env(),
        )
    }

    /// Embed a search query (applies `EMBED_QUERY_PREFIX`). Callers must
    /// pick query vs document — retrieval models treat the two sides
    /// differently and an unmarked embed invites prefix-mismatch bugs.
    pub async fn embed_query(&self, text: &str) -> Result<Vec<f32>> {
        super::embed_query(&self.ollama, self.llamacpp.as_deref(), text).await
    }

    /// Embed corpus text (applies `EMBED_DOCUMENT_PREFIX`).
    pub async fn embed_document(&self, text: &str) -> Result<Vec<f32>> {
        super::embed_document(&self.ollama, self.llamacpp.as_deref(), text).await
    }

    /// Single-shot local text generation via the `LLM_BACKEND`-selected
    /// client (offline tooling; chat turns belong to `ResolvedBackend`).
    pub async fn generate(&self, prompt: &str, system: Option<&str>) -> Result<String> {
        if super::local_backend_is_llamacpp() {
            if let Some(lc) = self.llamacpp.as_deref() {
                return <LlamaCppClient as LlmClient>::generate(lc, prompt, system, None).await;
            }
            anyhow::bail!(
                "LLM_BACKEND=llamacpp but LlamaCppClient is unconfigured — \
                 set LLAMA_SWAP_URL or switch to LLM_BACKEND=ollama"
            );
        }
        self.ollama.generate(prompt, system).await
    }

    /// Label identifying which backend + model produces embeddings right
    /// now. Store it alongside vectors (`model_version` columns) so a
    /// backend flip is detectable in the data, not just in env history.
    pub fn embedding_model_version(&self) -> String {
        if super::local_backend_is_llamacpp() {
            let slot = self
                .llamacpp
                .as_deref()
                .map(|c| c.embedding_model.as_str())
                .unwrap_or("embed");
            format!("llama-swap:{}", slot)
        } else {
            EMBEDDING_MODEL.to_string()
        }
    }
}