//! Bundle of the local LLM pair (Ollama + optional llama-swap) with the //! `LLM_BACKEND` dispatch baked in. //! //! Exists because passing the pair around as loose values invited the same //! bug three times: import/backfill tooling embedded corpora via //! `OllamaClient` directly while the query side dispatched through //! `embed_one`, so flipping `LLM_BACKEND=llamacpp` silently split queries //! and corpus into different vector spaces. Anything that writes or reads //! embeddings should go through this type (or `embed_one`/`embed_many`), //! never a concrete client. //! //! Deliberately knows nothing about chat policy — hybrid/OpenRouter routing //! is request-scoped and stays in `ResolvedBackend`. This is only the //! local stack: embeddings and offline single-shot generation. // Constructed by binaries, not the server — dead code from main.rs's view. #![allow(dead_code)] use std::sync::Arc; use anyhow::Result; use super::llamacpp::LlamaCppClient; use super::llm_client::LlmClient; use super::ollama::{EMBEDDING_MODEL, OllamaClient}; #[derive(Clone)] pub struct LocalLlm { ollama: OllamaClient, llamacpp: Option>, } impl LocalLlm { pub fn new(ollama: OllamaClient, llamacpp: Option>) -> Self { Self { ollama, llamacpp } } /// Construct from the canonical env wiring shared with `AppState`. pub fn from_env() -> Self { Self::new( crate::state::build_ollama_from_env(), crate::state::build_llamacpp_from_env(), ) } /// Embed a search query (applies `EMBED_QUERY_PREFIX`). Callers must /// pick query vs document — retrieval models treat the two sides /// differently and an unmarked embed invites prefix-mismatch bugs. pub async fn embed_query(&self, text: &str) -> Result> { super::embed_query(&self.ollama, self.llamacpp.as_deref(), text).await } /// Embed corpus text (applies `EMBED_DOCUMENT_PREFIX`). pub async fn embed_document(&self, text: &str) -> Result> { super::embed_document(&self.ollama, self.llamacpp.as_deref(), text).await } /// Single-shot local text generation via the `LLM_BACKEND`-selected /// client (offline tooling; chat turns belong to `ResolvedBackend`). pub async fn generate(&self, prompt: &str, system: Option<&str>) -> Result { if super::local_backend_is_llamacpp() { if let Some(lc) = self.llamacpp.as_deref() { return ::generate(lc, prompt, system, None).await; } anyhow::bail!( "LLM_BACKEND=llamacpp but LlamaCppClient is unconfigured — \ set LLAMA_SWAP_URL or switch to LLM_BACKEND=ollama" ); } self.ollama.generate(prompt, system).await } /// Label identifying which backend + model produces embeddings right /// now. Store it alongside vectors (`model_version` columns) so a /// backend flip is detectable in the data, not just in env history. pub fn embedding_model_version(&self) -> String { if super::local_backend_is_llamacpp() { let slot = self .llamacpp .as_deref() .map(|c| c.embedding_model.as_str()) .unwrap_or("embed"); format!("llama-swap:{}", slot) } else { EMBEDDING_MODEL.to_string() } } }