475072810e
Adds an optional `enable_thinking` field to `SamplingOverrides`, forwarded to llama-server as `chat_template_kwargs.enable_thinking` (gates Qwen3-style reasoning blocks). `None` leaves the template default; other backends ignore it. Wired through the agentic-insight and chat-turn request bodies/handlers. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
147 lines
3.8 KiB
Rust
147 lines
3.8 KiB
Rust
use anyhow::{Result, anyhow};
|
|
|
|
use crate::ai::llm_client::LlmClient;
|
|
|
|
/// Backend kind, as carried by `ResolvedBackend::kind`.
///
/// The textual forms are `"local"` and `"hybrid"`; see `BackendKind::parse`
/// and `BackendKind::as_str`.
#[derive(Clone, Copy, Debug, Eq, PartialEq)]
pub enum BackendKind {
    /// Textual form `"local"`; also what `parse` yields for an empty string.
    Local,
    /// Textual form `"hybrid"`.
    Hybrid,
}
|
|
|
|
impl BackendKind {
|
|
pub fn parse(s: &str) -> Result<Self> {
|
|
match s.trim().to_lowercase().as_str() {
|
|
"local" | "" => Ok(Self::Local),
|
|
"hybrid" => Ok(Self::Hybrid),
|
|
other => Err(anyhow!(
|
|
"unknown backend '{}'; expected 'local' or 'hybrid'",
|
|
other
|
|
)),
|
|
}
|
|
}
|
|
|
|
pub fn as_str(&self) -> &'static str {
|
|
match self {
|
|
Self::Local => "local",
|
|
Self::Hybrid => "hybrid",
|
|
}
|
|
}
|
|
}
|
|
|
|
impl std::fmt::Display for BackendKind {
|
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
|
f.write_str(self.as_str())
|
|
}
|
|
}
|
|
|
|
/// Optional overrides for a model call; every field may be left as `None`.
///
/// `Default` yields a value with all fields `None`, so callers can set only
/// the fields they care about with `..Default::default()`.
#[derive(Debug, Clone, Default, PartialEq)]
pub struct SamplingOverrides {
    /// Model override, if any.
    pub model: Option<String>,
    /// `num_ctx` override, if any (presumably the context size — confirm
    /// against the backend client that consumes it).
    pub num_ctx: Option<i32>,
    /// Temperature override; counted by `has_sampling`.
    pub temperature: Option<f32>,
    /// `top_p` override; counted by `has_sampling`.
    pub top_p: Option<f32>,
    /// `top_k` override; counted by `has_sampling`.
    pub top_k: Option<i32>,
    /// `min_p` override; counted by `has_sampling`.
    pub min_p: Option<f32>,
    /// Reasoning toggle. Only the llama.cpp backend honors it (forwarded as
    /// `chat_template_kwargs.enable_thinking`); other backends ignore it.
    /// `None` leaves the model/template default in place.
    pub enable_thinking: Option<bool>,
}
|
|
|
|
impl SamplingOverrides {
|
|
pub fn has_sampling(&self) -> bool {
|
|
self.temperature.is_some()
|
|
|| self.top_p.is_some()
|
|
|| self.top_k.is_some()
|
|
|| self.min_p.is_some()
|
|
}
|
|
}
|
|
|
|
pub struct ResolvedBackend {
|
|
chat: Box<dyn LlmClient>,
|
|
local: Box<dyn LlmClient>,
|
|
pub kind: BackendKind,
|
|
/// `true` when the chat model receives images directly (Ollama with
|
|
/// vision, or llamacpp). `false` for hybrid where we describe-then-inline.
|
|
pub images_inline: bool,
|
|
}
|
|
|
|
impl ResolvedBackend {
|
|
pub fn new(
|
|
chat: Box<dyn LlmClient>,
|
|
local: Box<dyn LlmClient>,
|
|
kind: BackendKind,
|
|
images_inline: bool,
|
|
) -> Self {
|
|
Self {
|
|
chat,
|
|
local,
|
|
kind,
|
|
images_inline,
|
|
}
|
|
}
|
|
|
|
pub fn chat(&self) -> &dyn LlmClient {
|
|
self.chat.as_ref()
|
|
}
|
|
|
|
pub fn local(&self) -> &dyn LlmClient {
|
|
self.local.as_ref()
|
|
}
|
|
|
|
pub fn model(&self) -> &str {
|
|
self.chat.primary_model()
|
|
}
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Overrides with every field unset; a base for struct-update syntax so
    /// each test spells out only the fields it cares about.
    fn no_overrides() -> SamplingOverrides {
        SamplingOverrides {
            model: None,
            num_ctx: None,
            temperature: None,
            top_p: None,
            top_k: None,
            min_p: None,
            enable_thinking: None,
        }
    }

    #[test]
    fn parse_backend_kind() {
        let accepted = [
            ("local", BackendKind::Local),
            ("hybrid", BackendKind::Hybrid),
            (" Local ", BackendKind::Local),
            ("HYBRID", BackendKind::Hybrid),
            ("", BackendKind::Local),
            // Whitespace-only input trims down to the empty string.
            ("   ", BackendKind::Local),
        ];
        for (input, expected) in accepted.iter() {
            assert_eq!(
                BackendKind::parse(input).unwrap(),
                *expected,
                "input {:?}",
                input
            );
        }

        assert!(BackendKind::parse("vllm").is_err());
    }

    #[test]
    fn parse_error_names_the_input() {
        let message = BackendKind::parse(" VLLM ").unwrap_err().to_string();
        assert!(message.contains("vllm"), "unexpected message: {}", message);
    }

    #[test]
    fn backend_kind_as_str_roundtrips() {
        for kind in [BackendKind::Local, BackendKind::Hybrid].iter() {
            assert_eq!(BackendKind::parse(kind.as_str()).unwrap(), *kind);
        }
    }

    #[test]
    fn backend_kind_display_matches_as_str() {
        assert_eq!(BackendKind::Local.to_string(), "local");
        assert_eq!(BackendKind::Hybrid.to_string(), "hybrid");
    }

    #[test]
    fn sampling_overrides_has_sampling() {
        assert!(!no_overrides().has_sampling());

        let with_temp = SamplingOverrides {
            num_ctx: Some(4096),
            temperature: Some(0.7),
            ..no_overrides()
        };
        assert!(with_temp.has_sampling());
    }

    #[test]
    fn each_sampling_field_counts_on_its_own() {
        let only_top_p = SamplingOverrides {
            top_p: Some(0.9),
            ..no_overrides()
        };
        let only_top_k = SamplingOverrides {
            top_k: Some(40),
            ..no_overrides()
        };
        let only_min_p = SamplingOverrides {
            min_p: Some(0.05),
            ..no_overrides()
        };
        assert!(only_top_p.has_sampling());
        assert!(only_top_k.has_sampling());
        assert!(only_min_p.has_sampling());
    }

    #[test]
    fn non_sampling_fields_do_not_count() {
        let non_sampling = SamplingOverrides {
            model: Some("some-model".to_string()),
            num_ctx: Some(4096),
            enable_thinking: Some(true),
            ..no_overrides()
        };
        assert!(!non_sampling.has_sampling());
    }
}
|