Feature/unified nl search #106

Open
cameron wants to merge 26 commits from feature/unified-nl-search into master
8 changed files with 55 additions and 0 deletions
Showing only changes of commit 48a1b753f0 - Show all commits
+6
View File
@@ -41,6 +41,10 @@ pub struct SamplingOverrides {
pub top_p: Option<f32>, pub top_p: Option<f32>,
pub top_k: Option<i32>, pub top_k: Option<i32>,
pub min_p: Option<f32>, pub min_p: Option<f32>,
/// Reasoning toggle. Only the llama.cpp backend honors it (forwarded as
/// `chat_template_kwargs.enable_thinking`); other backends ignore it.
/// `None` leaves the model/template default in place.
pub enable_thinking: Option<bool>,
} }
impl SamplingOverrides { impl SamplingOverrides {
@@ -124,6 +128,7 @@ mod tests {
top_p: None, top_p: None,
top_k: None, top_k: None,
min_p: None, min_p: None,
enable_thinking: None,
}; };
assert!(!empty.has_sampling()); assert!(!empty.has_sampling());
@@ -134,6 +139,7 @@ mod tests {
top_p: None, top_p: None,
top_k: None, top_k: None,
min_p: None, min_p: None,
enable_thinking: None,
}; };
assert!(with_temp.has_sampling()); assert!(with_temp.has_sampling());
} }
+15
View File
@@ -40,6 +40,12 @@ pub struct GeneratePhotoInsightRequest {
pub top_k: Option<i32>, pub top_k: Option<i32>,
#[serde(default)] #[serde(default)]
pub min_p: Option<f32>, pub min_p: Option<f32>,
/// Reasoning toggle for thinking-capable models. Forwarded to the
/// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored
/// by other backends and the non-agentic (Ollama) path. Only the agentic
/// endpoint routes through llama.cpp. `None` defers to the template default.
#[serde(default)]
pub enable_thinking: Option<bool>,
/// `"local"` (default, Ollama with images) | `"hybrid"` (local vision + /// `"local"` (default, Ollama with images) | `"hybrid"` (local vision +
/// OpenRouter chat). Only respected by the agentic endpoint. /// OpenRouter chat). Only respected by the agentic endpoint.
#[serde(default)] #[serde(default)]
@@ -868,6 +874,7 @@ pub async fn generate_agentic_insight_handler(
request.top_p, request.top_p,
request.top_k, request.top_k,
request.min_p, request.min_p,
request.enable_thinking,
max_iterations, max_iterations,
request.backend.clone(), request.backend.clone(),
fewshot_examples, fewshot_examples,
@@ -1169,6 +1176,11 @@ pub struct ChatTurnHttpRequest {
pub top_k: Option<i32>, pub top_k: Option<i32>,
#[serde(default)] #[serde(default)]
pub min_p: Option<f32>, pub min_p: Option<f32>,
/// Reasoning toggle for thinking-capable models. Forwarded to the
/// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored
/// by other backends. `None` defers to the model/template default.
#[serde(default)]
pub enable_thinking: Option<bool>,
#[serde(default)] #[serde(default)]
pub max_iterations: Option<usize>, pub max_iterations: Option<usize>,
/// Per-turn system-prompt override. Ephemeral in append mode, /// Per-turn system-prompt override. Ephemeral in append mode,
@@ -1247,6 +1259,7 @@ pub async fn chat_turn_handler(
top_p: request.top_p, top_p: request.top_p,
top_k: request.top_k, top_k: request.top_k,
min_p: request.min_p, min_p: request.min_p,
enable_thinking: request.enable_thinking,
max_iterations: request.max_iterations, max_iterations: request.max_iterations,
system_prompt: request.system_prompt.clone(), system_prompt: request.system_prompt.clone(),
persona_id: request.persona_id.clone(), persona_id: request.persona_id.clone(),
@@ -1473,6 +1486,7 @@ pub async fn chat_stream_handler(
top_p: request.top_p, top_p: request.top_p,
top_k: request.top_k, top_k: request.top_k,
min_p: request.min_p, min_p: request.min_p,
enable_thinking: request.enable_thinking,
max_iterations: request.max_iterations, max_iterations: request.max_iterations,
system_prompt: request.system_prompt.clone(), system_prompt: request.system_prompt.clone(),
persona_id: request.persona_id.clone(), persona_id: request.persona_id.clone(),
@@ -1618,6 +1632,7 @@ pub async fn turn_async_handler(
top_p: request.top_p, top_p: request.top_p,
top_k: request.top_k, top_k: request.top_k,
min_p: request.min_p, min_p: request.min_p,
enable_thinking: request.enable_thinking,
max_iterations: request.max_iterations, max_iterations: request.max_iterations,
system_prompt: request.system_prompt.clone(), system_prompt: request.system_prompt.clone(),
persona_id: request.persona_id.clone(), persona_id: request.persona_id.clone(),
+9
View File
@@ -70,6 +70,10 @@ pub struct ChatTurnRequest {
pub top_p: Option<f32>, pub top_p: Option<f32>,
pub top_k: Option<i32>, pub top_k: Option<i32>,
pub min_p: Option<f32>, pub min_p: Option<f32>,
/// Reasoning toggle for thinking-capable models. Forwarded to the
/// llama.cpp backend as `chat_template_kwargs.enable_thinking`; ignored
/// by other backends. `None` defers to the model/template default.
pub enable_thinking: Option<bool>,
pub max_iterations: Option<usize>, pub max_iterations: Option<usize>,
/// Per-turn system-prompt override. In append mode (default), applied /// Per-turn system-prompt override. In append mode (default), applied
/// ephemerally — original system message restored before persistence. /// ephemerally — original system message restored before persistence.
@@ -344,6 +348,7 @@ impl InsightChatService {
top_p: req.top_p, top_p: req.top_p,
top_k: req.top_k, top_k: req.top_k,
min_p: req.min_p, min_p: req.min_p,
enable_thinking: req.enable_thinking,
}; };
let backend = self.generator.resolve_backend(kind, &overrides).await?; let backend = self.generator.resolve_backend(kind, &overrides).await?;
let model_used = backend.model().to_string(); let model_used = backend.model().to_string();
@@ -847,6 +852,7 @@ impl InsightChatService {
top_p: req.top_p, top_p: req.top_p,
top_k: req.top_k, top_k: req.top_k,
min_p: req.min_p, min_p: req.min_p,
enable_thinking: req.enable_thinking,
}; };
let backend = self.generator.resolve_backend(kind, &overrides).await?; let backend = self.generator.resolve_backend(kind, &overrides).await?;
let model_used = backend.model().to_string(); let model_used = backend.model().to_string();
@@ -1017,6 +1023,7 @@ impl InsightChatService {
top_p: req.top_p, top_p: req.top_p,
top_k: req.top_k, top_k: req.top_k,
min_p: req.min_p, min_p: req.min_p,
enable_thinking: req.enable_thinking,
}; };
let backend = self.generator.resolve_backend(kind, &overrides).await?; let backend = self.generator.resolve_backend(kind, &overrides).await?;
let model_used = backend.model().to_string(); let model_used = backend.model().to_string();
@@ -1425,6 +1432,7 @@ impl InsightChatService {
top_p: req.top_p, top_p: req.top_p,
top_k: req.top_k, top_k: req.top_k,
min_p: req.min_p, min_p: req.min_p,
enable_thinking: req.enable_thinking,
}; };
let backend = self.generator.resolve_backend(kind, &overrides).await?; let backend = self.generator.resolve_backend(kind, &overrides).await?;
let model_used = backend.model().to_string(); let model_used = backend.model().to_string();
@@ -1607,6 +1615,7 @@ impl InsightChatService {
top_p: req.top_p, top_p: req.top_p,
top_k: req.top_k, top_k: req.top_k,
min_p: req.min_p, min_p: req.min_p,
enable_thinking: req.enable_thinking,
}; };
let backend = self.generator.resolve_backend(kind, &overrides).await?; let backend = self.generator.resolve_backend(kind, &overrides).await?;
let model_used = backend.model().to_string(); let model_used = backend.model().to_string();
+3
View File
@@ -3933,6 +3933,7 @@ Return ONLY the summary, nothing else."#,
if let Some(ctx) = overrides.num_ctx { if let Some(ctx) = overrides.num_ctx {
c.set_num_ctx(Some(ctx)); c.set_num_ctx(Some(ctx));
} }
c.set_enable_thinking(overrides.enable_thinking);
Box::new(c) Box::new(c)
} else { } else {
// Pure Ollama local. // Pure Ollama local.
@@ -4064,6 +4065,7 @@ Return ONLY the summary, nothing else."#,
top_p: Option<f32>, top_p: Option<f32>,
top_k: Option<i32>, top_k: Option<i32>,
min_p: Option<f32>, min_p: Option<f32>,
enable_thinking: Option<bool>,
max_iterations: usize, max_iterations: usize,
backend: Option<String>, backend: Option<String>,
fewshot_examples: Vec<Vec<ChatMessage>>, fewshot_examples: Vec<Vec<ChatMessage>>,
@@ -4091,6 +4093,7 @@ Return ONLY the summary, nothing else."#,
top_p, top_p,
top_k, top_k,
min_p, min_p,
enable_thinking,
}; };
let backend = self.resolve_backend(kind, &overrides).await?; let backend = self.resolve_backend(kind, &overrides).await?;
span.set_attribute(KeyValue::new("model", backend.model().to_string())); span.set_attribute(KeyValue::new("model", backend.model().to_string()));
+19
View File
@@ -64,6 +64,12 @@ pub struct LlamaCppClient {
top_p: Option<f32>, top_p: Option<f32>,
top_k: Option<i32>, top_k: Option<i32>,
min_p: Option<f32>, min_p: Option<f32>,
/// When `Some`, forwarded to llama-server as
/// `chat_template_kwargs: {"enable_thinking": <bool>}`. The Jinja chat
/// template (e.g. Qwen3) reads this to gate its reasoning block. `None`
/// omits the key entirely, leaving the template's own default. Templates
/// that don't reference the key ignore it, so sending it is harmless.
enable_thinking: Option<bool>,
} }
impl LlamaCppClient { impl LlamaCppClient {
@@ -89,6 +95,7 @@ impl LlamaCppClient {
top_p: None, top_p: None,
top_k: None, top_k: None,
min_p: None, min_p: None,
enable_thinking: None,
} }
} }
@@ -104,6 +111,12 @@ impl LlamaCppClient {
self.num_ctx = num_ctx; self.num_ctx = num_ctx;
} }
/// Store the reasoning ("thinking") toggle on this client.
///
/// The value is only recorded here; it takes effect when request
/// parameters are built. `Some(flag)` is forwarded to llama-server as
/// `chat_template_kwargs: {"enable_thinking": flag}`, while `None` omits
/// the key entirely so the chat template's own default stays in place.
/// Calling this again overwrites the previously stored value.
pub fn set_enable_thinking(&mut self, enable_thinking: Option<bool>) {
self.enable_thinking = enable_thinking;
}
pub fn set_sampling_params( pub fn set_sampling_params(
&mut self, &mut self,
temperature: Option<f32>, temperature: Option<f32>,
@@ -458,6 +471,12 @@ impl LlamaCppClient {
// via -c, so we silently drop the override here. The config.yaml // via -c, so we silently drop the override here. The config.yaml
// entry is the source of truth for context size. // entry is the source of truth for context size.
let _ = self.num_ctx; let _ = self.num_ctx;
// Reasoning toggle for thinking-capable templates (Qwen3 et al.).
// llama-server forwards chat_template_kwargs into the Jinja render
// (requires --jinja); templates that ignore the key are unaffected.
if let Some(think) = self.enable_thinking {
v.push(("chat_template_kwargs", json!({ "enable_thinking": think })));
}
v v
} }
+1
View File
@@ -336,6 +336,7 @@ async fn main() -> anyhow::Result<()> {
args.top_p, args.top_p,
args.top_k, args.top_k,
args.min_p, args.min_p,
None, // enable_thinking: leave model/template default
args.max_iterations, args.max_iterations,
None, None,
Vec::new(), Vec::new(),
+1
View File
@@ -309,6 +309,7 @@ pub async fn generate_script_agentic(
top_p: None, top_p: None,
top_k: None, top_k: None,
min_p: None, min_p: None,
enable_thinking: None,
}, },
) )
.await .await
+1
View File
@@ -193,6 +193,7 @@ pub async fn unified_search<TagD: TagDao>(
top_p: None, top_p: None,
top_k: None, top_k: None,
min_p: None, min_p: None,
enable_thinking: None,
}; };
let backend = match state let backend = match state
.insight_generator .insight_generator