feat(ai): chat rewind + ollama metrics logging
Rewind: POST /insights/chat/rewind truncates training_messages at a given rendered index, dropping the target message plus any preceding tool-call scaffolding. The initial user prompt is protected.

Metrics: log prompt_eval_count/duration and eval_count/duration from every Ollama chat response, rendered as tokens + ms + tok/s.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -691,6 +691,17 @@ Analyze the image and use specific details from both the visual content and the
|
||||
.await
|
||||
.with_context(|| "Failed to parse Ollama chat response")?;
|
||||
|
||||
// Log performance counters returned by Ollama. Durations are
|
||||
// reported in nanoseconds; we render ms + tokens/sec for skim-ability
|
||||
// in the server log. Missing fields are left off the line rather
|
||||
// than printed as `None`.
|
||||
log_chat_metrics(
|
||||
chat_response.prompt_eval_count,
|
||||
chat_response.prompt_eval_duration,
|
||||
chat_response.eval_count,
|
||||
chat_response.eval_duration,
|
||||
);
|
||||
|
||||
Ok((
|
||||
chat_response.message,
|
||||
chat_response.prompt_eval_count,
|
||||
@@ -915,8 +926,14 @@ struct OllamaChatResponse {
|
||||
done_reason: String,
|
||||
#[serde(default)]
|
||||
prompt_eval_count: Option<i32>,
|
||||
/// Nanoseconds spent evaluating the prompt (context ingestion).
|
||||
#[serde(default)]
|
||||
prompt_eval_duration: Option<u64>,
|
||||
#[serde(default)]
|
||||
eval_count: Option<i32>,
|
||||
/// Nanoseconds spent generating the response tokens.
|
||||
#[serde(default)]
|
||||
eval_duration: Option<u64>,
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
@@ -924,6 +941,52 @@ struct OllamaResponse {
|
||||
response: String,
|
||||
}
|
||||
|
||||
fn log_chat_metrics(
|
||||
prompt_eval_count: Option<i32>,
|
||||
prompt_eval_duration_ns: Option<u64>,
|
||||
eval_count: Option<i32>,
|
||||
eval_duration_ns: Option<u64>,
|
||||
) {
|
||||
// Compute tokens/sec when both count and duration are present.
|
||||
fn tokens_per_sec(count: Option<i32>, duration_ns: Option<u64>) -> Option<f64> {
|
||||
match (count, duration_ns) {
|
||||
(Some(c), Some(d)) if c > 0 && d > 0 => Some((c as f64) * 1_000_000_000.0 / (d as f64)),
|
||||
_ => None,
|
||||
}
|
||||
}
|
||||
let prompt_ms = prompt_eval_duration_ns.map(|ns| ns as f64 / 1_000_000.0);
|
||||
let eval_ms = eval_duration_ns.map(|ns| ns as f64 / 1_000_000.0);
|
||||
let prompt_tps = tokens_per_sec(prompt_eval_count, prompt_eval_duration_ns);
|
||||
let eval_tps = tokens_per_sec(eval_count, eval_duration_ns);
|
||||
|
||||
let mut parts: Vec<String> = Vec::new();
|
||||
if let Some(c) = prompt_eval_count {
|
||||
let mut s = format!("prompt={} tok", c);
|
||||
if let Some(ms) = prompt_ms {
|
||||
s.push_str(&format!(" ({:.0} ms", ms));
|
||||
if let Some(tps) = prompt_tps {
|
||||
s.push_str(&format!(", {:.1} tok/s", tps));
|
||||
}
|
||||
s.push(')');
|
||||
}
|
||||
parts.push(s);
|
||||
}
|
||||
if let Some(c) = eval_count {
|
||||
let mut s = format!("gen={} tok", c);
|
||||
if let Some(ms) = eval_ms {
|
||||
s.push_str(&format!(" ({:.0} ms", ms));
|
||||
if let Some(tps) = eval_tps {
|
||||
s.push_str(&format!(", {:.1} tok/s", tps));
|
||||
}
|
||||
s.push(')');
|
||||
}
|
||||
parts.push(s);
|
||||
}
|
||||
if !parts.is_empty() {
|
||||
log::info!("Ollama chat metrics — {}", parts.join(", "));
|
||||
}
|
||||
}
|
||||
|
||||
#[derive(Deserialize)]
|
||||
struct OllamaTagsResponse {
|
||||
models: Vec<OllamaModel>,
|
||||
|
||||
Reference in New Issue
Block a user