feat(ai): streaming chat endpoint with live tool events

Add LlmClient::chat_with_tools_stream and SSE endpoint
POST /insights/chat/stream that emits text deltas, tool_call /
tool_result pairs, a truncated notice, and a terminal done frame as the
agentic loop runs.

- Ollama: parses NDJSON from /api/chat stream, accumulates content
  deltas, emits Done with tool_calls from the final chunk.
- OpenRouter: parses OpenAI-compatible SSE, reassembles tool_call
  argument deltas by index, asks for stream_options.include_usage.
- InsightChatService spawns the loop on a tokio task, feeds events
  through an mpsc channel, persists training_messages at the end.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
commit 079cd4c5b9 (parent c2bd3c08e1)
Author: Cameron
Date:   2026-04-21 16:57:41 -04:00

9 changed files with 1071 additions and 9 deletions
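
The OpenRouter client is among the changed files but is not shown in this view. A minimal sketch of the by-index reassembly the second bullet describes, with hypothetical names (PartialToolCall, apply_tool_call_delta), assuming OpenAI-style deltas where tool-call arguments arrive as string fragments:

// Hypothetical shapes; the real ones live in the OpenRouter client.
#[derive(Default)]
struct PartialToolCall {
    id: String,
    name: String,
    arguments: String, // JSON accumulates as raw string fragments
}

// Called once per SSE delta. `index` says which in-flight call a fragment
// belongs to, since several tool calls can interleave in one stream.
fn apply_tool_call_delta(
    partials: &mut Vec<PartialToolCall>,
    index: usize,
    id: Option<&str>,
    name: Option<&str>,
    args_fragment: Option<&str>,
) {
    if partials.len() <= index {
        partials.resize_with(index + 1, PartialToolCall::default);
    }
    let call = &mut partials[index];
    if let Some(id) = id {
        call.id = id.to_string();
    }
    if let Some(name) = name {
        call.name = name.to_string();
    }
    if let Some(frag) = args_fragment {
        call.arguments.push_str(frag);
    }
}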

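Likewise, InsightChatService itself is not shown below; a rough sketch of the spawn-plus-channel wiring from the third bullet, with a hypothetical ChatEvent type standing in for the real frames:

use tokio::sync::mpsc;

// Hypothetical event type mirroring the frames named in the subject line.
enum ChatEvent {
    TextDelta(String),
    ToolCall(String),
    ToolResult(String),
    Truncated,
    Done,
}

// The agentic loop runs on its own task; the HTTP handler just drains the
// receiver and writes each event out as an SSE frame.
fn spawn_chat_loop() -> mpsc::Receiver<ChatEvent> {
    let (tx, rx) = mpsc::channel::<ChatEvent>(32);
    tokio::spawn(async move {
        // ...drive chat_with_tools_stream here, forwarding text deltas and
        // tool events through `tx` as they arrive...
        let _ = tx.send(ChatEvent::Done).await; // terminal frame
        // training_messages would be persisted here, after the loop ends
    });
    rx
}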

@@ -7,7 +7,8 @@ use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
-use crate::ai::llm_client::LlmClient;
+use crate::ai::llm_client::{LlmClient, LlmStreamEvent};
+use futures::stream::{BoxStream, StreamExt};
// Re-export shared types so existing `crate::ai::ollama::{...}` imports
// continue to resolve.
@@ -634,6 +635,174 @@ Analyze the image and use specific details from both the visual content and the
        }
    }

    /// Streaming variant of `chat_with_tools`. Tries primary, then falls
    /// back if the initial connection fails; once the stream has begun
    /// emitting, mid-stream errors propagate to the caller. Emits
    /// `TextDelta` events as content tokens arrive and a single terminal
    /// `Done` event when the model marks the turn complete (tool_calls, if
    /// any, live on the final message).
    pub async fn chat_with_tools_stream(
        &self,
        messages: Vec<ChatMessage>,
        tools: Vec<Tool>,
    ) -> Result<BoxStream<'static, Result<LlmStreamEvent>>> {
        // Attempt primary. If it can't be opened at all, try fallback.
        match self
            .try_chat_with_tools_stream(&self.primary_url, messages.clone(), tools.clone())
            .await
        {
            Ok(s) => Ok(s),
            Err(e) => {
                if let Some(fallback_url) = self.fallback_url.clone() {
                    log::warn!(
                        "Streaming chat primary failed ({}); trying fallback {}",
                        e,
                        fallback_url
                    );
                    self.try_chat_with_tools_stream(&fallback_url, messages, tools)
                        .await
                } else {
                    Err(e)
                }
            }
        }
    }

    async fn try_chat_with_tools_stream(
        &self,
        base_url: &str,
        messages: Vec<ChatMessage>,
        tools: Vec<Tool>,
    ) -> Result<BoxStream<'static, Result<LlmStreamEvent>>> {
        let url = format!("{}/api/chat", base_url);
        let model = if base_url == self.primary_url {
            &self.primary_model
        } else {
            self.fallback_model
                .as_deref()
                .unwrap_or(&self.primary_model)
        };
        let options = self.build_options();
        let request_body = OllamaChatRequest {
            model,
            messages: &messages,
            stream: true,
            tools,
            options,
        };
        let response = self
            .client
            .post(&url)
            .json(&request_body)
            .send()
            .await
            .with_context(|| format!("Failed to connect to Ollama at {}", url))?;
        if !response.status().is_success() {
            let status = response.status();
            let body = response.text().await.unwrap_or_default();
            anyhow::bail!(
                "Ollama stream request failed with status {}: {}",
                status,
                body
            );
        }
        // Ollama streams NDJSON: each line is a full `OllamaStreamChunk`.
        // We buffer partial lines across chunks from the byte stream.
        let byte_stream = response.bytes_stream();
        let stream = async_stream::stream! {
            let mut buf: Vec<u8> = Vec::new();
            let mut accumulated = String::new();
            let mut tool_calls: Option<Vec<crate::ai::llm_client::ToolCall>> = None;
            let mut role = "assistant".to_string();
            let mut prompt_eval_count: Option<i32> = None;
            let mut eval_count: Option<i32> = None;
            let mut prompt_eval_duration: Option<u64> = None;
            let mut eval_duration: Option<u64> = None;
            let mut done_seen = false;
            let mut byte_stream = byte_stream;
            while let Some(chunk) = byte_stream.next().await {
                let chunk = match chunk {
                    Ok(b) => b,
                    Err(e) => {
                        yield Err(anyhow::anyhow!("stream read failed: {}", e));
                        return;
                    }
                };
                buf.extend_from_slice(&chunk);
                // Drain complete lines; hold any trailing partial.
                while let Some(nl) = buf.iter().position(|b| *b == b'\n') {
                    let line = buf.drain(..=nl).collect::<Vec<_>>();
                    let line_str = match std::str::from_utf8(&line) {
                        Ok(s) => s.trim(),
                        Err(_) => continue,
                    };
                    if line_str.is_empty() {
                        continue;
                    }
                    match serde_json::from_str::<OllamaStreamChunk>(line_str) {
                        Ok(chunk) => {
                            // Accumulate content delta.
                            if !chunk.message.content.is_empty() {
                                accumulated.push_str(&chunk.message.content);
                                yield Ok(LlmStreamEvent::TextDelta(chunk.message.content));
                            }
                            if !chunk.message.role.is_empty() {
                                role = chunk.message.role;
                            }
                            // Ollama only attaches tool_calls on the final chunk.
                            if let Some(tcs) = chunk.message.tool_calls
                                && !tcs.is_empty()
                            {
                                tool_calls = Some(tcs);
                            }
                            if chunk.done {
                                prompt_eval_count = chunk.prompt_eval_count;
                                eval_count = chunk.eval_count;
                                prompt_eval_duration = chunk.prompt_eval_duration;
                                eval_duration = chunk.eval_duration;
                                done_seen = true;
                                break;
                            }
                        }
                        Err(e) => {
                            log::warn!("malformed Ollama stream line: {} ({})", line_str, e);
                        }
                    }
                }
                if done_seen {
                    break;
                }
            }
            // Emit the terminal Done event with the assembled message.
            log_chat_metrics(
                prompt_eval_count,
                prompt_eval_duration,
                eval_count,
                eval_duration,
            );
            let message = ChatMessage {
                role,
                content: accumulated,
                tool_calls,
                images: None,
            };
            yield Ok(LlmStreamEvent::Done {
                message,
                prompt_eval_count,
                eval_count,
            });
        };
        Ok(Box::pin(stream))
    }

    async fn try_chat_with_tools(
        &self,
        base_url: &str,
@@ -857,6 +1026,14 @@ impl LlmClient for OllamaClient {
        OllamaClient::chat_with_tools(self, messages, tools).await
    }

    async fn chat_with_tools_stream(
        &self,
        messages: Vec<ChatMessage>,
        tools: Vec<Tool>,
    ) -> Result<BoxStream<'static, Result<LlmStreamEvent>>> {
        OllamaClient::chat_with_tools_stream(self, messages, tools).await
    }

    async fn generate_embeddings(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>> {
        OllamaClient::generate_embeddings(self, texts).await
    }
@@ -936,6 +1113,35 @@ struct OllamaChatResponse {
    eval_duration: Option<u64>,
}

/// One chunk in the NDJSON stream from `/api/chat` with `stream: true`.
/// Early chunks carry content deltas in `message.content`; the final chunk
/// has `done: true`, optional `tool_calls`, and usage counters.
#[derive(Deserialize, Debug)]
struct OllamaStreamChunk {
    #[serde(default)]
    message: OllamaStreamMessage,
    #[serde(default)]
    done: bool,
    #[serde(default)]
    prompt_eval_count: Option<i32>,
    #[serde(default)]
    prompt_eval_duration: Option<u64>,
    #[serde(default)]
    eval_count: Option<i32>,
    #[serde(default)]
    eval_duration: Option<u64>,
}

#[derive(Deserialize, Debug, Default)]
struct OllamaStreamMessage {
    #[serde(default)]
    role: String,
    #[serde(default)]
    content: String,
    #[serde(default)]
    tool_calls: Option<Vec<crate::ai::llm_client::ToolCall>>,
}
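
For reference (not in the diff): a final NDJSON line as OllamaStreamChunk reads it; serde's default behavior ignores the extra fields Ollama also sends, such as model, created_at, and total_duration:

#[test]
fn parses_final_stream_chunk() {
    let line = r#"{"message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":42,"eval_count":128}"#;
    let chunk: OllamaStreamChunk = serde_json::from_str(line).unwrap();
    assert!(chunk.done);
    assert_eq!(chunk.eval_count, Some(128));
    assert!(chunk.message.tool_calls.is_none());
}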

#[derive(Deserialize)]
struct OllamaResponse {
    response: String,