feat(ai): streaming chat endpoint with live tool events
Add LlmClient::chat_with_tools_stream and an SSE endpoint, POST /insights/chat/stream, that emits text deltas, tool_call / tool_result pairs, a truncated notice, and a terminal done frame as the agentic loop runs.

- Ollama: parses NDJSON from the /api/chat stream, accumulates content deltas, and emits Done with tool_calls taken from the final chunk.
- OpenRouter: parses OpenAI-compatible SSE, reassembles tool_call argument deltas by index, and asks for stream_options.include_usage.
- InsightChatService: spawns the loop on a tokio task, feeds events through an mpsc channel, and persists training_messages at the end.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
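The event vocabulary above (text delta, tool_call, tool_result, truncated, done) maps naturally onto a tagged enum. A minimal sketch of what the frames could look like on the wire; the name ChatStreamFrame and every field below are illustrative assumptions, not the endpoint's actual serialization:

use serde::Serialize;

// Hypothetical frame shapes; names and fields are assumptions for illustration.
#[derive(Serialize)]
#[serde(tag = "type", rename_all = "snake_case")]
enum ChatStreamFrame {
    // Incremental assistant text as it is generated.
    TextDelta { text: String },
    // The agent decided to invoke a tool.
    ToolCall { call_id: String, name: String, arguments: serde_json::Value },
    // Output of that tool, paired by call_id.
    ToolResult { call_id: String, output: String },
    // The agentic loop was cut short (e.g. it hit its iteration limit).
    Truncated { reason: String },
    // Terminal frame with the assembled reply and token counts.
    Done { content: String, prompt_tokens: Option<i32>, completion_tokens: Option<i32> },
}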
src/ai/ollama.rs (208 lines changed)
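The diff below covers only src/ai/ollama.rs. The OpenRouter side described in the message (reassembling tool_call argument deltas by index from OpenAI-compatible SSE) lives in a separate file; a rough sketch of that accumulation pattern, with invented types:

use std::collections::BTreeMap;

// Illustrative accumulator; the real types in src/ai/openrouter.rs are not shown here.
#[derive(Default)]
struct PartialToolCall {
    id: String,
    name: String,
    arguments: String, // JSON text, concatenated fragment by fragment
}

// Apply one streamed delta. OpenAI-compatible streams key tool-call deltas by
// `index`; the id and function name usually arrive once, while arguments
// arrive as partial JSON strings that must be appended in order.
fn apply_tool_call_delta(
    acc: &mut BTreeMap<usize, PartialToolCall>,
    index: usize,
    id: Option<&str>,
    name: Option<&str>,
    arguments_fragment: Option<&str>,
) {
    let slot = acc.entry(index).or_default();
    if let Some(id) = id {
        slot.id = id.to_string();
    }
    if let Some(name) = name {
        slot.name.push_str(name);
    }
    if let Some(fragment) = arguments_fragment {
        slot.arguments.push_str(fragment);
    }
}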
@@ -7,7 +7,8 @@ use std::collections::HashMap;
 use std::sync::{Arc, Mutex};
 use std::time::{Duration, Instant};
 
-use crate::ai::llm_client::LlmClient;
+use crate::ai::llm_client::{LlmClient, LlmStreamEvent};
+use futures::stream::{BoxStream, StreamExt};
 
 // Re-export shared types so existing `crate::ai::ollama::{...}` imports
 // continue to resolve.
@@ -634,6 +635,174 @@ Analyze the image and use specific details from both the visual content and the
         }
     }
 
+    /// Streaming variant of `chat_with_tools`. Tries primary, then falls
+    /// back if the initial connection fails; once the stream has begun
+    /// emitting, mid-stream errors propagate to the caller. Emits
+    /// `TextDelta` events as content tokens arrive and a single terminal
+    /// `Done` event when the model marks the turn complete (tool_calls, if
+    /// any, live on the final message).
+    pub async fn chat_with_tools_stream(
+        &self,
+        messages: Vec<ChatMessage>,
+        tools: Vec<Tool>,
+    ) -> Result<BoxStream<'static, Result<LlmStreamEvent>>> {
+        // Attempt primary. If it can't be opened at all, try fallback.
+        match self
+            .try_chat_with_tools_stream(&self.primary_url, messages.clone(), tools.clone())
+            .await
+        {
+            Ok(s) => Ok(s),
+            Err(e) => {
+                if let Some(fallback_url) = self.fallback_url.clone() {
+                    log::warn!(
+                        "Streaming chat primary failed ({}); trying fallback {}",
+                        e,
+                        fallback_url
+                    );
+                    self.try_chat_with_tools_stream(&fallback_url, messages, tools)
+                        .await
+                } else {
+                    Err(e)
+                }
+            }
+        }
+    }
+
+    async fn try_chat_with_tools_stream(
+        &self,
+        base_url: &str,
+        messages: Vec<ChatMessage>,
+        tools: Vec<Tool>,
+    ) -> Result<BoxStream<'static, Result<LlmStreamEvent>>> {
+        let url = format!("{}/api/chat", base_url);
+        let model = if base_url == self.primary_url {
+            &self.primary_model
+        } else {
+            self.fallback_model
+                .as_deref()
+                .unwrap_or(&self.primary_model)
+        };
+        let options = self.build_options();
+
+        let request_body = OllamaChatRequest {
+            model,
+            messages: &messages,
+            stream: true,
+            tools,
+            options,
+        };
+
+        let response = self
+            .client
+            .post(&url)
+            .json(&request_body)
+            .send()
+            .await
+            .with_context(|| format!("Failed to connect to Ollama at {}", url))?;
+
+        if !response.status().is_success() {
+            let status = response.status();
+            let body = response.text().await.unwrap_or_default();
+            anyhow::bail!(
+                "Ollama stream request failed with status {}: {}",
+                status,
+                body
+            );
+        }
+
+        // Ollama streams NDJSON: each line is a full `OllamaStreamChunk`.
+        // We buffer partial lines across chunks from the byte stream.
+        let byte_stream = response.bytes_stream();
+        let stream = async_stream::stream! {
+            let mut buf: Vec<u8> = Vec::new();
+            let mut accumulated = String::new();
+            let mut tool_calls: Option<Vec<crate::ai::llm_client::ToolCall>> = None;
+            let mut role = "assistant".to_string();
+            let mut prompt_eval_count: Option<i32> = None;
+            let mut eval_count: Option<i32> = None;
+            let mut prompt_eval_duration: Option<u64> = None;
+            let mut eval_duration: Option<u64> = None;
+            let mut done_seen = false;
+
+            let mut byte_stream = byte_stream;
+            while let Some(chunk) = byte_stream.next().await {
+                let chunk = match chunk {
+                    Ok(b) => b,
+                    Err(e) => {
+                        yield Err(anyhow::anyhow!("stream read failed: {}", e));
+                        return;
+                    }
+                };
+                buf.extend_from_slice(&chunk);
+
+                // Drain complete lines; hold any trailing partial.
+                while let Some(nl) = buf.iter().position(|b| *b == b'\n') {
+                    let line = buf.drain(..=nl).collect::<Vec<_>>();
+                    let line_str = match std::str::from_utf8(&line) {
+                        Ok(s) => s.trim(),
+                        Err(_) => continue,
+                    };
+                    if line_str.is_empty() {
+                        continue;
+                    }
+                    match serde_json::from_str::<OllamaStreamChunk>(line_str) {
+                        Ok(chunk) => {
+                            // Accumulate content delta.
+                            if !chunk.message.content.is_empty() {
+                                accumulated.push_str(&chunk.message.content);
+                                yield Ok(LlmStreamEvent::TextDelta(chunk.message.content));
+                            }
+                            if !chunk.message.role.is_empty() {
+                                role = chunk.message.role;
+                            }
+                            // Ollama only attaches tool_calls on the final chunk.
+                            if let Some(tcs) = chunk.message.tool_calls
+                                && !tcs.is_empty()
+                            {
+                                tool_calls = Some(tcs);
+                            }
+                            if chunk.done {
+                                prompt_eval_count = chunk.prompt_eval_count;
+                                eval_count = chunk.eval_count;
+                                prompt_eval_duration = chunk.prompt_eval_duration;
+                                eval_duration = chunk.eval_duration;
+                                done_seen = true;
+                                break;
+                            }
+                        }
+                        Err(e) => {
+                            log::warn!("malformed Ollama stream line: {} ({})", line_str, e);
+                        }
+                    }
+                }
+                if done_seen {
+                    break;
+                }
+            }
+
+            // Emit the terminal Done event with the assembled message.
+            log_chat_metrics(
+                prompt_eval_count,
+                prompt_eval_duration,
+                eval_count,
+                eval_duration,
+            );
+            let message = ChatMessage {
+                role,
+                content: accumulated,
+                tool_calls,
+                images: None,
+            };
+            yield Ok(LlmStreamEvent::Done {
+                message,
+                prompt_eval_count,
+                eval_count,
+            });
+        };
+
+        Ok(Box::pin(stream))
+    }
+
     async fn try_chat_with_tools(
         &self,
         base_url: &str,
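A minimal sketch of how a caller could drive the stream returned above; it assumes a constructed OllamaClient plus messages/tools, and that LlmStreamEvent has only the TextDelta and Done variants used in this file:

use futures::StreamExt;

// Sketch only, not part of the diff: prints deltas as they arrive and stops
// at the terminal Done frame.
async fn print_streamed_reply(
    client: &OllamaClient,
    messages: Vec<ChatMessage>,
    tools: Vec<Tool>,
) -> anyhow::Result<()> {
    let mut stream = client.chat_with_tools_stream(messages, tools).await?;
    while let Some(event) = stream.next().await {
        match event? {
            LlmStreamEvent::TextDelta(delta) => print!("{delta}"),
            LlmStreamEvent::Done { message, .. } => {
                // tool_calls, if any, arrive on the final assembled message.
                if let Some(calls) = &message.tool_calls {
                    println!("\nmodel requested {} tool call(s)", calls.len());
                }
                break;
            }
            // Ignore any other variants the enum may define elsewhere.
            _ => {}
        }
    }
    Ok(())
}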
@@ -857,6 +1026,14 @@ impl LlmClient for OllamaClient {
         OllamaClient::chat_with_tools(self, messages, tools).await
     }
 
+    async fn chat_with_tools_stream(
+        &self,
+        messages: Vec<ChatMessage>,
+        tools: Vec<Tool>,
+    ) -> Result<BoxStream<'static, Result<LlmStreamEvent>>> {
+        OllamaClient::chat_with_tools_stream(self, messages, tools).await
+    }
+
     async fn generate_embeddings(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>> {
         OllamaClient::generate_embeddings(self, texts).await
     }
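The InsightChatService wiring mentioned in the commit message (spawning the agentic loop on a tokio task and forwarding events over an mpsc channel) is also outside this file; a rough sketch of that pattern, reusing the illustrative ChatStreamFrame enum from above and an invented loop name, not the actual service API:

use tokio::sync::mpsc;

// Sketch only: the handler drains the receiver and writes each frame out as
// an SSE event; training_messages would be persisted once the loop finishes.
fn spawn_chat_stream() -> mpsc::Receiver<ChatStreamFrame> {
    let (tx, rx) = mpsc::channel(64);
    tokio::spawn(async move {
        // The agentic loop would push TextDelta / ToolCall / ToolResult frames
        // through `tx` as it runs, e.g. run_agentic_loop(tx.clone()).await;
        let _ = tx
            .send(ChatStreamFrame::Done {
                content: String::new(),
                prompt_tokens: None,
                completion_tokens: None,
            })
            .await;
    });
    rx
}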
@@ -936,6 +1113,35 @@ struct OllamaChatResponse {
     eval_duration: Option<u64>,
 }
 
+/// One chunk in the NDJSON stream from `/api/chat` with `stream: true`.
+/// Early chunks carry content deltas in `message.content`; the final chunk
+/// has `done: true`, optional `tool_calls`, and usage counters.
+#[derive(Deserialize, Debug)]
+struct OllamaStreamChunk {
+    #[serde(default)]
+    message: OllamaStreamMessage,
+    #[serde(default)]
+    done: bool,
+    #[serde(default)]
+    prompt_eval_count: Option<i32>,
+    #[serde(default)]
+    prompt_eval_duration: Option<u64>,
+    #[serde(default)]
+    eval_count: Option<i32>,
+    #[serde(default)]
+    eval_duration: Option<u64>,
+}
+
+#[derive(Deserialize, Debug, Default)]
+struct OllamaStreamMessage {
+    #[serde(default)]
+    role: String,
+    #[serde(default)]
+    content: String,
+    #[serde(default)]
+    tool_calls: Option<Vec<crate::ai::llm_client::ToolCall>>,
+}
+
 #[derive(Deserialize)]
 struct OllamaResponse {
     response: String,
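For orientation, a small sketch of NDJSON lines shaped the way OllamaStreamChunk deserializes them; the values are illustrative, not captured Ollama output, and it assumes the structs above are in scope:

// Sketch only: a streaming delta line followed by a final done line.
fn ndjson_shape_example() -> anyhow::Result<()> {
    let delta_line = r#"{"message":{"role":"assistant","content":"Hel"},"done":false}"#;
    let final_line =
        r#"{"message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":42,"eval_count":17}"#;
    for line in [delta_line, final_line] {
        let chunk: OllamaStreamChunk = serde_json::from_str(line)?;
        println!("content={:?} done={}", chunk.message.content, chunk.done);
    }
    Ok(())
}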