feat(ai): streaming chat endpoint with live tool events

Add LlmClient::chat_with_tools_stream and SSE endpoint
POST /insights/chat/stream that emits text deltas, tool_call /
tool_result pairs, a truncated notice, and a terminal done frame as the
agentic loop runs.

- Ollama: parses NDJSON from /api/chat stream, accumulates content
  deltas, emits Done with tool_calls from the final chunk.
- OpenRouter: parses OpenAI-compatible SSE, reassembles tool_call
  argument deltas by index, asks for stream_options.include_usage.
- InsightChatService spawns the loop on a tokio task, feeds events
  through an mpsc channel, persists training_messages at the end.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
commit 079cd4c5b9 (parent c2bd3c08e1)
Author: Cameron
Date:   2026-04-21 16:57:41 -04:00

9 changed files with 1071 additions and 9 deletions
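
The OpenRouter client is among the changed files but is not shown in this view. A minimal sketch of the by-index reassembly the second bullet describes, with hypothetical names (PartialToolCall, apply_tool_call_delta), assuming OpenAI-style deltas where tool-call arguments arrive as string fragments:

// Hypothetical shapes; the real ones live in the OpenRouter client.
#[derive(Default)]
struct PartialToolCall {
    id: String,
    name: String,
    arguments: String, // JSON accumulates as raw string fragments
}

// Called once per SSE delta. `index` says which in-flight call a fragment
// belongs to, since several tool calls can interleave in one stream.
fn apply_tool_call_delta(
    partials: &mut Vec<PartialToolCall>,
    index: usize,
    id: Option<&str>,
    name: Option<&str>,
    args_fragment: Option<&str>,
) {
    if partials.len() <= index {
        partials.resize_with(index + 1, PartialToolCall::default);
    }
    let call = &mut partials[index];
    if let Some(id) = id {
        call.id = id.to_string();
    }
    if let Some(name) = name {
        call.name = name.to_string();
    }
    if let Some(frag) = args_fragment {
        call.arguments.push_str(frag);
    }
}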

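Likewise, InsightChatService itself is not shown below; a rough sketch of the spawn-plus-channel wiring from the third bullet, with a hypothetical ChatEvent type standing in for the real frames:

use tokio::sync::mpsc;

// Hypothetical event type mirroring the frames named in the subject line.
enum ChatEvent {
    TextDelta(String),
    ToolCall(String),
    ToolResult(String),
    Truncated,
    Done,
}

// The agentic loop runs on its own task; the HTTP handler just drains the
// receiver and writes each event out as an SSE frame.
fn spawn_chat_loop() -> mpsc::Receiver<ChatEvent> {
    let (tx, rx) = mpsc::channel::<ChatEvent>(32);
    tokio::spawn(async move {
        // ...drive chat_with_tools_stream here, forwarding text deltas and
        // tool events through `tx` as they arrive...
        let _ = tx.send(ChatEvent::Done).await; // terminal frame
        // training_messages would be persisted here, after the loop ends
    });
    rx
}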

@@ -7,7 +7,8 @@ use std::collections::HashMap;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
-use crate::ai::llm_client::LlmClient;
+use crate::ai::llm_client::{LlmClient, LlmStreamEvent};
+use futures::stream::{BoxStream, StreamExt};
// Re-export shared types so existing `crate::ai::ollama::{...}` imports
// continue to resolve.
@@ -634,6 +635,174 @@ Analyze the image and use specific details from both the visual content and the
        }
    }

    /// Streaming variant of `chat_with_tools`. Tries primary, then falls
    /// back if the initial connection fails; once the stream has begun
    /// emitting, mid-stream errors propagate to the caller. Emits
    /// `TextDelta` events as content tokens arrive and a single terminal
    /// `Done` event when the model marks the turn complete (tool_calls, if
    /// any, live on the final message).
    pub async fn chat_with_tools_stream(
        &self,
        messages: Vec<ChatMessage>,
        tools: Vec<Tool>,
    ) -> Result<BoxStream<'static, Result<LlmStreamEvent>>> {
        // Attempt primary. If it can't be opened at all, try fallback.
        match self
            .try_chat_with_tools_stream(&self.primary_url, messages.clone(), tools.clone())
            .await
        {
            Ok(s) => Ok(s),
            Err(e) => {
                if let Some(fallback_url) = self.fallback_url.clone() {
                    log::warn!(
                        "Streaming chat primary failed ({}); trying fallback {}",
                        e,
                        fallback_url
                    );
                    self.try_chat_with_tools_stream(&fallback_url, messages, tools)
                        .await
                } else {
                    Err(e)
                }
            }
        }
    }

    async fn try_chat_with_tools_stream(
        &self,
        base_url: &str,
        messages: Vec<ChatMessage>,
        tools: Vec<Tool>,
    ) -> Result<BoxStream<'static, Result<LlmStreamEvent>>> {
        let url = format!("{}/api/chat", base_url);
        let model = if base_url == self.primary_url {
            &self.primary_model
        } else {
            self.fallback_model
                .as_deref()
                .unwrap_or(&self.primary_model)
        };
        let options = self.build_options();
        let request_body = OllamaChatRequest {
            model,
            messages: &messages,
            stream: true,
            tools,
            options,
        };
        let response = self
            .client
            .post(&url)
            .json(&request_body)
            .send()
            .await
            .with_context(|| format!("Failed to connect to Ollama at {}", url))?;
        if !response.status().is_success() {
            let status = response.status();
            let body = response.text().await.unwrap_or_default();
            anyhow::bail!(
                "Ollama stream request failed with status {}: {}",
                status,
                body
            );
        }
        // Ollama streams NDJSON: each line is a full `OllamaStreamChunk`.
        // We buffer partial lines across chunks from the byte stream.
        let byte_stream = response.bytes_stream();
        let stream = async_stream::stream! {
            let mut buf: Vec<u8> = Vec::new();
            let mut accumulated = String::new();
            let mut tool_calls: Option<Vec<crate::ai::llm_client::ToolCall>> = None;
            let mut role = "assistant".to_string();
            let mut prompt_eval_count: Option<i32> = None;
            let mut eval_count: Option<i32> = None;
            let mut prompt_eval_duration: Option<u64> = None;
            let mut eval_duration: Option<u64> = None;
            let mut done_seen = false;
            let mut byte_stream = byte_stream;
            while let Some(chunk) = byte_stream.next().await {
                let chunk = match chunk {
                    Ok(b) => b,
                    Err(e) => {
                        yield Err(anyhow::anyhow!("stream read failed: {}", e));
                        return;
                    }
                };
                buf.extend_from_slice(&chunk);
                // Drain complete lines; hold any trailing partial.
                while let Some(nl) = buf.iter().position(|b| *b == b'\n') {
                    let line = buf.drain(..=nl).collect::<Vec<_>>();
                    let line_str = match std::str::from_utf8(&line) {
                        Ok(s) => s.trim(),
                        Err(_) => continue,
                    };
                    if line_str.is_empty() {
                        continue;
                    }
                    match serde_json::from_str::<OllamaStreamChunk>(line_str) {
                        Ok(chunk) => {
                            // Accumulate content delta.
                            if !chunk.message.content.is_empty() {
                                accumulated.push_str(&chunk.message.content);
                                yield Ok(LlmStreamEvent::TextDelta(chunk.message.content));
                            }
                            if !chunk.message.role.is_empty() {
                                role = chunk.message.role;
                            }
                            // Ollama only attaches tool_calls on the final chunk.
                            if let Some(tcs) = chunk.message.tool_calls
                                && !tcs.is_empty()
                            {
                                tool_calls = Some(tcs);
                            }
                            if chunk.done {
                                prompt_eval_count = chunk.prompt_eval_count;
                                eval_count = chunk.eval_count;
                                prompt_eval_duration = chunk.prompt_eval_duration;
                                eval_duration = chunk.eval_duration;
                                done_seen = true;
                                break;
                            }
                        }
                        Err(e) => {
                            log::warn!("malformed Ollama stream line: {} ({})", line_str, e);
                        }
                    }
                }
                if done_seen {
                    break;
                }
            }
            // Emit the terminal Done event with the assembled message.
            log_chat_metrics(
                prompt_eval_count,
                prompt_eval_duration,
                eval_count,
                eval_duration,
            );
            let message = ChatMessage {
                role,
                content: accumulated,
                tool_calls,
                images: None,
            };
            yield Ok(LlmStreamEvent::Done {
                message,
                prompt_eval_count,
                eval_count,
            });
        };
        Ok(Box::pin(stream))
    }

    async fn try_chat_with_tools(
        &self,
        base_url: &str,
@@ -857,6 +1026,14 @@ impl LlmClient for OllamaClient {
        OllamaClient::chat_with_tools(self, messages, tools).await
    }

    async fn chat_with_tools_stream(
        &self,
        messages: Vec<ChatMessage>,
        tools: Vec<Tool>,
    ) -> Result<BoxStream<'static, Result<LlmStreamEvent>>> {
        OllamaClient::chat_with_tools_stream(self, messages, tools).await
    }

    async fn generate_embeddings(&self, texts: &[&str]) -> Result<Vec<Vec<f32>>> {
        OllamaClient::generate_embeddings(self, texts).await
    }
@@ -936,6 +1113,35 @@ struct OllamaChatResponse {
    eval_duration: Option<u64>,
}

/// One chunk in the NDJSON stream from `/api/chat` with `stream: true`.
/// Early chunks carry content deltas in `message.content`; the final chunk
/// has `done: true`, optional `tool_calls`, and usage counters.
#[derive(Deserialize, Debug)]
struct OllamaStreamChunk {
    #[serde(default)]
    message: OllamaStreamMessage,
    #[serde(default)]
    done: bool,
    #[serde(default)]
    prompt_eval_count: Option<i32>,
    #[serde(default)]
    prompt_eval_duration: Option<u64>,
    #[serde(default)]
    eval_count: Option<i32>,
    #[serde(default)]
    eval_duration: Option<u64>,
}

#[derive(Deserialize, Debug, Default)]
struct OllamaStreamMessage {
    #[serde(default)]
    role: String,
    #[serde(default)]
    content: String,
    #[serde(default)]
    tool_calls: Option<Vec<crate::ai::llm_client::ToolCall>>,
}
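
For reference (not in the diff): a final NDJSON line as OllamaStreamChunk reads it; serde's default behavior ignores the extra fields Ollama also sends, such as model, created_at, and total_duration:

#[test]
fn parses_final_stream_chunk() {
    let line = r#"{"message":{"role":"assistant","content":""},"done":true,"prompt_eval_count":42,"eval_count":128}"#;
    let chunk: OllamaStreamChunk = serde_json::from_str(line).unwrap();
    assert!(chunk.done);
    assert_eq!(chunk.eval_count, Some(128));
    assert!(chunk.message.tool_calls.is_none());
}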

#[derive(Deserialize)]
struct OllamaResponse {
    response: String,