refactor: runner owns stream routing, suppress tool call XML from display
Split the streaming pipeline: API backends yield StreamEvents through a channel, the runner reads them and routes to the appropriate UI pane.

- Add StreamEvent enum (Content, Reasoning, ToolCallDelta, etc.)
- API start_stream() spawns backend as a task, returns event receiver
- Runner loops over events, sends content to conversation pane but suppresses <tool_call> XML with a buffered tail for partial tags
- OpenAI backend refactored to stream_events() — no more UI coupling
- Anthropic backend gets a wrapper that synthesizes events from the existing stream() (TODO: native event streaming)
- chat_completion_stream() kept for subconscious agents, reimplemented on top of the event stream
- Usage derives Clone

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
parent
912626c5f0
commit
13453606ae
6 changed files with 338 additions and 114 deletions
|
|
@ -18,8 +18,41 @@ use anyhow::Result;
|
|||
use reqwest::Client;
|
||||
use std::time::{Duration, Instant};
|
||||
|
||||
use tokio::sync::mpsc;
|
||||
|
||||
use crate::agent::types::*;
|
||||
use crate::agent::ui_channel::{StreamTarget, UiMessage, UiSender};
|
||||
use crate::agent::ui_channel::{UiMessage, UiSender};
|
||||
|
||||
// ─────────────────────────────────────────────────────────────
// Stream events — yielded by backends, consumed by the runner
// ─────────────────────────────────────────────────────────────

/// Events produced by the streaming API backends.
/// The runner reads these and decides what to display where.
///
/// Backends push these through an unbounded channel (see
/// `ApiClient::start_stream`); the consumer owns all UI routing,
/// so backends stay free of display concerns.
pub enum StreamEvent {
    /// Content token from the model's response.
    Content(String),
    /// Reasoning/thinking token (internal monologue).
    Reasoning(String),
    /// Incremental tool call delta (structured, from APIs that support it).
    ///
    /// Fields mirror the OpenAI-style delta format: every field except
    /// `index` is optional, and `arguments` fragments are meant to be
    /// concatenated by the consumer as they arrive.
    ToolCallDelta {
        // Position of the tool call this delta belongs to; deltas for a
        // given index may arrive before earlier indices are complete.
        index: usize,
        id: Option<String>,
        call_type: Option<String>,
        name: Option<String>,
        // Partial JSON argument text; append to the accumulated string.
        arguments: Option<String>,
    },
    /// Token usage stats.
    Usage(Usage),
    /// Stream finished.
    Finished {
        // Backend-reported finish reason (e.g. "stop", "tool_calls", "error").
        reason: String,
        prompt_tokens: u32,
        completion_tokens: u32,
    },
    /// Error from the stream.
    Error(String),
}
|
||||
|
||||
enum Backend {
|
||||
OpenAi {
|
||||
|
|
@ -58,20 +91,71 @@ impl ApiClient {
|
|||
}
|
||||
}
|
||||
|
||||
/// Start a streaming chat completion. Returns a receiver of StreamEvents.
|
||||
/// The caller (runner) reads events and handles routing to the UI.
|
||||
///
|
||||
/// The old `chat_completion_stream` method is kept for the subconscious
|
||||
/// agents which don't need fine-grained stream control.
|
||||
pub fn start_stream(
|
||||
&self,
|
||||
messages: &[Message],
|
||||
tools: Option<&[ToolDef]>,
|
||||
ui_tx: &UiSender,
|
||||
reasoning_effort: &str,
|
||||
temperature: Option<f32>,
|
||||
) -> mpsc::UnboundedReceiver<StreamEvent> {
|
||||
let (tx, rx) = mpsc::unbounded_channel();
|
||||
let client = self.client.clone();
|
||||
let api_key = self.api_key.clone();
|
||||
let model = self.model.clone();
|
||||
let messages = messages.to_vec();
|
||||
let tools = tools.map(|t| t.to_vec());
|
||||
let ui_tx = ui_tx.clone();
|
||||
let reasoning_effort = reasoning_effort.to_string();
|
||||
let backend = match &self.backend {
|
||||
Backend::OpenAi { base_url } => Backend::OpenAi { base_url: base_url.clone() },
|
||||
Backend::Anthropic => Backend::Anthropic,
|
||||
};
|
||||
|
||||
tokio::spawn(async move {
|
||||
let result = match &backend {
|
||||
Backend::OpenAi { base_url } => {
|
||||
openai::stream_events(
|
||||
&client, base_url, &api_key, &model,
|
||||
&messages, tools.as_deref(), &tx, &ui_tx,
|
||||
&reasoning_effort, temperature,
|
||||
).await
|
||||
}
|
||||
Backend::Anthropic => {
|
||||
// Anthropic backend still uses the old path for now —
|
||||
// wrap it by calling the old stream() and synthesizing events.
|
||||
anthropic::stream_events(
|
||||
&client, &api_key, &model,
|
||||
&messages, tools.as_deref(), &tx, &ui_tx,
|
||||
&reasoning_effort,
|
||||
).await
|
||||
}
|
||||
};
|
||||
if let Err(e) = result {
|
||||
let _ = tx.send(StreamEvent::Error(e.to_string()));
|
||||
}
|
||||
});
|
||||
|
||||
rx
|
||||
}
|
||||
|
||||
    /// Streaming chat completion. Returns the assembled response message
    /// plus optional usage stats. Text tokens stream through the UI channel.
    ///
    /// Empty response handling is done at the agent level (agent.rs)
    /// where the conversation can be modified between retries.
    /// Used by subconscious agents that don't need per-token routing.
    pub async fn chat_completion_stream(
        &self,
        messages: &[Message],
        tools: Option<&[ToolDef]>,
        ui_tx: &UiSender,
        reasoning_effort: &str,
    ) -> Result<(Message, Option<Usage>)> {
        // Thin wrapper: delegate with no temperature override.
        // NOTE(review): reconstructed from a diff view — the `target:
        // StreamTarget` parameter appears to have been removed along with
        // the StreamTarget import; confirm against callers.
        self.chat_completion_stream_temp(messages, tools, ui_tx, reasoning_effort, None).await
    }
|
||||
|
||||
    /// Like `chat_completion_stream`, but with an explicit temperature
    /// override. Consumes the event stream from `start_stream` and
    /// accumulates it into a single response message.
    ///
    /// Returns the assembled message plus usage stats when the backend
    /// reported them. Bails on an in-stream error event or an "error"
    /// finish reason.
    pub async fn chat_completion_stream_temp(
        &self,
        messages: &[Message],
        tools: Option<&[ToolDef]>,
        ui_tx: &UiSender,
        reasoning_effort: &str,
        temperature: Option<f32>,
    ) -> Result<(Message, Option<Usage>)> {
        // Use the event stream and accumulate into a message.
        let mut rx = self.start_stream(messages, tools, ui_tx, reasoning_effort, temperature);
        let mut content = String::new();
        let mut tool_calls: Vec<ToolCall> = Vec::new();
        let mut usage = None;
        let mut finish_reason = None;

        while let Some(event) = rx.recv().await {
            match event {
                StreamEvent::Content(text) => content.push_str(&text),
                // Reasoning tokens are intentionally dropped here — callers
                // of this method don't display the model's internal monologue.
                StreamEvent::Reasoning(_) => {}
                StreamEvent::ToolCallDelta { index, id, call_type, name, arguments } => {
                    // Deltas can reference an index we haven't seen yet;
                    // grow the vec with empty placeholders up to that index.
                    while tool_calls.len() <= index {
                        tool_calls.push(ToolCall {
                            id: String::new(),
                            call_type: "function".to_string(),
                            function: FunctionCall { name: String::new(), arguments: String::new() },
                        });
                    }
                    // id/type/name arrive whole and replace; argument text
                    // arrives fragmented and is appended.
                    if let Some(id) = id { tool_calls[index].id = id; }
                    if let Some(ct) = call_type { tool_calls[index].call_type = ct; }
                    if let Some(n) = name { tool_calls[index].function.name = n; }
                    if let Some(a) = arguments { tool_calls[index].function.arguments.push_str(&a); }
                }
                StreamEvent::Usage(u) => usage = Some(u),
                // Token counts on Finished are ignored here; Usage events
                // carry the stats this method reports.
                StreamEvent::Finished { reason, .. } => {
                    finish_reason = Some(reason);
                    break;
                }
                StreamEvent::Error(e) => anyhow::bail!("{}", e),
            }
        }

        // A "error" finish reason means the stream ended abnormally; use any
        // accumulated content as the error detail.
        if finish_reason.as_deref() == Some("error") {
            let detail = if content.is_empty() { "no details".into() } else { content };
            anyhow::bail!("model stream error: {}", detail);
        }

        Ok((build_response_message(content, tool_calls), usage))
    }
|
||||
|
||||
/// Return a label for the active backend, used in startup info.
|
||||
|
|
@ -325,7 +433,7 @@ impl SseReader {
|
|||
/// from models that emit tool calls as text), parse them out and
|
||||
/// promote them to structured tool_calls. This way all consumers
|
||||
/// see tool calls uniformly regardless of backend.
|
||||
pub(crate) fn build_response_message(
|
||||
pub fn build_response_message(
|
||||
content: String,
|
||||
tool_calls: Vec<ToolCall>,
|
||||
) -> Message {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue