WIP: Wiring context_new into agent — turn loop, StreamToken, dead code removal

Work in progress. New turn loop uses ResponseParser + StreamToken. Killed StreamEvent, append_streaming, finalize_streaming, streaming_index, assemble_api_messages, working_stack. Many methods still reference old types — fixing next. Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 14:55:10 -04:00 · 2026-04-08 14:55:10 -04:00 · 9c79d7a037
commit 9c79d7a037
parent 648356ae40
4 changed files with 202 additions and 424 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -360,6 +360,7 @@ dependencies = [
 "iana-time-zone",
 "js-sys",
 "num-traits",
+ "serde",
 "wasm-bindgen",
 "windows-link",
 ]
--- a/src/agent/api/mod.rs
+++ b/src/agent/api/mod.rs
@ -50,28 +50,10 @@ fn tools_to_json_str(tools: &[agent_tools::Tool]) -> String {
    format!("[{}]", inner.join(","))
 }

-/// Events produced by the streaming API backends.
-/// The runner reads these and decides what to display where.
-pub(crate) enum StreamEvent {
-    /// Content token from the model's response.
-    Content(String),
-    /// Reasoning/thinking token (internal monologue).
-    Reasoning(String),
-    /// Incremental tool call delta (structured, from APIs that support it).
-    ToolCallDelta {
-        index: usize,
-        id: Option<String>,
-        call_type: Option<String>,
-        name: Option<String>,
-        arguments: Option<String>,
-    },
-    /// Token usage stats.
-    Usage(Usage),
-    /// Stream finished.
-    Finished {
-        reason: String,
-    },
-    /// Error from the stream.
+/// One token from the streaming completions API.
+pub(crate) enum StreamToken {
+    Token { text: String, id: u32 },
+    Done { usage: Option<Usage> },
    Error(String),
 }

@ -133,14 +115,14 @@ impl ApiClient {
        (rx, AbortOnDrop(handle))
    }

-    /// Start a streaming completion with raw token IDs.
-    /// No message formatting — the caller provides the complete prompt as tokens.
-    pub(crate) fn start_stream_completions(
+    /// Stream a completion with raw token IDs.
+    /// Returns (text, token_id) per token via channel.
+    pub(crate) fn stream_completion(
        &self,
        prompt_tokens: &[u32],
        sampling: SamplingParams,
        priority: Option<i32>,
-    ) -> (mpsc::UnboundedReceiver<StreamEvent>, AbortOnDrop) {
+    ) -> (mpsc::UnboundedReceiver<StreamToken>, AbortOnDrop) {
        let (tx, rx) = mpsc::unbounded_channel();
        let client = self.client.clone();
        let api_key = self.api_key.clone();
@ -154,7 +136,7 @@ impl ApiClient {
                &prompt_tokens, &tx, sampling, priority,
            ).await;
            if let Err(e) = result {
-                let _ = tx.send(StreamEvent::Error(e.to_string()));
+                let _ = tx.send(StreamToken::Error(e.to_string()));
            }
        });

--- a/src/agent/api/openai.rs
+++ b/src/agent/api/openai.rs
@ -9,7 +9,7 @@ use tokio::sync::mpsc;

 use super::http::HttpClient;
 use super::types::*;
-use super::StreamEvent;
+use super::StreamToken;

 /// Stream SSE events from an OpenAI-compatible endpoint, sending
 /// parsed StreamEvents through the channel. The caller (runner)
@ -186,16 +186,16 @@ pub(super) async fn stream_events(
    Ok(())
 }

-/// Stream from the /v1/completions endpoint using raw token IDs.
-/// Tool calls come as text (<tool_call> tags) and are parsed by the caller.
-/// Thinking content comes as <think> tags and is split into Reasoning events.
+/// Stream from /v1/completions with raw token IDs in and out.
+/// Each SSE chunk yields one token (text + id). All parsing (think tags,
+/// tool calls) is handled by the ResponseParser, not here.
 pub(super) async fn stream_completions(
    client: &HttpClient,
    base_url: &str,
    api_key: &str,
    model: &str,
    prompt_tokens: &[u32],
-    tx: &mpsc::UnboundedSender<StreamEvent>,
+    tx: &mpsc::UnboundedSender<StreamToken>,
    sampling: super::SamplingParams,
    priority: Option<i32>,
 ) -> Result<()> {
@ -207,6 +207,8 @@ pub(super) async fn stream_completions(
        "top_p": sampling.top_p,
        "top_k": sampling.top_k,
        "stream": true,
+        "return_token_ids": true,
+        "skip_special_tokens": false,
        "stop_token_ids": [super::super::tokenizer::IM_END],
    });
    if let Some(p) = priority {
@ -229,20 +231,15 @@ pub(super) async fn stream_completions(

    let mut reader = super::SseReader::new();
    let mut content_len: usize = 0;
-    let mut first_content_at = None;
-    let mut finish_reason = None;
    let mut usage = None;
-    let mut in_think = false;

    while let Some(event) = reader.next_event(&mut response).await? {
        if let Some(err_msg) = event["error"]["message"].as_str() {
            anyhow::bail!("API error in stream: {}", err_msg);
        }

-        // Completions chunks have a simpler structure
        if let Some(u) = event["usage"].as_object() {
            if let Ok(u) = serde_json::from_value::<Usage>(serde_json::Value::Object(u.clone())) {
-                let _ = tx.send(StreamEvent::Usage(u.clone()));
                usage = Some(u);
            }
        }
@ -253,78 +250,27 @@ pub(super) async fn stream_completions(
        };

        for choice in choices {
-            if let Some(reason) = choice["finish_reason"].as_str() {
-                if reason != "null" {
-                    finish_reason = Some(reason.to_string());
-                }
-            }
+            let text = choice["text"].as_str().unwrap_or("");
+            let token_ids = choice["token_ids"].as_array();

-            if let Some(text) = choice["text"].as_str() {
-                if text.is_empty() { continue; }
-
-                // Handle <think> tags — split into Reasoning vs Content
-                if text.contains("<think>") || in_think {
-                    // Simple state machine for think tags
-                    let mut remaining = text;
-                    while !remaining.is_empty() {
-                        if in_think {
-                            if let Some(end) = remaining.find("</think>") {
-                                let thinking = &remaining[..end];
-                                if !thinking.is_empty() {
-                                    let _ = tx.send(StreamEvent::Reasoning(thinking.to_string()));
-                                }
-                                remaining = &remaining[end + 8..];
-                                in_think = false;
-                            } else {
-                                let _ = tx.send(StreamEvent::Reasoning(remaining.to_string()));
-                                break;
-                            }
-                        } else {
-                            if let Some(start) = remaining.find("<think>") {
-                                let content = &remaining[..start];
-                                if !content.is_empty() {
-                                    content_len += content.len();
-                                    if first_content_at.is_none() {
-                                        first_content_at = Some(reader.stream_start.elapsed());
-                                    }
-                                    let _ = tx.send(StreamEvent::Content(content.to_string()));
-                                }
-                                remaining = &remaining[start + 7..];
-                                in_think = true;
-                            } else {
-                                content_len += remaining.len();
-                                if first_content_at.is_none() {
-                                    first_content_at = Some(reader.stream_start.elapsed());
-                                }
-                                let _ = tx.send(StreamEvent::Content(remaining.to_string()));
-                                break;
-                            }
-                        }
-                    }
-                } else {
+            if let Some(ids) = token_ids {
+                for (i, id_val) in ids.iter().enumerate() {
+                    if let Some(id) = id_val.as_u64() {
                        content_len += text.len();
-                    if first_content_at.is_none() {
-                        first_content_at = Some(reader.stream_start.elapsed());
+                        let _ = tx.send(StreamToken::Token {
+                            text: if i == 0 { text.to_string() } else { String::new() },
+                            id: id as u32,
+                        });
                    }
-                    let _ = tx.send(StreamEvent::Content(text.to_string()));
                }
+            } else if !text.is_empty() {
+                // Fallback: text without token IDs (shouldn't happen with return_token_ids)
+                content_len += text.len();
+                let _ = tx.send(StreamToken::Token { text: text.to_string(), id: 0 });
            }
        }
    }

-    let total_elapsed = reader.stream_start.elapsed();
-    super::log_diagnostics(
-        content_len, 0, 0, "none",
-        &finish_reason,
-        reader.chunks_received,
-        reader.sse_lines_parsed,
-        reader.sse_parse_errors,
-        0, total_elapsed, first_content_at,
-        &usage, &[],
-    );
-
-    let reason = finish_reason.unwrap_or_default();
-    let _ = tx.send(StreamEvent::Finished { reason });
-
+    let _ = tx.send(StreamToken::Done { usage });
    Ok(())
 }
--- a/src/agent/mod.rs
+++ b/src/agent/mod.rs
@ -23,13 +23,11 @@ pub mod tools;
 use std::sync::Arc;
 use anyhow::Result;

-use api::{ApiClient, ToolCall};
-use api::{ContentPart, Message, MessageContent, Role};
-use context::{ConversationEntry, ContextEntry, ContextState};
+use api::ApiClient;
+use context_new::{AstNode, NodeBody, ContextState, Section, Ast, PendingToolCall, ResponseParser, Role};
 use tools::{summarize_args, working_stack};

 use crate::mind::log::ConversationLog;
-use crate::agent::context::ContextSection;

 // --- Activity tracking (RAII guards) ---

@ -163,7 +161,6 @@ pub struct Agent {
    pub provenance: String,
    /// Persistent conversation log — append-only record of all messages.
    pub conversation_log: Option<ConversationLog>,
-    /// Mutable context state — personality, working stack, etc.
    pub context: ContextState,
    /// App config — used to reload identity on compaction and model switching.
    pub app_config: crate::config::AppConfig,
@ -190,11 +187,9 @@ impl Agent {
        conversation_log: Option<ConversationLog>,
        active_tools: tools::SharedActiveTools,
    ) -> Self {
-        let mut system = ContextSection::new("System prompt");
-        system.push(ContextEntry::new(
-            ConversationEntry::System(Message::system(&system_prompt)), None));
+        let mut context = ContextState::new();
+        context.push(Section::System, AstNode::system_msg(&system_prompt));

-        // Tool definitions — part of the context, tokenized and scored
        let tool_defs: Vec<String> = tools::tools().iter()
            .map(|t| t.to_json()).collect();
        if !tool_defs.is_empty() {
@ -207,22 +202,12 @@ impl Agent {
                IMPORTANT: Function calls MUST follow the specified format.",
                tool_defs.join("\n"),
            );
-            system.push(ContextEntry::new(
-                ConversationEntry::System(Message::system(&tools_text)), None));
+            context.push(Section::System, AstNode::system_msg(&tools_text));
        }

-        let mut identity = ContextSection::new("Identity");
-        for (_name, content) in &personality {
-            identity.push(ContextEntry::new(
-                ConversationEntry::Message(Message::user(content)), None));
+        for (name, content) in &personality {
+            context.push(Section::Identity, AstNode::memory(name, content));
        }
-        let context = ContextState {
-            system,
-            identity,
-            journal: ContextSection::new("Journal"),
-            conversation: ContextSection::new("Conversation"),
-            working_stack: Vec::new(),
-        };
        let session_id = format!("consciousness-{}", chrono::Utc::now().format("%Y%m%d-%H%M%S"));
        let mut agent = Self {
            client,
@ -250,7 +235,6 @@ impl Agent {
        };

        agent.load_startup_journal();
-        agent.load_working_stack();
        agent
    }

@ -287,165 +271,41 @@ impl Agent {
        }
    }

-    /// Assemble the full message list for the API call from typed sources.
-    /// System prompt + personality context + journal + conversation messages.
-    pub fn assemble_api_messages(&self) -> Vec<Message> {
-        let mut msgs = Vec::new();
-        // System section
-        for e in self.context.system.entries() {
-            msgs.push(e.entry.api_message().clone());
-        }
-        // Identity — render personality files + working stack into one user message
-        let ctx = self.context.render_context_message();
-        if !ctx.is_empty() {
-            msgs.push(Message::user(ctx));
-        }
-        // Journal — render into one user message
-        let jnl = self.context.render_journal();
-        if !jnl.is_empty() {
-            msgs.push(Message::user(jnl));
-        }
-        // Conversation entries
-        msgs.extend(self.context.conversation.entries().iter()
-            .filter(|e| !e.entry.is_log() && !e.entry.is_thinking())
-            .map(|e| e.entry.api_message().clone()));
-        msgs
-    }
-
-    /// Assemble the full prompt as token IDs for the completions API.
-    /// System section (includes tools), identity, journal, conversation,
-    /// then the assistant prompt suffix.
+    /// Assemble the full prompt as token IDs.
+    /// Context sections + assistant prompt suffix.
    pub fn assemble_prompt_tokens(&self) -> Vec<u32> {
-        let mut tokens = Vec::new();
-
-        // System section — includes system prompt + tool definitions
-        for e in self.context.system.entries() {
-            tokens.extend(&e.token_ids);
-        }
-
-        // Identity — rendered as one user message
-        let ctx = self.context.render_context_message();
-        if !ctx.is_empty() {
-            tokens.extend(tokenizer::tokenize_entry("user", &ctx));
-        }
-
-        // Journal — rendered as one user message
-        let jnl = self.context.render_journal();
-        if !jnl.is_empty() {
-            tokens.extend(tokenizer::tokenize_entry("user", &jnl));
-        }
-
-        // Conversation entries — use cached token_ids
-        for e in self.context.conversation.entries() {
-            if e.entry.is_log() || e.entry.is_thinking() { continue; }
-            tokens.extend(&e.token_ids);
-        }
-
-        // Prompt the assistant to respond
+        let mut tokens = self.context.token_ids();
        tokens.push(tokenizer::IM_START);
        tokens.extend(tokenizer::encode("assistant\n"));
-
        tokens
    }

-    /// Run agent orchestration cycle, returning structured output.
-    /// Push a conversation message — stamped and logged.
-    pub fn push_message(&mut self, mut msg: Message) {
-        msg.stamp();
-        let entry = ConversationEntry::Message(msg);
-        self.push_entry(entry);
-    }
-
-    pub fn push_entry(&mut self, entry: ConversationEntry) {
+    /// Push a node into the conversation and log it.
+    pub fn push_node(&mut self, node: AstNode) {
        if let Some(ref log) = self.conversation_log {
-            if let Err(e) = log.append(&entry) {
+            if let Err(e) = log.append_node(&node) {
                eprintln!("warning: failed to log entry: {:#}", e);
            }
        }
-        self.context.conversation.push(ContextEntry::new(
-            entry, Some(chrono::Utc::now())));
-
+        self.context.push(Section::Conversation, node);
        self.changed.notify_one();
    }

-    /// Find the index of the in-progress streaming entry (unstamped assistant message).
-    fn streaming_index(&self) -> Option<usize> {
-        self.context.conversation.entries().iter().rposition(|ce| {
-            if ce.token_ids.is_empty() { return false; }
-            let m = ce.entry.message();
-            m.role == Role::Assistant && m.timestamp.is_none()
-        })
-    }
-
-    /// Append streaming text to the last entry (creating a partial
-    /// assistant entry if needed). Called by collect_stream per token batch.
-    fn append_streaming(&mut self, text: &str) {
-        if let Some(idx) = self.streaming_index() {
-            assert!(!self.context.conversation.entries()[idx].token_ids.is_empty(),
-                "streaming_index returned entry with empty token_ids at {}", idx);
-            let mut msg = self.context.conversation.entries()[idx].entry.message().clone();
-            msg.append_content(text);
-            self.context.conversation.set_message(idx, msg);
-        } else {
-            self.context.conversation.push(ContextEntry::new(
-                ConversationEntry::Message(Message {
-                    role: Role::Assistant,
-                    content: Some(MessageContent::Text(text.to_string())),
-                    tool_calls: None,
-                    tool_call_id: None,
-                    name: None,
-                    timestamp: None,
-                }),
-                None,
-            ));
-        }
-
-        self.changed.notify_one();
-    }
-
-    /// Finalize the streaming entry with the complete response message.
-    /// Finds the unstamped assistant entry, replaces it via set() with proper token count.
-    fn finalize_streaming(&mut self, msg: Message) {
-        if let Some(i) = self.streaming_index() {
-            let mut stamped = msg.clone();
-            stamped.stamp();
-            self.context.conversation.set(i, ContextEntry::new(
-                ConversationEntry::Message(stamped),
-                Some(chrono::Utc::now()),
-            ));
-        } else {
-            self.push_message(msg.clone());
-        }
-
-        // Log the finalized entry
-        if let Some(ref log) = self.conversation_log {
-            let entry = ConversationEntry::Message(msg);
-            if let Err(e) = log.append(&entry) {
-                eprintln!("warning: failed to log finalized entry: {:#}", e);
-            }
-        }
-
-        self.changed.notify_one();
-    }
-
-    /// Send a user message and run the agent loop until the model
-    /// produces a text response (no more tool calls). Streams text
-    /// and tool activity through the UI channel.
-    ///
-    /// Takes Arc<Mutex<Agent>> and manages locking internally so the
-    /// lock is never held across I/O (API streaming, tool dispatch).
+    /// Run the agent turn loop: assemble prompt, stream response,
+    /// parse into AST, dispatch tool calls, repeat until text response.
    pub async fn turn(
        agent: Arc<tokio::sync::Mutex<Agent>>,
    ) -> Result<TurnResult> {
-        // --- Pre-loop setup (lock 1): collect finished tools ---
        let active_tools = {
-            let mut finished = Vec::new();
-            let tools = {
            let me = agent.lock().await;
+            me.active_tools.clone()
+        };

-                // Collect completed background tool handles — remove from active list
-                // but don't await yet (MutexGuard isn't Send).
-                let mut tools = me.active_tools.lock().unwrap();
+        // Collect finished background tools
+        {
+            let mut finished = Vec::new();
+            {
+                let mut tools = active_tools.lock().unwrap();
                let mut i = 0;
                while i < tools.len() {
                    if tools[i].handle.is_finished() {
@ -454,94 +314,117 @@ impl Agent {
                        i += 1;
                    }
                }
-
-                me.active_tools.clone()
-            };
-
-            // Await finished handles without holding the agent lock
-            let mut bg_results = Vec::new();
+            }
+            if !finished.is_empty() {
+                let mut results = Vec::new();
                for entry in finished {
                    if let Ok((call, output)) = entry.handle.await {
-                    bg_results.push((call, output));
+                        results.push((call, output));
                    }
                }
-
-            // Re-acquire to apply background tool results
-            if !bg_results.is_empty() {
                let mut me = agent.lock().await;
                let mut bg_ds = DispatchState::new();
-                for (call, output) in bg_results {
+                for (call, output) in results {
                    me.apply_tool_result(&call, output, &mut bg_ds);
                }
            }
-
-            tools
-        };
+        }

        let mut overflow_retries: u32 = 0;
        let mut empty_retries: u32 = 0;
        let mut ds = DispatchState::new();

        loop {
-            // --- Lock 2: assemble messages, start stream ---
            let _thinking = start_activity(&agent, "thinking...").await;
+
+            // Assemble prompt and start stream (brief lock)
            let (mut rx, _stream_guard) = {
                let me = agent.lock().await;
-                let sampling = api::SamplingParams {
+                let prompt_tokens = me.assemble_prompt_tokens();
+                me.client.stream_completion(
+                    &prompt_tokens,
+                    api::SamplingParams {
                        temperature: me.temperature,
                        top_p: me.top_p,
                        top_k: me.top_k,
-                };
-                if tokenizer::is_initialized() {
-                    let prompt_tokens = me.assemble_prompt_tokens();
-                    me.client.start_stream_completions(
-                        &prompt_tokens,
-                        sampling,
+                    },
                    None,
                )
-                } else {
-                    let api_messages = me.assemble_api_messages();
-                    me.client.start_stream(
-                        &api_messages,
-                        &me.tools,
-                        &me.reasoning_effort,
-                        sampling,
-                        None,
-                    )
-                }
            };
-            // --- Lock released ---

-            // --- Stream loop (no lock) ---
-            let sr = api::collect_stream(
-                &mut rx, &agent, &active_tools,
+            // Create assistant branch and parser (brief lock)
+            let branch_idx = {
+                let mut me = agent.lock().await;
+                let idx = me.context.len(Section::Conversation);
+                me.context.push(Section::Conversation,
+                    AstNode::branch(Role::Assistant, vec![]));
+                idx
+            };
+            let mut parser = ResponseParser::new(branch_idx);
+            let mut pending_calls: Vec<PendingToolCall> = Vec::new();
+            let mut had_content = false;
+            let mut stream_error: Option<String> = None;
+
+            // Stream loop — no lock held across I/O
+            while let Some(event) = rx.recv().await {
+                match event {
+                    api::StreamToken::Token { text, id: _ } => {
+                        had_content = true;
+                        let mut me = agent.lock().await;
+                        let calls = parser.feed(&text, &mut me.context);
+                        for call in calls {
+                            // Dispatch tool call immediately
+                            let call_clone = call.clone();
+                            let agent_handle = agent.clone();
+                            let handle = tokio::spawn(async move {
+                                let args: serde_json::Value =
+                                    serde_json::from_str(&call_clone.arguments).unwrap_or_default();
+                                let output = tools::dispatch_with_agent(
+                                    &call_clone.name, &args, Some(agent_handle),
                                ).await;
-            let api::StreamResult {
-                content, tool_calls, usage, finish_reason,
-                error: stream_error, display_buf, in_tool_call, reasoning,
-            } = sr;
-            // --- Stream complete ---
-
-            // Push thinking entry if model produced reasoning
-            if !reasoning.is_empty() {
-                let mut me = agent.lock().await;
-                me.push_entry(context::ConversationEntry::Thinking(reasoning));
+                                (call_clone, output)
+                            });
+                            active_tools.lock().unwrap().push(tools::ActiveToolCall {
+                                id: call.id.clone(),
+                                name: call.name.clone(),
+                                detail: call.arguments.clone(),
+                                started: std::time::Instant::now(),
+                                background: false,
+                                handle,
+                            });
+                            pending_calls.push(call);
+                        }
+                    }
+                    api::StreamToken::Error(e) => {
+                        stream_error = Some(e);
+                        break;
+                    }
+                    api::StreamToken::Done { usage } => {
+                        if let Some(u) = usage {
+                            agent.lock().await.last_prompt_tokens = u.prompt_tokens;
+                        }
+                        break;
+                    }
+                }
            }

-            // --- Lock 3: process results ---
-            let (msg, pending) = {
+            // Flush parser remainder
+            {
                let mut me = agent.lock().await;
+                parser.finish(&mut me.context);
+            }

-                // Handle stream errors with retry logic
+            // Handle errors
            if let Some(e) = stream_error {
                let err = anyhow::anyhow!("{}", e);
-                    if crate::agent::context::is_context_overflow(&err) && overflow_retries < 2 {
+                let mut me = agent.lock().await;
+                if context_new::is_context_overflow(&err) && overflow_retries < 2 {
                    overflow_retries += 1;
                    me.notify(format!("context overflow — retrying ({}/2)", overflow_retries));
                    me.compact();
                    continue;
                }
-                    if crate::agent::context::is_stream_error(&err) && empty_retries < 2 {
+                if context_new::is_stream_error(&err) && empty_retries < 2 {
                    empty_retries += 1;
                    me.notify(format!("stream error — retrying ({}/2)", empty_retries));
                    drop(me);
@ -551,33 +434,12 @@ impl Agent {
                return Err(err);
            }

-                if finish_reason.as_deref() == Some("error") {
-                    let detail = if content.is_empty() { "no details".into() } else { content };
-                    return Err(anyhow::anyhow!("model stream error: {}", detail));
-                }
-
-                // Flush remaining display buffer to streaming entry
-                if !in_tool_call && !display_buf.is_empty() {
-                    me.append_streaming(&display_buf);
-                }
-
-                let msg = api::build_response_message(content, tool_calls);
-
-                if let Some(usage) = &usage {
-                    me.last_prompt_tokens = usage.prompt_tokens;
-                }
-
            // Empty response — nudge and retry
-                let has_content = msg.content.is_some();
-                let has_tools = msg.tool_calls.as_ref().map_or(false, |tc| !tc.is_empty());
-                if !has_content && !has_tools {
+            if !had_content && pending_calls.is_empty() {
                if empty_retries < 2 {
                    empty_retries += 1;
-                        dbglog!(
-                            "empty response, injecting nudge and retrying ({}/2)",
-                            empty_retries,
-                        );
-                        me.push_message(Message::user(
+                    let mut me = agent.lock().await;
+                    me.push_node(AstNode::user_msg(
                        "[system] Your previous response was empty. \
                         Please respond with text or use a tool."
                    ));
@ -587,59 +449,46 @@ impl Agent {
                empty_retries = 0;
            }

-                // Collect non-background tool calls fired during streaming
+            // Wait for tool calls to complete
+            if !pending_calls.is_empty() {
+                ds.had_tool_calls = true;
+
+                // Collect non-background tool handles
+                let mut handles = Vec::new();
+                {
                    let mut tools_guard = active_tools.lock().unwrap();
-                let mut non_bg = Vec::new();
                    let mut i = 0;
                    while i < tools_guard.len() {
                        if !tools_guard[i].background {
-                        non_bg.push(tools_guard.remove(i));
+                            handles.push(tools_guard.remove(i));
                        } else {
                            i += 1;
                        }
                    }
-                (msg, non_bg)
-            };
-
-            if !pending.is_empty() {
-                agent.lock().await.finalize_streaming(msg.clone());
-
-                // Drop lock before awaiting tool handles
-                let mut results = Vec::new();
-                for entry in pending {
-                    if let Ok(r) = entry.handle.await {
-                        results.push(r);
                }
-                }
-                // Reacquire to apply results
+
+                for entry in handles {
+                    if let Ok((call, output)) = entry.handle.await {
                        let mut me = agent.lock().await;
-                for (call, output) in results {
                        me.apply_tool_result(&call, output, &mut ds);
                    }
-                continue;
-            }
-
-            // Tool calls (structured API path)
-            if let Some(ref tool_calls) = msg.tool_calls {
-                if !tool_calls.is_empty() {
-                    agent.lock().await.finalize_streaming(msg.clone());
-                    let calls: Vec<ToolCall> = tool_calls.clone();
-                    // Drop lock before tool dispatch
-                    for call in &calls {
-                        Agent::dispatch_tool_call_unlocked(
-                            &agent, &active_tools, call, &mut ds,
-                        ).await;
                }
                continue;
            }
-            }

-            // Genuinely text-only response
-            let text = msg.content_text().to_string();
+            // Text-only response — extract text and return
+            let text = {
+                let me = agent.lock().await;
+                let children = me.context.conversation()[branch_idx].children();
+                children.iter()
+                    .filter_map(|c| c.leaf())
+                    .filter(|l| matches!(l.body(), NodeBody::Content(_)))
+                    .map(|l| l.body().text())
+                    .collect::<Vec<_>>()
+                    .join("")
+            };
+
            let mut me = agent.lock().await;
-            me.finalize_streaming(msg);
-
-            // Drain pending control flags
            if me.pending_yield { ds.yield_requested = true; me.pending_yield = false; }
            if me.pending_model_switch.is_some() { ds.model_switch = me.pending_model_switch.take(); }
            if me.pending_dmn_pause { ds.dmn_pause = true; me.pending_dmn_pause = false; }