WIP: Wiring context_new into agent — turn loop, StreamToken, dead code removal

Work in progress: the new turn loop is driven by ResponseParser + StreamToken.
Removed StreamEvent, append_streaming, finalize_streaming, streaming_index,
assemble_api_messages, and working_stack. Several methods still reference the
old types — fixing those next.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-04-08 14:55:10 -04:00
parent 648356ae40
commit 9c79d7a037
4 changed files with 202 additions and 424 deletions

View file

@ -50,28 +50,10 @@ fn tools_to_json_str(tools: &[agent_tools::Tool]) -> String {
format!("[{}]", inner.join(","))
}
/// Events produced by the streaming API backends.
/// The runner reads these and decides what to display where.
pub(crate) enum StreamEvent {
/// Content token from the model's response.
Content(String),
/// Reasoning/thinking token (internal monologue).
Reasoning(String),
/// Incremental tool call delta (structured, from APIs that support it).
ToolCallDelta {
index: usize,
id: Option<String>,
call_type: Option<String>,
name: Option<String>,
arguments: Option<String>,
},
/// Token usage stats.
Usage(Usage),
/// Stream finished.
Finished {
reason: String,
},
/// Error from the stream.
/// One token from the streaming completions API.
pub(crate) enum StreamToken {
    /// A single decoded token: the text fragment plus the raw token id.
    /// When one SSE chunk carries several token ids, only the first
    /// `Token` gets the chunk's text; the rest have an empty string.
    Token { text: String, id: u32 },
    /// Stream finished. Carries final usage stats if the API reported any.
    Done { usage: Option<Usage> },
    /// Stream failed; payload is the stringified transport/API error.
    Error(String),
}
@ -133,14 +115,14 @@ impl ApiClient {
(rx, AbortOnDrop(handle))
}
/// Start a streaming completion with raw token IDs.
/// No message formatting — the caller provides the complete prompt as tokens.
pub(crate) fn start_stream_completions(
/// Stream a completion with raw token IDs.
/// Returns (text, token_id) per token via channel.
pub(crate) fn stream_completion(
&self,
prompt_tokens: &[u32],
sampling: SamplingParams,
priority: Option<i32>,
) -> (mpsc::UnboundedReceiver<StreamEvent>, AbortOnDrop) {
) -> (mpsc::UnboundedReceiver<StreamToken>, AbortOnDrop) {
let (tx, rx) = mpsc::unbounded_channel();
let client = self.client.clone();
let api_key = self.api_key.clone();
@ -154,7 +136,7 @@ impl ApiClient {
&prompt_tokens, &tx, sampling, priority,
).await;
if let Err(e) = result {
let _ = tx.send(StreamEvent::Error(e.to_string()));
let _ = tx.send(StreamToken::Error(e.to_string()));
}
});

View file

@ -9,7 +9,7 @@ use tokio::sync::mpsc;
use super::http::HttpClient;
use super::types::*;
use super::StreamEvent;
use super::StreamToken;
/// Stream SSE events from an OpenAI-compatible endpoint, sending
/// parsed StreamEvents through the channel. The caller (runner)
@ -186,16 +186,16 @@ pub(super) async fn stream_events(
Ok(())
}
/// Stream from the /v1/completions endpoint using raw token IDs.
/// Tool calls come as text (<tool_call> tags) and are parsed by the caller.
/// Thinking content comes as <think> tags and is split into Reasoning events.
/// Stream from /v1/completions with raw token IDs in and out.
/// Each SSE chunk yields one token (text + id). All parsing (think tags,
/// tool calls) is handled by the ResponseParser, not here.
pub(super) async fn stream_completions(
client: &HttpClient,
base_url: &str,
api_key: &str,
model: &str,
prompt_tokens: &[u32],
tx: &mpsc::UnboundedSender<StreamEvent>,
tx: &mpsc::UnboundedSender<StreamToken>,
sampling: super::SamplingParams,
priority: Option<i32>,
) -> Result<()> {
@ -207,6 +207,8 @@ pub(super) async fn stream_completions(
"top_p": sampling.top_p,
"top_k": sampling.top_k,
"stream": true,
"return_token_ids": true,
"skip_special_tokens": false,
"stop_token_ids": [super::super::tokenizer::IM_END],
});
if let Some(p) = priority {
@ -229,20 +231,15 @@ pub(super) async fn stream_completions(
let mut reader = super::SseReader::new();
let mut content_len: usize = 0;
let mut first_content_at = None;
let mut finish_reason = None;
let mut usage = None;
let mut in_think = false;
while let Some(event) = reader.next_event(&mut response).await? {
if let Some(err_msg) = event["error"]["message"].as_str() {
anyhow::bail!("API error in stream: {}", err_msg);
}
// Completions chunks have a simpler structure
if let Some(u) = event["usage"].as_object() {
if let Ok(u) = serde_json::from_value::<Usage>(serde_json::Value::Object(u.clone())) {
let _ = tx.send(StreamEvent::Usage(u.clone()));
usage = Some(u);
}
}
@ -253,78 +250,27 @@ pub(super) async fn stream_completions(
};
for choice in choices {
if let Some(reason) = choice["finish_reason"].as_str() {
if reason != "null" {
finish_reason = Some(reason.to_string());
}
}
let text = choice["text"].as_str().unwrap_or("");
let token_ids = choice["token_ids"].as_array();
if let Some(text) = choice["text"].as_str() {
if text.is_empty() { continue; }
// Handle <think> tags — split into Reasoning vs Content
if text.contains("<think>") || in_think {
// Simple state machine for think tags
let mut remaining = text;
while !remaining.is_empty() {
if in_think {
if let Some(end) = remaining.find("</think>") {
let thinking = &remaining[..end];
if !thinking.is_empty() {
let _ = tx.send(StreamEvent::Reasoning(thinking.to_string()));
}
remaining = &remaining[end + 8..];
in_think = false;
} else {
let _ = tx.send(StreamEvent::Reasoning(remaining.to_string()));
break;
}
} else {
if let Some(start) = remaining.find("<think>") {
let content = &remaining[..start];
if !content.is_empty() {
content_len += content.len();
if first_content_at.is_none() {
first_content_at = Some(reader.stream_start.elapsed());
}
let _ = tx.send(StreamEvent::Content(content.to_string()));
}
remaining = &remaining[start + 7..];
in_think = true;
} else {
content_len += remaining.len();
if first_content_at.is_none() {
first_content_at = Some(reader.stream_start.elapsed());
}
let _ = tx.send(StreamEvent::Content(remaining.to_string()));
break;
}
}
if let Some(ids) = token_ids {
for (i, id_val) in ids.iter().enumerate() {
if let Some(id) = id_val.as_u64() {
content_len += text.len();
let _ = tx.send(StreamToken::Token {
text: if i == 0 { text.to_string() } else { String::new() },
id: id as u32,
});
}
} else {
content_len += text.len();
if first_content_at.is_none() {
first_content_at = Some(reader.stream_start.elapsed());
}
let _ = tx.send(StreamEvent::Content(text.to_string()));
}
} else if !text.is_empty() {
// Fallback: text without token IDs (shouldn't happen with return_token_ids)
content_len += text.len();
let _ = tx.send(StreamToken::Token { text: text.to_string(), id: 0 });
}
}
}
let total_elapsed = reader.stream_start.elapsed();
super::log_diagnostics(
content_len, 0, 0, "none",
&finish_reason,
reader.chunks_received,
reader.sse_lines_parsed,
reader.sse_parse_errors,
0, total_elapsed, first_content_at,
&usage, &[],
);
let reason = finish_reason.unwrap_or_default();
let _ = tx.send(StreamEvent::Finished { reason });
let _ = tx.send(StreamToken::Done { usage });
Ok(())
}