From 371b40078dcff9c668791f7c0933383f25c110c5 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Apr 2026 23:32:44 -0400 Subject: [PATCH] context: salvage in-flight tag accumulators on premature stream end MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ResponseParser.finish() was only flushing self.buf — the rolling tail window — and silently dropping self.think_buf and self.tool_call_buf. When a stream ended inside an unterminated <think>...</think> or <tool_call>...</tool_call> block (max_tokens reached, EOS before the close tag, server-side cancel), all the accumulated in-tag content was discarded and only the trailing ~8 bytes survived (drain_safe keeps `close_tag.len()` bytes at the tail of buf to handle across-chunk tag splits — and `</think>` is exactly 8 chars). Symptom: assistant responses cut off, only the last few characters come through. Especially severe in native-think mode where in_think is set from prefill, so the entire response accumulates in think_buf and gets wiped on premature stop. In finish(): if in_think, drain buf into think_buf and emit as a Thinking node (preserving the partial thought). If in_tool_call, attempt to parse the body; on parse failure, wrap the partial as content with the leading open tag so the model sees its own truncated attempt next turn rather than losing it. Co-Authored-By: Proof of Concept --- src/agent/context.rs | 38 +++++++++++++++++++++++++++++++++++++- 1 file changed, 37 insertions(+), 1 deletion(-) diff --git a/src/agent/context.rs b/src/agent/context.rs index d61136f..a42beeb 100644 --- a/src/agent/context.rs +++ b/src/agent/context.rs @@ -900,7 +900,43 @@ impl ResponseParser { } pub fn finish(mut self, ctx: &mut ContextState) { - if !self.buf.is_empty() { + // Salvage any in-flight tag accumulators if the stream ended + // before the close tag arrived (max_tokens, premature EOS, + // server-side cancel). Without this, an unterminated + // <think>...</think>
drops all of self.think_buf and only the + // trailing rolling window in self.buf survives — observed as + // "responses cut off, only the last ~8 characters come + // through" because drain_safe keeps `close_tag.len()` bytes + // (8 for `</think>`) at the tail of buf. + if self.in_think { + if !self.buf.is_empty() { + self.think_buf.push_str(&std::mem::take(&mut self.buf)); + } + let text = std::mem::take(&mut self.think_buf).trim().to_string(); + if !text.is_empty() { + self.push_child(ctx, AstNode::thinking(text)); + } + self.in_think = false; + } else if self.in_tool_call { + if !self.buf.is_empty() { + self.tool_call_buf.push_str(&std::mem::take(&mut self.buf)); + } + let body = std::mem::take(&mut self.tool_call_buf); + match parse_tool_call_body(&body) { + Some((name, args)) => { + self.flush_content(ctx); + self.push_child(ctx, AstNode::tool_call(&name, &args)); + } + None => { + // Body's likely incomplete (no `</tool_call>` ever + // arrived). Wrap as content with the open tag so the + // model can see its own truncated attempt next turn + // rather than losing it silently. + self.content_parts.push(format!("\n{}", body)); + } + } + self.in_tool_call = false; + } else if !self.buf.is_empty() { self.content_parts.push(std::mem::take(&mut self.buf)); } self.flush_content(ctx);