forked from kent/consciousness
context: salvage in-flight tag accumulators on premature stream end
ResponseParser.finish() was only flushing self.buf — the rolling tail window — and silently dropping self.think_buf and self.tool_call_buf. When a stream ended inside an unterminated <think>...</think> or <tool_call>...</tool_call> block (max_tokens reached, EOS before the close tag, server-side cancel), all the accumulated in-tag content was discarded and only the trailing ~8 bytes survived (drain_safe keeps `close_tag.len()` bytes at the tail of buf to handle across-chunk tag splits — and `</think>` is exactly 8 chars). Symptom: assistant responses cut off, only the last few characters come through. Especially severe in native-think mode where in_think is set from prefill, so the entire response accumulates in think_buf and gets wiped on premature stop. In finish(): if in_think, drain buf into think_buf and emit as a Thinking node (preserving the partial thought). If in_tool_call, attempt to parse the body; on parse failure, wrap the partial as content with the leading <tool_call> open tag so the model sees its own truncated attempt next turn rather than losing it. Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
parent
c2433c1773
commit
371b40078d
1 changed file with 37 additions and 1 deletion
|
|
@ -900,7 +900,43 @@ impl ResponseParser {
|
|||
}
|
||||
|
||||
pub fn finish(mut self, ctx: &mut ContextState) {
|
||||
if !self.buf.is_empty() {
|
||||
// Salvage any in-flight tag accumulators if the stream ended
|
||||
// before the close tag arrived (max_tokens, premature EOS,
|
||||
// server-side cancel). Without this, an unterminated
|
||||
// <think>...</think> drops all of self.think_buf and only the
|
||||
// trailing rolling window in self.buf survives — observed as
|
||||
// "responses cut off, only the last ~8 characters come
|
||||
// through" because drain_safe keeps `close_tag.len()` bytes
|
||||
// (8 for `</think>`) at the tail of buf.
|
||||
if self.in_think {
|
||||
if !self.buf.is_empty() {
|
||||
self.think_buf.push_str(&std::mem::take(&mut self.buf));
|
||||
}
|
||||
let text = std::mem::take(&mut self.think_buf).trim().to_string();
|
||||
if !text.is_empty() {
|
||||
self.push_child(ctx, AstNode::thinking(text));
|
||||
}
|
||||
self.in_think = false;
|
||||
} else if self.in_tool_call {
|
||||
if !self.buf.is_empty() {
|
||||
self.tool_call_buf.push_str(&std::mem::take(&mut self.buf));
|
||||
}
|
||||
let body = std::mem::take(&mut self.tool_call_buf);
|
||||
match parse_tool_call_body(&body) {
|
||||
Some((name, args)) => {
|
||||
self.flush_content(ctx);
|
||||
self.push_child(ctx, AstNode::tool_call(&name, &args));
|
||||
}
|
||||
None => {
|
||||
// Body's likely incomplete (no `</tool_call>` ever
|
||||
// arrived). Wrap as content with the open tag so the
|
||||
// model can see its own truncated attempt next turn
|
||||
// rather than losing it silently.
|
||||
self.content_parts.push(format!("<tool_call>\n{}", body));
|
||||
}
|
||||
}
|
||||
self.in_tool_call = false;
|
||||
} else if !self.buf.is_empty() {
|
||||
self.content_parts.push(std::mem::take(&mut self.buf));
|
||||
}
|
||||
self.flush_content(ctx);
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue