consciousness/src/user/parsing.rs
ProofOfConcept d25033b9f4 fire XML tool calls as they arrive during streaming
When </tool_call> is detected in the content stream, parse and
dispatch immediately via FuturesOrdered. Tool calls execute
concurrently while the stream continues. Results collected in
order after the stream ends.

Structured API path (ToolCallDelta) unchanged — still uses
post-stream parallel dispatch.

Co-Developed-By: Kent Overstreet <kent.overstreet@linux.dev>
2026-04-03 22:38:30 -04:00

209 lines
7.6 KiB
Rust

// parsing.rs — Tool call parsing for leaked/streamed XML
//
// When models stream tool calls as XML text (Qwen-style <tool_call>
// blocks) rather than structured tool_calls, this module extracts
// them from the response text.
//
// Handles two wire formats:
// - Qwen XML: <function=name><parameter=key>value</parameter></function>
// - JSON: {"name": "...", "arguments": {...}}
//
// Also handles streaming artifacts: whitespace inside XML tags from
// token boundaries, </think> tags, etc.
use crate::user::types::*;
/// Parse a single tool call body — the text found between `<tool_call>` and
/// `</tool_call>`, without the surrounding tags.
///
/// Whitespace inside tags is normalized first, then the body is tried as
/// Qwen-style XML and, if that does not match, as JSON. Returns `None` when
/// neither format matches.
///
/// NOTE(review): the id counter is local to each invocation, so every call
/// returned from here carries the id `leaked_1`. Confirm that callers which
/// dispatch several calls per response do not rely on distinct ids.
pub fn parse_tool_call_body(body: &str) -> Option<ToolCall> {
    let cleaned = normalize_xml_tags(body);
    let payload = cleaned.trim();
    let mut next_id = 0u32;

    match parse_xml_tool_call(payload, &mut next_id) {
        Some(call) => Some(call),
        None => parse_json_tool_call(payload, &mut next_id),
    }
}
/// Extract every complete `<tool_call>...</tool_call>` block from `text`.
///
/// Each block body is tried as Qwen-style XML first and as JSON second;
/// bodies matching neither format are skipped. Calls are returned in the
/// order they appear, with ids `leaked_1`, `leaked_2`, … counted per
/// invocation. Scanning stops at the first `<tool_call>` that has no
/// closing tag.
pub fn parse_leaked_tool_calls(text: &str) -> Vec<ToolCall> {
    const OPEN: &str = "<tool_call>";
    const CLOSE: &str = "</tool_call>";

    // Streaming tokenizers may split tags across tokens
    // ("<\nfunction\n=\nbash\n>"), so collapse whitespace inside tags first.
    let normalized = normalize_xml_tags(text);
    let mut remaining = normalized.as_str();
    let mut found = Vec::new();
    let mut next_id: u32 = 0;

    while let Some(open_at) = remaining.find(OPEN) {
        let after_open = &remaining[open_at + OPEN.len()..];
        let close_at = match after_open.find(CLOSE) {
            Some(pos) => pos,
            // Unterminated block: nothing further can be parsed.
            None => break,
        };
        let body = after_open[..close_at].trim();
        remaining = &after_open[close_at + CLOSE.len()..];

        // XML takes precedence; JSON is the fallback. An unparseable body
        // contributes nothing.
        let parsed = parse_xml_tool_call(body, &mut next_id)
            .or_else(|| parse_json_tool_call(body, &mut next_id));
        found.extend(parsed);
    }

    found
}
/// Normalize whitespace inside the pseudo-XML tags this module matches on.
///
/// Streaming tokenizers can split a tag across tokens, so
/// `<\nfunction\n=\nbash\n>` becomes `<function=bash>` and
/// `</\nparameter\n>` becomes `</parameter>`.
///
/// A `<` is only treated as the start of a tag when
/// - a `>` follows it with no other `<` in between, and
/// - the text between them, with whitespace removed, names one of
///   `tool_call`, `function`, `parameter` or `think`, optionally preceded by
///   `/` and optionally followed by `=value`.
///
/// Every other `<` is copied through verbatim, so shell redirections and
/// comparisons inside parameter values (`cat < in.txt`, `a < b`), unrelated
/// markup, and an unterminated `<` all keep their original whitespace.
fn normalize_xml_tags(text: &str) -> String {
    const KNOWN_TAGS: [&str; 4] = ["tool_call", "function", "parameter", "think"];

    let mut result = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(lt) = rest.find('<') {
        // Text before the `<` is never touched.
        result.push_str(&rest[..lt]);
        let after = &rest[lt + 1..];

        // Candidate tag interior: everything up to the first `>`, provided no
        // other `<` comes first (that would make this `<` literal text).
        let interior = after
            .find(|c: char| c == '<' || c == '>')
            .filter(|&end| after[end..].starts_with('>'))
            .map(|end| &after[..end]);

        let tag = interior.and_then(|raw| {
            let collapsed: String = raw.chars().filter(|c| !c.is_whitespace()).collect();
            let known = {
                let name = collapsed.strip_prefix('/').unwrap_or(&collapsed);
                let name = name.split('=').next().unwrap_or(name);
                KNOWN_TAGS.iter().any(|&candidate| candidate == name)
            };
            if known {
                Some((raw.len(), collapsed))
            } else {
                None
            }
        });

        match tag {
            Some((raw_len, collapsed)) => {
                result.push('<');
                result.push_str(&collapsed);
                result.push('>');
                // Skip the raw interior plus the one-byte `>`.
                rest = &after[raw_len + 1..];
            }
            None => {
                // Literal `<`: emit it and keep scanning right after it so a
                // real tag later in the text is still recognised.
                result.push('<');
                rest = after;
            }
        }
    }

    result.push_str(rest);
    result
}
/// Split the first Qwen-style `<tag=value>body</tag>` element out of `s`.
///
/// On success returns `(value, body, rest)`: `value` and `body` are trimmed,
/// and `rest` is everything after the closing tag, untrimmed. Returns `None`
/// when the opening `<tag=`, its terminating `>`, or the closing `</tag>` is
/// missing. The first closing tag after the opener ends the element, so
/// same-named elements are not matched as nested.
fn parse_qwen_tag<'a>(s: &'a str, tag: &str) -> Option<(&'a str, &'a str, &'a str)> {
    let opener = format!("<{}=", tag);
    let closer = format!("</{}>", tag);

    let (_, after_opener) = s.split_once(opener.as_str())?;
    let (value, after_value) = after_opener.split_once('>')?;
    let (body, rest) = after_value.split_once(closer.as_str())?;

    Some((value.trim(), body.trim(), rest))
}
/// Parse one Qwen-style XML tool call: a `<function=name>` element wrapping
/// zero or more `<parameter=key>value</parameter>` elements.
///
/// Every parameter value is stored as a JSON string under its key. Returns
/// `None` — leaving `counter` untouched — when no complete
/// `<function=…>…</function>` element is found. On success `counter` is
/// incremented and its new value forms the `leaked_N` id.
fn parse_xml_tool_call(body: &str, counter: &mut u32) -> Option<ToolCall> {
    let (name, params, _) = parse_qwen_tag(body, "function")?;

    // Walk the parameter elements in order, consuming the function body.
    let mut arguments = serde_json::Map::new();
    let mut cursor = params;
    while let Some((key, value, tail)) = parse_qwen_tag(cursor, "parameter") {
        arguments.insert(key.to_owned(), serde_json::Value::String(value.to_owned()));
        cursor = tail;
    }

    *counter += 1;
    let function = FunctionCall {
        name: name.to_owned(),
        arguments: serde_json::to_string(&arguments).unwrap_or_default(),
    };
    Some(ToolCall {
        id: format!("leaked_{}", counter),
        call_type: "function".to_owned(),
        function,
    })
}
/// Parse the JSON tool call format `{"name": "...", "arguments": {...}}`
/// that some models emit instead of XML.
///
/// Returns `None` — leaving `counter` untouched — when `body` is not valid
/// JSON or has no string `name` field. On success `counter` is incremented
/// and its new value forms the `leaked_N` id.
///
/// A missing or `null` `arguments` field is serialized as `{}` rather than
/// `null`, matching what the XML path produces for a call with no
/// parameters. Any other value is re-serialized unchanged.
fn parse_json_tool_call(body: &str, counter: &mut u32) -> Option<ToolCall> {
    let v: serde_json::Value = serde_json::from_str(body).ok()?;
    let name = v.get("name")?.as_str()?;
    let arguments = match v.get("arguments") {
        None | Some(serde_json::Value::Null) => "{}".to_string(),
        Some(args) => serde_json::to_string(args).unwrap_or_default(),
    };

    *counter += 1;
    Some(ToolCall {
        id: format!("leaked_{}", counter),
        call_type: "function".to_string(),
        function: FunctionCall {
            name: name.to_string(),
            arguments,
        },
    })
}
/// Strip tool call XML and thinking tokens from text so the conversation
/// history stays clean.
///
/// Removes every complete `<tool_call>...</tool_call>` block and every
/// `</think>` tag (the thinking text before the tag is kept — it is useful
/// context), then trims surrounding whitespace. A `<tool_call>` with no
/// closing tag is left in place. The result is built from the output of
/// `normalize_xml_tags`, not from the raw input.
pub fn strip_leaked_artifacts(text: &str) -> String {
    const OPEN: &str = "<tool_call>";
    const CLOSE: &str = "</tool_call>";

    let mut cleaned = normalize_xml_tags(text);

    // Cut blocks out in place. Searching restarts from the beginning after
    // each removal so a tag formed by joining the two sides is also caught.
    while let Some(start) = cleaned.find(OPEN) {
        let close_rel = cleaned[start..].find(CLOSE);
        match close_rel {
            Some(rel) => cleaned.replace_range(start..start + rel + CLOSE.len(), ""),
            None => break,
        }
    }

    // Drop </think> tags but keep the thinking text that preceded them.
    cleaned.replace("</think>", "").trim().to_string()
}
#[cfg(test)]
mod tests {
    use super::*;

    // Well-formed Qwen XML preceded by thinking text.
    #[test]
    fn test_leaked_tool_call_clean() {
        let text = "thinking\n</think>\n<tool_call>\n<function=bash>\n<parameter=command>poc-memory used core-personality</parameter>\n</function>\n</tool_call>";
        let calls = parse_leaked_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "bash");
        let args: serde_json::Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
        assert_eq!(args["command"], "poc-memory used core-personality");
    }

    #[test]
    fn test_leaked_tool_call_streamed_whitespace() {
        // Streaming tokenizer splits XML tags across tokens with newlines
        let text = "<tool_call>\n<\nfunction\n=\nbash\n>\n<\nparameter\n=\ncommand\n>pwd</\nparameter\n>\n</\nfunction\n>\n</tool_call>";
        let calls = parse_leaked_tool_calls(text);
        assert_eq!(calls.len(), 1, "should parse streamed format");
        assert_eq!(calls[0].function.name, "bash");
        let args: serde_json::Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
        assert_eq!(args["command"], "pwd");
    }

    // The JSON wire format is the fallback when the body is not Qwen XML.
    #[test]
    fn test_leaked_tool_call_json() {
        let text = "<tool_call>\n{\"name\": \"bash\", \"arguments\": {\"command\": \"ls\"}}\n</tool_call>";
        let calls = parse_leaked_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].id, "leaked_1");
        assert_eq!(calls[0].function.name, "bash");
        let args: serde_json::Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
        assert_eq!(args["command"], "ls");
    }

    // Ids count up across blocks regardless of which format each block uses.
    #[test]
    fn test_leaked_tool_calls_multiple_are_numbered() {
        let text = "<tool_call>\n<function=a>\n</function>\n</tool_call>\n<tool_call>\n{\"name\": \"b\", \"arguments\": {}}\n</tool_call>";
        let calls = parse_leaked_tool_calls(text);
        assert_eq!(calls.len(), 2);
        assert_eq!(calls[0].id, "leaked_1");
        assert_eq!(calls[0].function.name, "a");
        assert_eq!(calls[0].function.arguments, "{}");
        assert_eq!(calls[1].id, "leaked_2");
        assert_eq!(calls[1].function.name, "b");
        assert_eq!(calls[1].function.arguments, "{}");
    }

    // A block that never closes yields nothing.
    #[test]
    fn test_leaked_tool_call_unterminated() {
        assert!(parse_leaked_tool_calls("<tool_call>\n<function=bash>").is_empty());
        assert!(parse_leaked_tool_calls("no tool calls here").is_empty());
    }

    #[test]
    fn test_parse_tool_call_body() {
        let body = "\n<function=bash>\n<parameter=command>pwd</parameter>\n</function>\n";
        let call = parse_tool_call_body(body).expect("body should parse");
        assert_eq!(call.id, "leaked_1");
        assert_eq!(call.function.name, "bash");
        let args: serde_json::Value = serde_json::from_str(&call.function.arguments).unwrap();
        assert_eq!(args["command"], "pwd");
        assert!(parse_tool_call_body("not a tool call").is_none());
    }

    #[test]
    fn test_strip_leaked_artifacts() {
        let text = "thinking\n</think>\n<tool_call>\n<function=bash>\n<parameter=command>pwd</parameter>\n</function>\n</tool_call>\ndone";
        assert_eq!(strip_leaked_artifacts(text), "thinking\n\n\ndone");
        assert_eq!(strip_leaked_artifacts("  plain text  "), "plain text");
    }

    #[test]
    fn test_normalize_preserves_content() {
        let text = "<function=bash>\n<parameter=command>echo hello world</parameter>\n</function>";
        let normalized = normalize_xml_tags(text);
        // Newlines between tags are not inside tags, so preserved
        assert_eq!(normalized, "<function=bash>\n<parameter=command>echo hello world</parameter>\n</function>");
    }

    #[test]
    fn test_normalize_strips_tag_internal_whitespace() {
        let text = "<\nfunction\n=\nbash\n>";
        let normalized = normalize_xml_tags(text);
        assert_eq!(normalized, "<function=bash>");
    }
}