2026-03-25 00:52:41 -04:00
// parsing.rs — Tool call parsing for leaked/streamed XML
//
// When models stream tool calls as XML text (Qwen-style <tool_call>
// blocks) rather than structured tool_calls, this module extracts
// them from the response text.
//
// Handles two wire formats:
// - Qwen XML: <function=name><parameter=key>value</parameter></function>
// - JSON: {"name": "...", "arguments": {...}}
//
// Also handles streaming artifacts: whitespace inside XML tags from
// token boundaries, </think> tags, etc.
2026-04-04 17:58:49 -04:00
use super ::types ::{ ToolCall , FunctionCall } ;
2026-03-25 00:52:41 -04:00
/// Parse leaked tool calls from response text.
/// Looks for `<tool_call>...</tool_call>` blocks and tries both
/// XML and JSON formats for the body.
2026-04-03 22:14:35 -04:00
/// Parse a single tool call body (content between `<tool_call>` and `</tool_call>`).
2026-04-07 13:43:25 -04:00
pub ( super ) fn parse_tool_call_body ( body : & str ) -> Option < ToolCall > {
2026-04-03 22:14:35 -04:00
let normalized = normalize_xml_tags ( body ) ;
let body = normalized . trim ( ) ;
let mut counter = 0 u32 ;
parse_xml_tool_call ( body , & mut counter )
. or_else ( | | parse_json_tool_call ( body , & mut counter ) )
}
2026-04-07 13:43:25 -04:00
pub ( super ) fn parse_leaked_tool_calls ( text : & str ) -> Vec < ToolCall > {
2026-03-25 00:52:41 -04:00
// Normalize whitespace inside XML tags: "<\nfunction\n=\nbash\n>" → "<function=bash>"
// This handles streaming tokenizers that split tags across tokens.
let normalized = normalize_xml_tags ( text ) ;
let text = & normalized ;
let mut calls = Vec ::new ( ) ;
let mut search_from = 0 ;
let mut call_counter : u32 = 0 ;
while let Some ( start ) = text [ search_from .. ] . find ( " <tool_call> " ) {
let abs_start = search_from + start ;
let after_tag = abs_start + " <tool_call> " . len ( ) ;
let end = match text [ after_tag .. ] . find ( " </tool_call> " ) {
Some ( pos ) = > after_tag + pos ,
None = > break ,
} ;
let body = text [ after_tag .. end ] . trim ( ) ;
search_from = end + " </tool_call> " . len ( ) ;
// Try XML format first, then JSON
if let Some ( call ) = parse_xml_tool_call ( body , & mut call_counter ) {
calls . push ( call ) ;
} else if let Some ( call ) = parse_json_tool_call ( body , & mut call_counter ) {
calls . push ( call ) ;
}
}
calls
}
/// Normalize whitespace inside XML-like tags for streaming tokenizers.
///
/// Collapses whitespace between `<` and its closing `>` so that
/// `<\nfunction\n=\nbash\n>` becomes `<function=bash>`, and
/// `</\nparameter\n>` becomes `</parameter>`. Text between tags is copied
/// through untouched.
///
/// A `<` only opens a tag when a `>` follows it before the next `<`.
/// Otherwise it is treated as ordinary content (a shell redirect, a heredoc
/// `<<EOF`, a comparison, or a tag cut off at end of input) and is copied
/// verbatim, so parameter values containing `<` keep their whitespace.
///
/// Limitation: content shaped like `a < b > c` cannot be told apart from a
/// tag and is still collapsed to `a <b> c`.
fn normalize_xml_tags(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(lt) = rest.find('<') {
        // Everything before the `<` is plain content.
        result.push_str(&rest[..lt]);
        result.push('<');
        let after_lt = &rest[lt + 1..];

        match after_lt.find(|c: char| c == '<' || c == '>') {
            // `>` reached first: a real tag, drop the whitespace inside it.
            Some(end) if after_lt[end..].starts_with('>') => {
                result.extend(after_lt[..end].chars().filter(|c| !c.is_whitespace()));
                result.push('>');
                rest = &after_lt[end + 1..];
            }
            // Another `<` reached first: the current `<` was content. Keep
            // what followed it verbatim and rescan from the new `<`.
            Some(next_lt) => {
                result.push_str(&after_lt[..next_lt]);
                rest = &after_lt[next_lt..];
            }
            // No terminator at all: keep the tail verbatim.
            None => {
                result.push_str(after_lt);
                rest = "";
            }
        }
    }

    result.push_str(rest);
    result
}
/// Parse the first Qwen-style `<tag=value>body</tag>` pseudo-XML element in `s`.
///
/// Returns `(value, body, rest)` on success, where `value` and `body` are
/// trimmed and `rest` is everything after the closing tag. Returns `None`
/// if the opening tag, its `>`, or the closing tag cannot be found. Text
/// before the opening tag is ignored.
fn parse_qwen_tag<'a>(s: &'a str, tag: &str) -> Option<(&'a str, &'a str, &'a str)> {
    let opening = format!("<{}=", tag);
    let closing = format!("</{}>", tag);

    // Each split happens at the first occurrence, left to right.
    let (_, after_opening) = s.split_once(opening.as_str())?;
    let (value, after_value) = after_opening.split_once('>')?;
    let (body, rest) = after_value.split_once(closing.as_str())?;

    Some((value.trim(), body.trim(), rest))
}
/// Parse Qwen's XML tool call format.
fn parse_xml_tool_call ( body : & str , counter : & mut u32 ) -> Option < ToolCall > {
let ( func_name , func_body , _ ) = parse_qwen_tag ( body , " function " ) ? ;
let func_name = func_name . to_string ( ) ;
let mut args = serde_json ::Map ::new ( ) ;
let mut rest = func_body ;
while let Some ( ( key , val , remainder ) ) = parse_qwen_tag ( rest , " parameter " ) {
args . insert ( key . to_string ( ) , serde_json ::Value ::String ( val . to_string ( ) ) ) ;
rest = remainder ;
}
* counter + = 1 ;
Some ( ToolCall {
id : format ! ( " leaked_{} " , counter ) ,
call_type : " function " . to_string ( ) ,
function : FunctionCall {
name : func_name ,
arguments : serde_json ::to_string ( & args ) . unwrap_or_default ( ) ,
} ,
} )
}
/// Parse JSON tool call format (some models emit this).
fn parse_json_tool_call ( body : & str , counter : & mut u32 ) -> Option < ToolCall > {
let v : serde_json ::Value = serde_json ::from_str ( body ) . ok ( ) ? ;
let name = v [ " name " ] . as_str ( ) ? ;
let arguments = & v [ " arguments " ] ;
* counter + = 1 ;
Some ( ToolCall {
id : format ! ( " leaked_{} " , counter ) ,
call_type : " function " . to_string ( ) ,
function : FunctionCall {
name : name . to_string ( ) ,
arguments : serde_json ::to_string ( arguments ) . unwrap_or_default ( ) ,
} ,
} )
}
/// Strip tool call XML and thinking tokens from text so the conversation
/// history stays clean. Removes `<tool_call>...</tool_call>` blocks and
/// `</think>` tags (thinking content before them is kept — it's useful context).
2026-04-07 13:43:25 -04:00
pub ( super ) fn strip_leaked_artifacts ( text : & str ) -> String {
2026-03-25 00:52:41 -04:00
let normalized = normalize_xml_tags ( text ) ;
let mut result = normalized . clone ( ) ;
// Remove <tool_call>...</tool_call> blocks
while let Some ( start ) = result . find ( " <tool_call> " ) {
if let Some ( end_pos ) = result [ start .. ] . find ( " </tool_call> " ) {
let end = start + end_pos + " </tool_call> " . len ( ) ;
result = format! ( " {} {} " , & result [ .. start ] , & result [ end .. ] ) ;
} else {
break ;
}
}
// Remove </think> tags (but keep the thinking text before them)
result = result . replace ( " </think> " , " " ) ;
result . trim ( ) . to_string ( )
}
#[ cfg(test) ]
mod tests {
use super ::* ;
// A well-formed Qwen XML tool call that follows thinking text and a
// `</think>` tag: exactly one call is extracted, with function name `bash`
// and the `command` parameter preserved, including its internal spaces.
#[ test ]
fn test_leaked_tool_call_clean ( ) {
let text = " thinking \n </think> \n <tool_call> \n <function=bash> \n <parameter=command>poc-memory used core-personality</parameter> \n </function> \n </tool_call> " ;
let calls = parse_leaked_tool_calls ( text ) ;
assert_eq! ( calls . len ( ) , 1 ) ;
assert_eq! ( calls [ 0 ] . function . name , " bash " ) ;
let args : serde_json ::Value = serde_json ::from_str ( & calls [ 0 ] . function . arguments ) . unwrap ( ) ;
assert_eq! ( args [ " command " ] , " poc-memory used core-personality " ) ;
}
// The same kind of call with newlines injected inside every tag: it must
// still parse to one `bash` call whose `command` is `pwd`.
#[ test ]
fn test_leaked_tool_call_streamed_whitespace ( ) {
// Streaming tokenizer splits XML tags across tokens with newlines
let text = " <tool_call> \n < \n function \n = \n bash \n > \n < \n parameter \n = \n command \n >pwd</ \n parameter \n > \n </ \n function \n > \n </tool_call> " ;
let calls = parse_leaked_tool_calls ( text ) ;
assert_eq! ( calls . len ( ) , 1 , " should parse streamed format " ) ;
assert_eq! ( calls [ 0 ] . function . name , " bash " ) ;
let args : serde_json ::Value = serde_json ::from_str ( & calls [ 0 ] . function . arguments ) . unwrap ( ) ;
assert_eq! ( args [ " command " ] , " pwd " ) ;
}
// Input whose tags contain no internal whitespace must come back from
// normalization unchanged, including whitespace that sits between tags.
#[ test ]
fn test_normalize_preserves_content ( ) {
let text = " <function=bash> \n <parameter=command>echo hello world</parameter> \n </function> " ;
let normalized = normalize_xml_tags ( text ) ;
// Newlines between tags are not inside tags, so preserved
assert_eq! ( normalized , " <function=bash> \n <parameter=command>echo hello world</parameter> \n </function> " ) ;
}
// A single tag split across lines is collapsed to its compact form.
#[ test ]
fn test_normalize_strips_tag_internal_whitespace ( ) {
let text = " < \n function \n = \n bash \n > " ;
let normalized = normalize_xml_tags ( text ) ;
assert_eq! ( normalized , " <function=bash> " ) ;
}
}