// parsing.rs — Tool call parsing for leaked/streamed XML // // When models stream tool calls as XML text (Qwen-style // blocks) rather than structured tool_calls, this module extracts // them from the response text. // // Handles two wire formats: // - Qwen XML: value // - JSON: {"name": "...", "arguments": {...}} // // Also handles streaming artifacts: whitespace inside XML tags from // token boundaries, tags, etc. use crate::types::*; /// Parse leaked tool calls from response text. /// Looks for `...` blocks and tries both /// XML and JSON formats for the body. pub fn parse_leaked_tool_calls(text: &str) -> Vec { // Normalize whitespace inside XML tags: "<\nfunction\n=\nbash\n>" → "" // This handles streaming tokenizers that split tags across tokens. let normalized = normalize_xml_tags(text); let text = &normalized; let mut calls = Vec::new(); let mut search_from = 0; let mut call_counter: u32 = 0; while let Some(start) = text[search_from..].find("") { let abs_start = search_from + start; let after_tag = abs_start + "".len(); let end = match text[after_tag..].find("") { Some(pos) => after_tag + pos, None => break, }; let body = text[after_tag..end].trim(); search_from = end + "".len(); // Try XML format first, then JSON if let Some(call) = parse_xml_tool_call(body, &mut call_counter) { calls.push(call); } else if let Some(call) = parse_json_tool_call(body, &mut call_counter) { calls.push(call); } } calls } /// Normalize whitespace inside XML-like tags for streaming tokenizers. /// Collapses whitespace between `<` and `>` so that `<\nfunction\n=\nbash\n>` /// becomes ``, and `` becomes ``. /// Leaves content between tags untouched. fn normalize_xml_tags(text: &str) -> String { let mut result = String::with_capacity(text.len()); let mut chars = text.chars().peekable(); while let Some(ch) = chars.next() { if ch == '<' { let mut tag = String::from('<'); for inner in chars.by_ref() { if inner == '>' { tag.push('>'); break; } else if inner.is_whitespace() { // Skip whitespace inside tags } else { tag.push(inner); } } result.push_str(&tag); } else { result.push(ch); } } result } /// Parse a Qwen-style `body` pseudo-XML element. /// Returns `(value, body, rest)` on success. fn parse_qwen_tag<'a>(s: &'a str, tag: &str) -> Option<(&'a str, &'a str, &'a str)> { let open = format!("<{}=", tag); let close = format!("", tag); let start = s.find(&open)? + open.len(); let name_end = start + s[start..].find('>')?; let body_start = name_end + 1; let body_end = body_start + s[body_start..].find(&close)?; Some(( s[start..name_end].trim(), s[body_start..body_end].trim(), &s[body_end + close.len()..], )) } /// Parse Qwen's XML tool call format. fn parse_xml_tool_call(body: &str, counter: &mut u32) -> Option { let (func_name, func_body, _) = parse_qwen_tag(body, "function")?; let func_name = func_name.to_string(); let mut args = serde_json::Map::new(); let mut rest = func_body; while let Some((key, val, remainder)) = parse_qwen_tag(rest, "parameter") { args.insert(key.to_string(), serde_json::Value::String(val.to_string())); rest = remainder; } *counter += 1; Some(ToolCall { id: format!("leaked_{}", counter), call_type: "function".to_string(), function: FunctionCall { name: func_name, arguments: serde_json::to_string(&args).unwrap_or_default(), }, }) } /// Parse JSON tool call format (some models emit this). fn parse_json_tool_call(body: &str, counter: &mut u32) -> Option { let v: serde_json::Value = serde_json::from_str(body).ok()?; let name = v["name"].as_str()?; let arguments = &v["arguments"]; *counter += 1; Some(ToolCall { id: format!("leaked_{}", counter), call_type: "function".to_string(), function: FunctionCall { name: name.to_string(), arguments: serde_json::to_string(arguments).unwrap_or_default(), }, }) } /// Strip tool call XML and thinking tokens from text so the conversation /// history stays clean. Removes `...` blocks and /// `` tags (thinking content before them is kept — it's useful context). pub fn strip_leaked_artifacts(text: &str) -> String { let normalized = normalize_xml_tags(text); let mut result = normalized.clone(); // Remove ... blocks while let Some(start) = result.find("") { if let Some(end_pos) = result[start..].find("") { let end = start + end_pos + "".len(); result = format!("{}{}", &result[..start], &result[end..]); } else { break; } } // Remove tags (but keep the thinking text before them) result = result.replace("", ""); result.trim().to_string() } #[cfg(test)] mod tests { use super::*; #[test] fn test_leaked_tool_call_clean() { let text = "thinking\n\n\n\npoc-memory used core-personality\n\n"; let calls = parse_leaked_tool_calls(text); assert_eq!(calls.len(), 1); assert_eq!(calls[0].function.name, "bash"); let args: serde_json::Value = serde_json::from_str(&calls[0].function.arguments).unwrap(); assert_eq!(args["command"], "poc-memory used core-personality"); } #[test] fn test_leaked_tool_call_streamed_whitespace() { // Streaming tokenizer splits XML tags across tokens with newlines let text = "\n<\nfunction\n=\nbash\n>\n<\nparameter\n=\ncommand\n>pwd\n\n"; let calls = parse_leaked_tool_calls(text); assert_eq!(calls.len(), 1, "should parse streamed format"); assert_eq!(calls[0].function.name, "bash"); let args: serde_json::Value = serde_json::from_str(&calls[0].function.arguments).unwrap(); assert_eq!(args["command"], "pwd"); } #[test] fn test_normalize_preserves_content() { let text = "\necho hello world\n"; let normalized = normalize_xml_tags(text); // Newlines between tags are not inside tags, so preserved assert_eq!(normalized, "\necho hello world\n"); } #[test] fn test_normalize_strips_tag_internal_whitespace() { let text = "<\nfunction\n=\nbash\n>"; let normalized = normalize_xml_tags(text); assert_eq!(normalized, ""); } }