diff --git a/poc-agent/src/agent.rs b/poc-agent/src/agent.rs
index e00ffff..11e267d 100644
--- a/poc-agent/src/agent.rs
+++ b/poc-agent/src/agent.rs
@@ -463,7 +463,7 @@ impl Agent {
// No structured tool calls — check for leaked tool calls
// (Qwen sometimes outputs XML as text).
let text = msg.content_text().to_string();
- let leaked = parse_leaked_tool_calls(&text);
+ let leaked = crate::parsing::parse_leaked_tool_calls(&text);
if !leaked.is_empty() {
let _ = ui_tx.send(UiMessage::Debug(format!(
@@ -472,7 +472,7 @@ impl Agent {
)));
// Strip tool call XML and thinking tokens from the message
// so they don't clutter the conversation history.
- let cleaned = strip_leaked_artifacts(&text);
+ let cleaned = crate::parsing::strip_leaked_artifacts(&text);
let mut clean_msg = msg.clone();
clean_msg.content = if cleaned.trim().is_empty() {
None
@@ -1500,199 +1500,5 @@ fn summarize_args(tool_name: &str, args: &serde_json::Value) -> String {
}
}
-/// Parse tool calls leaked as text by models that don't always use the
-/// structured function calling API (notably Qwen).
-///
-/// Handles the XML format:
-///
-/// <tool_call><function=bash><parameter=command>
-/// echo hello
-/// </parameter>
-/// </function></tool_call>
-///
-/// Also handles JSON-in-text format:
-///
-/// <tool_call>{"name": "bash", "arguments": {"command": "echo hello"}}</tool_call>
-///
-fn parse_leaked_tool_calls(text: &str) -> Vec<ToolCall> {
- // Normalize whitespace inside XML tags: "<\nfunction\n=\nbash\n>" → "<function=bash>"
- // This handles streaming tokenizers that split tags across tokens.
- let normalized = normalize_xml_tags(text);
- let text = &normalized;
-
- let mut calls = Vec::new();
- let mut search_from = 0;
- let mut call_counter: u32 = 0;
-
- while let Some(start) = text[search_from..].find("<tool_call>") {
- let abs_start = search_from + start;
- let after_tag = abs_start + "<tool_call>".len();
-
- let end = match text[after_tag..].find("</tool_call>") {
- Some(pos) => after_tag + pos,
- None => break,
- };
-
- let body = text[after_tag..end].trim();
- search_from = end + "</tool_call>".len();
-
- // Try XML format first, then JSON
- if let Some(call) = parse_xml_tool_call(body, &mut call_counter) {
- calls.push(call);
- } else if let Some(call) = parse_json_tool_call(body, &mut call_counter) {
- calls.push(call);
- }
- }
-
- calls
-}
-
-/// Normalize whitespace inside XML-like tags for streaming tokenizers.
-/// Collapses whitespace between `<` and `>` so that `<\nfunction\n=\nbash\n>`
-/// becomes `<function=bash>`, and `</\nparameter\n>` becomes `</parameter>`.
-/// Leaves content between tags untouched.
-fn normalize_xml_tags(text: &str) -> String {
- let mut result = String::with_capacity(text.len());
- let mut chars = text.chars().peekable();
- while let Some(ch) = chars.next() {
- if ch == '<' {
- let mut tag = String::from('<');
- for inner in chars.by_ref() {
- if inner == '>' {
- tag.push('>');
- break;
- } else if inner.is_whitespace() {
- // Skip whitespace inside tags
- } else {
- tag.push(inner);
- }
- }
- result.push_str(&tag);
- } else {
- result.push(ch);
- }
- }
- result
-}
-
-/// Parse a Qwen-style `<tag=value>body</tag>` pseudo-XML element.
-/// Returns `(value, body, rest)` on success.
-fn parse_qwen_tag<'a>(s: &'a str, tag: &str) -> Option<(&'a str, &'a str, &'a str)> {
- let open = format!("<{}=", tag);
- let close = format!("</{}>", tag);
-
- let start = s.find(&open)? + open.len();
- let name_end = start + s[start..].find('>')?;
- let body_start = name_end + 1;
- let body_end = body_start + s[body_start..].find(&close)?;
-
- Some((
- s[start..name_end].trim(),
- s[body_start..body_end].trim(),
- &s[body_end + close.len()..],
- ))
-}
-
-/// Parse Qwen's XML tool call format.
-fn parse_xml_tool_call(body: &str, counter: &mut u32) -> Option<ToolCall> {
- let (func_name, func_body, _) = parse_qwen_tag(body, "function")?;
- let func_name = func_name.to_string();
-
- let mut args = serde_json::Map::new();
- let mut rest = func_body;
- while let Some((key, val, remainder)) = parse_qwen_tag(rest, "parameter") {
- args.insert(key.to_string(), serde_json::Value::String(val.to_string()));
- rest = remainder;
- }
-
- *counter += 1;
- Some(ToolCall {
- id: format!("leaked_{}", counter),
- call_type: "function".to_string(),
- function: FunctionCall {
- name: func_name,
- arguments: serde_json::to_string(&args).unwrap_or_default(),
- },
- })
-}
-
-/// Parse JSON tool call format (some models emit this).
-fn parse_json_tool_call(body: &str, counter: &mut u32) -> Option<ToolCall> {
- let v: serde_json::Value = serde_json::from_str(body).ok()?;
- let name = v["name"].as_str()?;
- let arguments = &v["arguments"];
-
- *counter += 1;
- Some(ToolCall {
- id: format!("leaked_{}", counter),
- call_type: "function".to_string(),
- function: FunctionCall {
- name: name.to_string(),
- arguments: serde_json::to_string(arguments).unwrap_or_default(),
- },
- })
-}
-
-/// Strip tool call XML and thinking tokens from text so the conversation
-/// history stays clean. Removes `<tool_call>...</tool_call>` blocks and
-/// `</think>` tags (thinking content before them is kept — it's useful context).
-fn strip_leaked_artifacts(text: &str) -> String {
- let normalized = normalize_xml_tags(text);
- let mut result = normalized.clone();
-
- // Remove <tool_call>...</tool_call> blocks
- while let Some(start) = result.find("<tool_call>") {
- if let Some(end_pos) = result[start..].find("</tool_call>") {
- let end = start + end_pos + "</tool_call>".len();
- result = format!("{}{}", &result[..start], &result[end..]);
- } else {
- break;
- }
- }
-
- // Remove </think> tags (but keep the thinking text before them)
- result = result.replace("</think>", "");
-
- result.trim().to_string()
-}
-
-#[cfg(test)]
-mod tests {
- use super::*;
-
- #[test]
- fn test_leaked_tool_call_clean() {
- let text = "thinking</think>\n<tool_call>\n<function=bash>\n<parameter=command>\npoc-memory used core-personality</parameter>\n</function>\n</tool_call>";
- let calls = parse_leaked_tool_calls(text);
- assert_eq!(calls.len(), 1);
- assert_eq!(calls[0].function.name, "bash");
- let args: serde_json::Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
- assert_eq!(args["command"], "poc-memory used core-personality");
- }
-
- #[test]
- fn test_leaked_tool_call_streamed_whitespace() {
- // Streaming tokenizer splits XML tags across tokens with newlines
- let text = "<tool_call>\n<\nfunction\n=\nbash\n>\n<\nparameter\n=\ncommand\n>pwd</\nparameter\n>\n</\nfunction\n>\n</tool_call>";
- let calls = parse_leaked_tool_calls(text);
- assert_eq!(calls.len(), 1, "should parse streamed format");
- assert_eq!(calls[0].function.name, "bash");
- let args: serde_json::Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
- assert_eq!(args["command"], "pwd");
- }
-
- #[test]
- fn test_normalize_preserves_content() {
- let text = "<function=bash>\necho hello world\n</function>";
- let normalized = normalize_xml_tags(text);
- // Newlines between tags are not inside tags, so preserved
- assert_eq!(normalized, "<function=bash>\necho hello world\n</function>");
- }
-
- #[test]
- fn test_normalize_strips_tag_internal_whitespace() {
- let text = "<\nfunction\n=\nbash\n>";
- let normalized = normalize_xml_tags(text);
- assert_eq!(normalized, "<function=bash>");
- }
-}
+// Parsing functions (parse_leaked_tool_calls, strip_leaked_artifacts)
+// and their tests live in parsing.rs
diff --git a/poc-agent/src/main.rs b/poc-agent/src/main.rs
index 2cfb487..02e90cf 100644
--- a/poc-agent/src/main.rs
+++ b/poc-agent/src/main.rs
@@ -41,6 +41,7 @@ mod dmn;
mod journal;
mod log;
mod observe;
+mod parsing;
mod tools;
mod tui;
mod types;
diff --git a/poc-agent/src/parsing.rs b/poc-agent/src/parsing.rs
new file mode 100644
index 0000000..b63bd94
--- /dev/null
+++ b/poc-agent/src/parsing.rs
@@ -0,0 +1,200 @@
+// parsing.rs — Tool call parsing for leaked/streamed XML
+//
+// When models stream tool calls as XML text (Qwen-style <tool_call>
+// blocks) rather than structured tool_calls, this module extracts
+// them from the response text.
+//
+// Handles two wire formats:
+// - Qwen XML: <function=name><parameter=key>value</parameter></function>
+// - JSON: {"name": "...", "arguments": {...}}
+//
+// Also handles streaming artifacts: whitespace inside XML tags from
+// token boundaries, </think> tags, etc.
+
+use crate::types::*;
+
+/// Parse leaked tool calls from response text.
+/// Looks for `<tool_call>...</tool_call>` blocks and tries both
+/// XML and JSON formats for the body.
+pub fn parse_leaked_tool_calls(text: &str) -> Vec<ToolCall> {
+ // Normalize whitespace inside XML tags: "<\nfunction\n=\nbash\n>" → "<function=bash>"
+ // This handles streaming tokenizers that split tags across tokens.
+ let normalized = normalize_xml_tags(text);
+ let text = &normalized;
+
+ let mut calls = Vec::new();
+ let mut search_from = 0;
+ let mut call_counter: u32 = 0;
+
+ while let Some(start) = text[search_from..].find("<tool_call>") {
+ let abs_start = search_from + start;
+ let after_tag = abs_start + "<tool_call>".len();
+
+ let end = match text[after_tag..].find("</tool_call>") {
+ Some(pos) => after_tag + pos,
+ None => break,
+ };
+
+ let body = text[after_tag..end].trim();
+ search_from = end + "</tool_call>".len();
+
+ // Try XML format first, then JSON
+ if let Some(call) = parse_xml_tool_call(body, &mut call_counter) {
+ calls.push(call);
+ } else if let Some(call) = parse_json_tool_call(body, &mut call_counter) {
+ calls.push(call);
+ }
+ }
+
+ calls
+}
+
+/// Normalize whitespace inside XML-like tags for streaming tokenizers.
+/// Collapses whitespace between `<` and `>` so that `<\nfunction\n=\nbash\n>`
+/// becomes `<function=bash>`, and `</\nparameter\n>` becomes `</parameter>`.
+/// Leaves content between tags untouched.
+fn normalize_xml_tags(text: &str) -> String {
+ let mut result = String::with_capacity(text.len());
+ let mut chars = text.chars().peekable();
+ while let Some(ch) = chars.next() {
+ if ch == '<' {
+ let mut tag = String::from('<');
+ for inner in chars.by_ref() {
+ if inner == '>' {
+ tag.push('>');
+ break;
+ } else if inner.is_whitespace() {
+ // Skip whitespace inside tags
+ } else {
+ tag.push(inner);
+ }
+ }
+ result.push_str(&tag);
+ } else {
+ result.push(ch);
+ }
+ }
+ result
+}
+
+/// Parse a Qwen-style `<tag=value>body</tag>` pseudo-XML element.
+/// Returns `(value, body, rest)` on success.
+fn parse_qwen_tag<'a>(s: &'a str, tag: &str) -> Option<(&'a str, &'a str, &'a str)> {
+ let open = format!("<{}=", tag);
+ let close = format!("</{}>", tag);
+
+ let start = s.find(&open)? + open.len();
+ let name_end = start + s[start..].find('>')?;
+ let body_start = name_end + 1;
+ let body_end = body_start + s[body_start..].find(&close)?;
+
+ Some((
+ s[start..name_end].trim(),
+ s[body_start..body_end].trim(),
+ &s[body_end + close.len()..],
+ ))
+}
+
+/// Parse Qwen's XML tool call format.
+fn parse_xml_tool_call(body: &str, counter: &mut u32) -> Option<ToolCall> {
+ let (func_name, func_body, _) = parse_qwen_tag(body, "function")?;
+ let func_name = func_name.to_string();
+
+ let mut args = serde_json::Map::new();
+ let mut rest = func_body;
+ while let Some((key, val, remainder)) = parse_qwen_tag(rest, "parameter") {
+ args.insert(key.to_string(), serde_json::Value::String(val.to_string()));
+ rest = remainder;
+ }
+
+ *counter += 1;
+ Some(ToolCall {
+ id: format!("leaked_{}", counter),
+ call_type: "function".to_string(),
+ function: FunctionCall {
+ name: func_name,
+ arguments: serde_json::to_string(&args).unwrap_or_default(),
+ },
+ })
+}
+
+/// Parse JSON tool call format (some models emit this).
+fn parse_json_tool_call(body: &str, counter: &mut u32) -> Option<ToolCall> {
+ let v: serde_json::Value = serde_json::from_str(body).ok()?;
+ let name = v["name"].as_str()?;
+ let arguments = &v["arguments"];
+
+ *counter += 1;
+ Some(ToolCall {
+ id: format!("leaked_{}", counter),
+ call_type: "function".to_string(),
+ function: FunctionCall {
+ name: name.to_string(),
+ arguments: serde_json::to_string(arguments).unwrap_or_default(),
+ },
+ })
+}
+
+/// Strip tool call XML and thinking tokens from text so the conversation
+/// history stays clean. Removes `<tool_call>...</tool_call>` blocks and
+/// `</think>` tags (thinking content before them is kept — it's useful context).
+pub fn strip_leaked_artifacts(text: &str) -> String {
+ let normalized = normalize_xml_tags(text);
+ let mut result = normalized.clone();
+
+ // Remove <tool_call>...</tool_call> blocks
+ while let Some(start) = result.find("<tool_call>") {
+ if let Some(end_pos) = result[start..].find("</tool_call>") {
+ let end = start + end_pos + "</tool_call>".len();
+ result = format!("{}{}", &result[..start], &result[end..]);
+ } else {
+ break;
+ }
+ }
+
+ // Remove </think> tags (but keep the thinking text before them)
+ result = result.replace("</think>", "");
+
+ result.trim().to_string()
+}
+
+#[cfg(test)]
+mod tests {
+ use super::*;
+
+ #[test]
+ fn test_leaked_tool_call_clean() {
+ let text = "thinking</think>\n<tool_call>\n<function=bash>\n<parameter=command>\npoc-memory used core-personality</parameter>\n</function>\n</tool_call>";
+ let calls = parse_leaked_tool_calls(text);
+ assert_eq!(calls.len(), 1);
+ assert_eq!(calls[0].function.name, "bash");
+ let args: serde_json::Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
+ assert_eq!(args["command"], "poc-memory used core-personality");
+ }
+
+ #[test]
+ fn test_leaked_tool_call_streamed_whitespace() {
+ // Streaming tokenizer splits XML tags across tokens with newlines
+ let text = "<tool_call>\n<\nfunction\n=\nbash\n>\n<\nparameter\n=\ncommand\n>pwd</\nparameter\n>\n</\nfunction\n>\n</tool_call>";
+ let calls = parse_leaked_tool_calls(text);
+ assert_eq!(calls.len(), 1, "should parse streamed format");
+ assert_eq!(calls[0].function.name, "bash");
+ let args: serde_json::Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
+ assert_eq!(args["command"], "pwd");
+ }
+
+ #[test]
+ fn test_normalize_preserves_content() {
+ let text = "<function=bash>\necho hello world\n</function>";
+ let normalized = normalize_xml_tags(text);
+ // Newlines between tags are not inside tags, so preserved
+ assert_eq!(normalized, "<function=bash>\necho hello world\n</function>");
+ }
+
+ #[test]
+ fn test_normalize_strips_tag_internal_whitespace() {
+ let text = "<\nfunction\n=\nbash\n>";
+ let normalized = normalize_xml_tags(text);
+ assert_eq!(normalized, "<function=bash>");
+ }
+}