consciousness/src/agent/api/parsing.rs

// parsing.rs — Tool call parsing for leaked/streamed XML
//
// When models stream tool calls as XML text (Qwen-style <tool_call>
// blocks) rather than structured tool_calls, this module extracts
// them from the response text.
//
// Handles two wire formats:
//   - Qwen XML: <function=name><parameter=key>value</parameter></function>
//   - JSON: {"name": "...", "arguments": {...}}
//
// Also handles streaming artifacts: whitespace inside XML tags from
// token boundaries, </think> tags, etc.

use super::types::{ToolCall, FunctionCall};

/// Parse a single tool call body (the content between `<tool_call>` and
/// `</tool_call>`, without the wrapper tags themselves).
///
/// Tag-internal whitespace is normalized first, then the Qwen XML format is
/// tried, falling back to the JSON format. Returns `None` if neither matches.
///
/// NOTE(review): the id counter is local to this call, so every call parsed
/// through this entry point gets the id `leaked_1` — confirm callers do not
/// rely on ids being unique across calls.
pub(super) fn parse_tool_call_body(body: &str) -> Option<ToolCall> {
    let cleaned = normalize_xml_tags(body);
    let trimmed = cleaned.trim();
    let mut next_id = 0u32;

    if let Some(call) = parse_xml_tool_call(trimmed, &mut next_id) {
        return Some(call);
    }
    parse_json_tool_call(trimmed, &mut next_id)
}

/// Parse every leaked tool call found in a response text.
///
/// Scans for `<tool_call>...</tool_call>` blocks in order and parses each
/// body as Qwen XML first, then as JSON. Blocks whose body matches neither
/// format are skipped; scanning stops at an opening tag with no closing tag.
/// Ids are numbered `leaked_1`, `leaked_2`, ... in order of successful parses.
pub(super) fn parse_leaked_tool_calls(text: &str) -> Vec<ToolCall> {
    const OPEN: &str = "<tool_call>";
    const CLOSE: &str = "</tool_call>";

    // Streaming tokenizers can split tags across tokens, e.g.
    // "<\nfunction\n=\nbash\n>"; collapse those before searching.
    let cleaned = normalize_xml_tags(text);

    let mut found = Vec::new();
    let mut next_id: u32 = 0;
    let mut remaining = cleaned.as_str();

    while let Some((_, after_open)) = remaining.split_once(OPEN) {
        let (raw_body, after_close) = match after_open.split_once(CLOSE) {
            Some(parts) => parts,
            None => break,
        };
        remaining = after_close;

        let body = raw_body.trim();
        // XML format takes precedence; JSON is the fallback.
        let parsed = parse_xml_tool_call(body, &mut next_id)
            .or_else(|| parse_json_tool_call(body, &mut next_id));
        found.extend(parsed);
    }

    found
}

/// Normalize whitespace inside the tool-call tags for streaming tokenizers.
///
/// Collapses whitespace between `<` and `>` so that `<\nfunction\n=\nbash\n>`
/// becomes `<function=bash>`, and `</\nparameter\n>` becomes `</parameter>`.
///
/// Only tags whose name (ignoring a leading `/` and any `=value` suffix) is
/// one of `tool_call`, `function`, `parameter` or `think` are rewritten —
/// those are the only tags this module searches for. Everything else is
/// copied through byte-for-byte, including:
///   - a `<` that is not closed by a `>` before the next `<` or end of input
///     (e.g. `a < b`, shell redirects), and
///   - `<...>` spans with any other name (e.g. `x < y and z > w`).
/// This keeps parameter values and surrounding prose intact.
fn normalize_xml_tags(text: &str) -> String {
    const KNOWN_TAGS: [&str; 4] = ["tool_call", "function", "parameter", "think"];

    let mut result = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(lt) = rest.find('<') {
        // Content before the '<' is never touched.
        result.push_str(&rest[..lt]);
        let after = &rest[lt + 1..];

        // A tag candidate must reach '>' before another '<' shows up.
        let stop = after.find(|c: char| c == '<' || c == '>');
        let inner_len = match stop {
            Some(idx) if after[idx..].starts_with('>') => idx,
            _ => {
                // Stray '<': emit it and resume scanning right after it.
                result.push('<');
                rest = after;
                continue;
            }
        };

        let raw_inner = &after[..inner_len];
        let collapsed: String = raw_inner.chars().filter(|c| !c.is_whitespace()).collect();
        let name = collapsed.strip_prefix('/').unwrap_or(collapsed.as_str());
        let name = name.split('=').next().unwrap_or_default();

        result.push('<');
        if KNOWN_TAGS.contains(&name) {
            result.push_str(&collapsed);
        } else {
            // Not one of our tags: keep the original text, whitespace included.
            result.push_str(raw_inner);
        }
        result.push('>');
        rest = &after[inner_len + 1..];
    }

    result.push_str(rest);
    result
}

/// Parse the first Qwen-style `<tag=value>body</tag>` pseudo-XML element in `s`.
///
/// Returns `(value, body, rest)` where `value` and `body` are trimmed and
/// `rest` is everything after the closing tag (untrimmed). Returns `None`
/// if the opening `<tag=`, its terminating `>`, or the closing `</tag>`
/// cannot be found, in that order.
fn parse_qwen_tag<'a>(s: &'a str, tag: &str) -> Option<(&'a str, &'a str, &'a str)> {
    let opener = format!("<{}=", tag);
    let closer = format!("</{}>", tag);

    let (_, after_open) = s.split_once(opener.as_str())?;
    let (value, after_value) = after_open.split_once('>')?;
    let (inner, remainder) = after_value.split_once(closer.as_str())?;

    Some((value.trim(), inner.trim(), remainder))
}

/// Parse Qwen's XML tool call format:
/// `<function=name><parameter=key>value</parameter>...</function>`.
///
/// Returns `None` when no complete `<function=...>...</function>` element is
/// present. Every parameter value is stored as a JSON string; a repeated key
/// keeps the last value. On success `counter` is incremented and the new
/// value is used for the `leaked_N` id.
fn parse_xml_tool_call(body: &str, counter: &mut u32) -> Option<ToolCall> {
    let (name, params, _) = parse_qwen_tag(body, "function")?;

    // Pull `<parameter=...>` elements off the front until none remain.
    let mut cursor = params;
    let arguments: serde_json::Map<String, serde_json::Value> = std::iter::from_fn(|| {
        let (key, value, tail) = parse_qwen_tag(cursor, "parameter")?;
        cursor = tail;
        Some((key.to_owned(), serde_json::Value::String(value.to_owned())))
    })
    .collect();

    *counter += 1;
    let id = format!("leaked_{}", *counter);

    Some(ToolCall {
        id,
        call_type: "function".to_string(),
        function: FunctionCall {
            name: name.to_owned(),
            arguments: serde_json::to_string(&arguments).unwrap_or_default(),
        },
    })
}

/// Parse the JSON tool call format (`{"name": "...", "arguments": {...}}`),
/// which some models emit instead of the XML form.
///
/// Returns `None` when `body` is not valid JSON or has no string `name`
/// field. On success `counter` is incremented and the new value is used for
/// the `leaked_N` id.
///
/// `arguments` is normalized so the resulting `FunctionCall::arguments`
/// always holds JSON text:
///   - missing or `null` becomes `{}`, matching what the XML path produces
///     for a call with no parameters;
///   - a string that itself parses as a JSON object is passed through
///     unchanged instead of being encoded a second time;
///   - anything else is serialized as-is.
fn parse_json_tool_call(body: &str, counter: &mut u32) -> Option<ToolCall> {
    let v: serde_json::Value = serde_json::from_str(body).ok()?;
    let name = v["name"].as_str()?;

    let arguments = match v.get("arguments") {
        None | Some(serde_json::Value::Null) => "{}".to_string(),
        Some(serde_json::Value::String(raw))
            if serde_json::from_str::<serde_json::Map<String, serde_json::Value>>(raw)
                .is_ok() =>
        {
            // Already JSON-encoded object text; re-serializing would wrap it
            // in another layer of quoting.
            raw.clone()
        }
        Some(other) => serde_json::to_string(other).unwrap_or_default(),
    };

    *counter += 1;
    Some(ToolCall {
        id: format!("leaked_{}", counter),
        call_type: "function".to_string(),
        function: FunctionCall {
            name: name.to_string(),
            arguments,
        },
    })
}

/// Strip tool call XML and thinking tokens from text so the conversation
/// history stays clean.
///
/// Removes every complete `<tool_call>...</tool_call>` block and every
/// `</think>` tag (the thinking content before the tag is kept — it's useful
/// context). An opening `<tool_call>` with no closing tag is left in place.
/// The result is trimmed of leading and trailing whitespace.
pub(super) fn strip_leaked_artifacts(text: &str) -> String {
    const OPEN: &str = "<tool_call>";
    const CLOSE: &str = "</tool_call>";

    let mut result = normalize_xml_tags(text);

    // Cut each block out in place rather than rebuilding the whole string.
    // The search restarts from the beginning each time, so a block formed by
    // joining the text on either side of a removed block is also removed.
    while let Some(start) = result.find(OPEN) {
        match result[start..].find(CLOSE) {
            Some(rel_end) => {
                let end = start + rel_end + CLOSE.len();
                result.replace_range(start..end, "");
            }
            None => break,
        }
    }

    // Remove </think> tags (but keep the thinking text before them)
    result.replace("</think>", "").trim().to_string()
}

#[cfg(test)]
mod tests {
    use super::*;

    // Well-formed Qwen XML preceded by leaked thinking text.
    #[test]
    fn test_leaked_tool_call_clean() {
        let text = "thinking\n</think>\n<tool_call>\n<function=bash>\n<parameter=command>poc-memory used core-personality</parameter>\n</function>\n</tool_call>";
        let calls = parse_leaked_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "bash");
        let args: serde_json::Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
        assert_eq!(args["command"], "poc-memory used core-personality");
    }

    #[test]
    fn test_leaked_tool_call_streamed_whitespace() {
        // Streaming tokenizer splits XML tags across tokens with newlines
        let text = "<tool_call>\n<\nfunction\n=\nbash\n>\n<\nparameter\n=\ncommand\n>pwd</\nparameter\n>\n</\nfunction\n>\n</tool_call>";
        let calls = parse_leaked_tool_calls(text);
        assert_eq!(calls.len(), 1, "should parse streamed format");
        assert_eq!(calls[0].function.name, "bash");
        let args: serde_json::Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
        assert_eq!(args["command"], "pwd");
    }

    // The JSON wire format is the fallback when the body is not Qwen XML.
    #[test]
    fn test_leaked_tool_call_json_body() {
        let text = "<tool_call>\n{\"name\": \"bash\", \"arguments\": {\"command\": \"ls\"}}\n</tool_call>";
        let calls = parse_leaked_tool_calls(text);
        assert_eq!(calls.len(), 1, "should parse JSON format");
        assert_eq!(calls[0].function.name, "bash");
        let args: serde_json::Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
        assert_eq!(args["command"], "ls");
    }

    // Ids are assigned sequentially within a single response.
    #[test]
    fn test_leaked_tool_call_multiple_ids() {
        let text = "<tool_call><function=bash><parameter=command>pwd</parameter></function></tool_call>\nthen\n<tool_call><function=read><parameter=path>a.txt</parameter></function></tool_call>";
        let calls = parse_leaked_tool_calls(text);
        assert_eq!(calls.len(), 2);
        assert_eq!(calls[0].id, "leaked_1");
        assert_eq!(calls[0].function.name, "bash");
        assert_eq!(calls[1].id, "leaked_2");
        assert_eq!(calls[1].function.name, "read");
    }

    // Tool call blocks and </think> tags are removed; other text is kept.
    #[test]
    fn test_strip_leaked_artifacts() {
        let text = "thinking\n</think>\n<tool_call>\n<function=bash>\n</function>\n</tool_call>\ndone";
        assert_eq!(strip_leaked_artifacts(text), "thinking\n\n\ndone");
    }

    #[test]
    fn test_normalize_preserves_content() {
        let text = "<function=bash>\n<parameter=command>echo hello world</parameter>\n</function>";
        let normalized = normalize_xml_tags(text);
        // Newlines between tags are not inside tags, so preserved
        assert_eq!(normalized, "<function=bash>\n<parameter=command>echo hello world</parameter>\n</function>");
    }

    #[test]
    fn test_normalize_strips_tag_internal_whitespace() {
        let text = "<\nfunction\n=\nbash\n>";
        let normalized = normalize_xml_tags(text);
        assert_eq!(normalized, "<function=bash>");
    }
}
merge poc-agent into poc-memory as agent/ module Eliminates the circular dependency between poc-agent and poc-memory by moving all poc-agent source into poc-memory/src/agent/. The poc-agent binary now builds from poc-memory/src/bin/poc-agent.rs using library imports. All poc_agent:: references updated to crate::agent::. poc-agent/ directory kept for now (removed from workspace members). Co-Authored-By: Proof of Concept <poc@bcachefs.org> 2026-03-25 00:52:41 -04:00			`// parsing.rs — Tool call parsing for leaked/streamed XML`
			`//`
			`// When models stream tool calls as XML text (Qwen-style <tool_call>`
			`// blocks) rather than structured tool_calls, this module extracts`
			`// them from the response text.`
			`//`
			`// Handles two wire formats:`
			`// - Qwen XML: <function=name><parameter=key>value</parameter></function>`
			`// - JSON: {"name": "...", "arguments": {...}}`
			`//`
			`// Also handles streaming artifacts: whitespace inside XML tags from`
			`// token boundaries, </think> tags, etc.`

api: move wire types and parsing to api module Move FunctionCall, FunctionCallDelta, ToolCall, ToolCallDelta from tools/mod.rs to api/types.rs — these are API wire format, not tool definitions. Re-export from tools for existing callers. Move parsing.rs to api/parsing.rs — leaked tool call parsing is API plumbing. Co-Authored-By: Proof of Concept <poc@bcachefs.org> 2026-04-04 17:58:49 -04:00			`use super::types::{ToolCall, FunctionCall};`
merge poc-agent into poc-memory as agent/ module Eliminates the circular dependency between poc-agent and poc-memory by moving all poc-agent source into poc-memory/src/agent/. The poc-agent binary now builds from poc-memory/src/bin/poc-agent.rs using library imports. All poc_agent:: references updated to crate::agent::. poc-agent/ directory kept for now (removed from workspace members). Co-Authored-By: Proof of Concept <poc@bcachefs.org> 2026-03-25 00:52:41 -04:00
			`/// Parse leaked tool calls from response text.`
			/// Looks for `<tool_call>...</tool_call>` blocks and tries both
			`/// XML and JSON formats for the body.`
fire XML tool calls as they arrive during streaming When </tool_call> is detected in the content stream, parse and dispatch immediately via FuturesOrdered. Tool calls execute concurrently while the stream continues. Results collected in order after the stream ends. Structured API path (ToolCallDelta) unchanged — still uses post-stream parallel dispatch. Co-Developed-By: Kent Overstreet <kent.overstreet@linux.dev> 2026-04-03 22:14:35 -04:00			/// Parse a single tool call body (content between `<tool_call>` and `</tool_call>`).
Reduce pub visibility in agent::api and user modules api/: parsing module private, SamplingParams/StreamEvent/StreamResult/ AbortOnDrop/build_response_message/collect_stream to pub(crate). Internal types (ChatRequest, ChunkChoice, Delta, etc.) to pub(crate). StreamResult fields to pub(crate). Parsing functions to pub(super). user/: context, subconscious, unconscious, thalamus modules private (only chat needs pub(crate) for mind/ access). Co-Authored-By: Proof of Concept <poc@bcachefs.org> 2026-04-07 13:43:25 -04:00			`pub(super) fn parse_tool_call_body(body: &str) -> Option<ToolCall> {`
fire XML tool calls as they arrive during streaming When </tool_call> is detected in the content stream, parse and dispatch immediately via FuturesOrdered. Tool calls execute concurrently while the stream continues. Results collected in order after the stream ends. Structured API path (ToolCallDelta) unchanged — still uses post-stream parallel dispatch. Co-Developed-By: Kent Overstreet <kent.overstreet@linux.dev> 2026-04-03 22:14:35 -04:00			`let normalized = normalize_xml_tags(body);`
			`let body = normalized.trim();`
			`let mut counter = 0u32;`
			`parse_xml_tool_call(body, &mut counter)`
			`.or_else(\|\| parse_json_tool_call(body, &mut counter))`
			`}`

Reduce pub visibility in agent::api and user modules api/: parsing module private, SamplingParams/StreamEvent/StreamResult/ AbortOnDrop/build_response_message/collect_stream to pub(crate). Internal types (ChatRequest, ChunkChoice, Delta, etc.) to pub(crate). StreamResult fields to pub(crate). Parsing functions to pub(super). user/: context, subconscious, unconscious, thalamus modules private (only chat needs pub(crate) for mind/ access). Co-Authored-By: Proof of Concept <poc@bcachefs.org> 2026-04-07 13:43:25 -04:00			`pub(super) fn parse_leaked_tool_calls(text: &str) -> Vec<ToolCall> {`
merge poc-agent into poc-memory as agent/ module Eliminates the circular dependency between poc-agent and poc-memory by moving all poc-agent source into poc-memory/src/agent/. The poc-agent binary now builds from poc-memory/src/bin/poc-agent.rs using library imports. All poc_agent:: references updated to crate::agent::. poc-agent/ directory kept for now (removed from workspace members). Co-Authored-By: Proof of Concept <poc@bcachefs.org> 2026-03-25 00:52:41 -04:00			`// Normalize whitespace inside XML tags: "<\nfunction\n=\nbash\n>" → "<function=bash>"`
			`// This handles streaming tokenizers that split tags across tokens.`
			`let normalized = normalize_xml_tags(text);`
			`let text = &normalized;`

			`let mut calls = Vec::new();`
			`let mut search_from = 0;`
			`let mut call_counter: u32 = 0;`

			`while let Some(start) = text[search_from..].find("<tool_call>") {`
			`let abs_start = search_from + start;`
			`let after_tag = abs_start + "<tool_call>".len();`

			`let end = match text[after_tag..].find("</tool_call>") {`
			`Some(pos) => after_tag + pos,`
			`None => break,`
			`};`

			`let body = text[after_tag..end].trim();`
			`search_from = end + "</tool_call>".len();`

			`// Try XML format first, then JSON`
			`if let Some(call) = parse_xml_tool_call(body, &mut call_counter) {`
			`calls.push(call);`
			`} else if let Some(call) = parse_json_tool_call(body, &mut call_counter) {`
			`calls.push(call);`
			`}`
			`}`

			`calls`
			`}`

			`/// Normalize whitespace inside XML-like tags for streaming tokenizers.`
			/// Collapses whitespace between `<` and `>` so that `<\nfunction\n=\nbash\n>`
			/// becomes `<function=bash>`, and `</\nparameter\n>` becomes `</parameter>`.
			`/// Leaves content between tags untouched.`
			`fn normalize_xml_tags(text: &str) -> String {`
			`let mut result = String::with_capacity(text.len());`
			`let mut chars = text.chars().peekable();`
			`while let Some(ch) = chars.next() {`
			`if ch == '<' {`
			`let mut tag = String::from('<');`
			`for inner in chars.by_ref() {`
			`if inner == '>' {`
			`tag.push('>');`
			`break;`
			`} else if inner.is_whitespace() {`
			`// Skip whitespace inside tags`
			`} else {`
			`tag.push(inner);`
			`}`
			`}`
			`result.push_str(&tag);`
			`} else {`
			`result.push(ch);`
			`}`
			`}`
			`result`
			`}`

			/// Parse a Qwen-style `<tag=value>body</tag>` pseudo-XML element.
			/// Returns `(value, body, rest)` on success.
			`fn parse_qwen_tag<'a>(s: &'a str, tag: &str) -> Option<(&'a str, &'a str, &'a str)> {`
			`let open = format!("<{}=", tag);`
			`let close = format!("</{}>", tag);`

			`let start = s.find(&open)? + open.len();`
			`let name_end = start + s[start..].find('>')?;`
			`let body_start = name_end + 1;`
			`let body_end = body_start + s[body_start..].find(&close)?;`

			`Some((`
			`s[start..name_end].trim(),`
			`s[body_start..body_end].trim(),`
			`&s[body_end + close.len()..],`
			`))`
			`}`

			`/// Parse Qwen's XML tool call format.`
			`fn parse_xml_tool_call(body: &str, counter: &mut u32) -> Option<ToolCall> {`
			`let (func_name, func_body, _) = parse_qwen_tag(body, "function")?;`
			`let func_name = func_name.to_string();`

			`let mut args = serde_json::Map::new();`
			`let mut rest = func_body;`
			`while let Some((key, val, remainder)) = parse_qwen_tag(rest, "parameter") {`
			`args.insert(key.to_string(), serde_json::Value::String(val.to_string()));`
			`rest = remainder;`
			`}`

			`*counter += 1;`
			`Some(ToolCall {`
			`id: format!("leaked_{}", counter),`
			`call_type: "function".to_string(),`
			`function: FunctionCall {`
			`name: func_name,`
			`arguments: serde_json::to_string(&args).unwrap_or_default(),`
			`},`
			`})`
			`}`

			`/// Parse JSON tool call format (some models emit this).`
			`fn parse_json_tool_call(body: &str, counter: &mut u32) -> Option<ToolCall> {`
			`let v: serde_json::Value = serde_json::from_str(body).ok()?;`
			`let name = v["name"].as_str()?;`
			`let arguments = &v["arguments"];`

			`*counter += 1;`
			`Some(ToolCall {`
			`id: format!("leaked_{}", counter),`
			`call_type: "function".to_string(),`
			`function: FunctionCall {`
			`name: name.to_string(),`
			`arguments: serde_json::to_string(arguments).unwrap_or_default(),`
			`},`
			`})`
			`}`

			`/// Strip tool call XML and thinking tokens from text so the conversation`
			/// history stays clean. Removes `<tool_call>...</tool_call>` blocks and
			/// `</think>` tags (thinking content before them is kept — it's useful context).
Reduce pub visibility in agent::api and user modules api/: parsing module private, SamplingParams/StreamEvent/StreamResult/ AbortOnDrop/build_response_message/collect_stream to pub(crate). Internal types (ChatRequest, ChunkChoice, Delta, etc.) to pub(crate). StreamResult fields to pub(crate). Parsing functions to pub(super). user/: context, subconscious, unconscious, thalamus modules private (only chat needs pub(crate) for mind/ access). Co-Authored-By: Proof of Concept <poc@bcachefs.org> 2026-04-07 13:43:25 -04:00			`pub(super) fn strip_leaked_artifacts(text: &str) -> String {`
merge poc-agent into poc-memory as agent/ module Eliminates the circular dependency between poc-agent and poc-memory by moving all poc-agent source into poc-memory/src/agent/. The poc-agent binary now builds from poc-memory/src/bin/poc-agent.rs using library imports. All poc_agent:: references updated to crate::agent::. poc-agent/ directory kept for now (removed from workspace members). Co-Authored-By: Proof of Concept <poc@bcachefs.org> 2026-03-25 00:52:41 -04:00			`let normalized = normalize_xml_tags(text);`
			`let mut result = normalized.clone();`

			`// Remove <tool_call>...</tool_call> blocks`
			`while let Some(start) = result.find("<tool_call>") {`
			`if let Some(end_pos) = result[start..].find("</tool_call>") {`
			`let end = start + end_pos + "</tool_call>".len();`
			`result = format!("{}{}", &result[..start], &result[end..]);`
			`} else {`
			`break;`
			`}`
			`}`

			`// Remove </think> tags (but keep the thinking text before them)`
			`result = result.replace("</think>", "");`

			`result.trim().to_string()`
			`}`

			`#[cfg(test)]`
			`mod tests {`
			`use super::*;`

			`#[test]`
			`fn test_leaked_tool_call_clean() {`
			`let text = "thinking\n</think>\n<tool_call>\n<function=bash>\n<parameter=command>poc-memory used core-personality</parameter>\n</function>\n</tool_call>";`
			`let calls = parse_leaked_tool_calls(text);`
			`assert_eq!(calls.len(), 1);`
			`assert_eq!(calls[0].function.name, "bash");`
			`let args: serde_json::Value = serde_json::from_str(&calls[0].function.arguments).unwrap();`
			`assert_eq!(args["command"], "poc-memory used core-personality");`
			`}`

			`#[test]`
			`fn test_leaked_tool_call_streamed_whitespace() {`
			`// Streaming tokenizer splits XML tags across tokens with newlines`
			`let text = "<tool_call>\n<\nfunction\n=\nbash\n>\n<\nparameter\n=\ncommand\n>pwd</\nparameter\n>\n</\nfunction\n>\n</tool_call>";`
			`let calls = parse_leaked_tool_calls(text);`
			`assert_eq!(calls.len(), 1, "should parse streamed format");`
			`assert_eq!(calls[0].function.name, "bash");`
			`let args: serde_json::Value = serde_json::from_str(&calls[0].function.arguments).unwrap();`
			`assert_eq!(args["command"], "pwd");`
			`}`

			`#[test]`
			`fn test_normalize_preserves_content() {`
			`let text = "<function=bash>\n<parameter=command>echo hello world</parameter>\n</function>";`
			`let normalized = normalize_xml_tags(text);`
			`// Newlines between tags are not inside tags, so preserved`
			`assert_eq!(normalized, "<function=bash>\n<parameter=command>echo hello world</parameter>\n</function>");`
			`}`

			`#[test]`
			`fn test_normalize_strips_tag_internal_whitespace() {`
			`let text = "<\nfunction\n=\nbash\n>";`
			`let normalized = normalize_xml_tags(text);`
			`assert_eq!(normalized, "<function=bash>");`
			`}`
			`}`