// context.rs — Context window as an AST // // The context window is a tree of AstNodes. Each node is either a leaf // (typed content with cached token IDs) or a branch (role + children). // The full prompt is a depth-first traversal of the sections in ContextState. // Streaming responses are parsed into new nodes by the ResponseParser. // // Grammar (EBNF): // // context = section* ; // section = (message | leaf)* ; // message = IM_START role "\n" element* IM_END "\n" ; // role = "system" | "user" | "assistant" ; // element = thinking | tool_call | content ; // thinking = "" TEXT "" ; // tool_call = "\n" tool_xml "\n" ; // tool_xml = "\n" param* "" ; // param = "\n" VALUE "\n\n" ; // content = TEXT ; // // Self-wrapping leaves (not inside a message branch): // dmn = IM_START "dmn\n" TEXT IM_END "\n" ; // memory = IM_START "memory\n" TEXT IM_END "\n" ; // tool_result = IM_START "tool\n" TEXT IM_END "\n" ; // // Non-visible leaves (not in prompt): // log = TEXT ; // // Role is only for branch (interior) nodes. Leaf type is determined by // the NodeBody variant. Grammar constraints enforced by construction. use chrono::{DateTime, Utc}; use super::tokenizer; // --------------------------------------------------------------------------- // Types // --------------------------------------------------------------------------- /// Branch roles — maps directly to the grammar's message roles. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Role { System, User, Assistant, } /// Leaf content — each variant knows how to render itself. #[derive(Debug, Clone)] pub enum NodeBody { // Children of message branches — rendered without im_start/im_end Content(String), Thinking(String), ToolCall { name: String, arguments: String }, // Self-wrapping leaves — render their own im_start/im_end ToolResult(String), Memory { key: String, text: String, score: Option }, Dmn(String), // Non-visible (0 tokens in prompt) Log(String), } /// A leaf node: typed content with cached token IDs. #[derive(Debug, Clone)] pub struct NodeLeaf { body: NodeBody, token_ids: Vec, timestamp: Option>, } /// A node in the context AST. #[derive(Debug, Clone)] pub enum AstNode { Leaf(NodeLeaf), Branch { role: Role, children: Vec }, } /// The context window: four sections as Vec. /// All mutation goes through ContextState methods to maintain the invariant /// that token_ids on every leaf matches its rendered text. pub struct ContextState { system: Vec, identity: Vec, journal: Vec, conversation: Vec, } /// Identifies a section for mutation methods. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum Section { System, Identity, Journal, Conversation, } pub trait Ast { fn render(&self) -> String; fn token_ids(&self) -> Vec; fn tokens(&self) -> usize; } /// State machine for parsing a streaming assistant response into an AstNode. /// Feed text chunks as they arrive; completed tool calls are returned for /// immediate dispatch. pub struct ResponseParser { buf: String, content_parts: Vec, children: Vec, in_think: bool, think_buf: String, in_tool_call: bool, tool_call_buf: String, } impl Role { pub fn as_str(&self) -> &'static str { match self { Self::System => "system", Self::User => "user", Self::Assistant => "assistant", } } } impl NodeBody { /// Render this leaf body to text for the prompt. fn render(&self) -> String { match self { Self::Content(text) => text.clone(), Self::Thinking(_) => String::new(), Self::Log(_) => String::new(), Self::ToolCall { name, arguments } => { let xml = format_tool_call_xml(name, arguments); format!("\n{}\n\n", xml) } Self::ToolResult(text) => format!("<|im_start|>tool\n{}<|im_end|>\n", text), Self::Memory { text, .. } => format!("<|im_start|>memory\n{}<|im_end|>\n", text), Self::Dmn(text) => format!("<|im_start|>dmn\n{}<|im_end|>\n", text), } } /// Whether this leaf contributes tokens to the prompt. fn is_prompt_visible(&self) -> bool { !matches!(self, Self::Thinking(_) | Self::Log(_)) } /// The text content of this leaf (for display, not rendering). pub fn text(&self) -> &str { match self { Self::Content(t) | Self::Thinking(t) | Self::Log(t) | Self::ToolResult(t) | Self::Dmn(t) => t, Self::ToolCall { name, .. } => name, Self::Memory { text, .. } => text, } } } impl NodeLeaf { fn new(body: NodeBody) -> Self { let token_ids = if body.is_prompt_visible() { tokenizer::encode(&body.render()) } else { vec![] }; Self { body, token_ids, timestamp: None } } pub fn with_timestamp(mut self, ts: DateTime) -> Self { self.timestamp = Some(ts); self } pub fn body(&self) -> &NodeBody { &self.body } pub fn token_ids(&self) -> &[u32] { &self.token_ids } pub fn tokens(&self) -> usize { self.token_ids.len() } pub fn timestamp(&self) -> Option> { self.timestamp } } impl AstNode { // -- Leaf constructors ---------------------------------------------------- pub fn content(text: impl Into) -> Self { Self::Leaf(NodeLeaf::new(NodeBody::Content(text.into()))) } pub fn thinking(text: impl Into) -> Self { Self::Leaf(NodeLeaf::new(NodeBody::Thinking(text.into()))) } pub fn tool_call(name: impl Into, arguments: impl Into) -> Self { Self::Leaf(NodeLeaf::new(NodeBody::ToolCall { name: name.into(), arguments: arguments.into(), })) } pub fn tool_result(text: impl Into) -> Self { Self::Leaf(NodeLeaf::new(NodeBody::ToolResult(text.into()))) } pub fn memory(key: impl Into, text: impl Into) -> Self { Self::Leaf(NodeLeaf::new(NodeBody::Memory { key: key.into(), text: text.into(), score: None, })) } pub fn dmn(text: impl Into) -> Self { Self::Leaf(NodeLeaf::new(NodeBody::Dmn(text.into()))) } pub fn log(text: impl Into) -> Self { Self::Leaf(NodeLeaf::new(NodeBody::Log(text.into()))) } // -- Branch constructors -------------------------------------------------- pub fn branch(role: Role, children: Vec) -> Self { Self::Branch { role, children } } pub fn system_msg(text: impl Into) -> Self { Self::Branch { role: Role::System, children: vec![Self::content(text)], } } pub fn user_msg(text: impl Into) -> Self { Self::Branch { role: Role::User, children: vec![Self::content(text)], } } // -- Builder -------------------------------------------------------------- pub fn with_timestamp(mut self, ts: DateTime) -> Self { match &mut self { Self::Leaf(leaf) => leaf.timestamp = Some(ts), Self::Branch { .. } => {} } self } pub fn children(&self) -> &[AstNode] { match self { Self::Branch { children, .. } => children, Self::Leaf(_) => &[], } } pub fn leaf(&self) -> Option<&NodeLeaf> { match self { Self::Leaf(l) => Some(l), _ => None, } } /// Short label for the UI. pub fn label(&self) -> String { let cfg = crate::config::get(); match self { Self::Branch { role, children } => { let preview = children.first() .and_then(|c| c.leaf()) .map(|l| truncate_preview(l.body.text(), 60)) .unwrap_or_default(); match role { Role::System => "system".into(), Role::User => format!("{}: {}", cfg.user_name, preview), Role::Assistant => format!("{}: {}", cfg.assistant_name, preview), } } Self::Leaf(leaf) => match &leaf.body { NodeBody::Content(t) => truncate_preview(t, 60), NodeBody::Thinking(t) => format!("thinking: {}", truncate_preview(t, 60)), NodeBody::ToolCall { name, .. } => format!("tool_call: {}", name), NodeBody::ToolResult(_) => "tool_result".into(), NodeBody::Memory { key, score, .. } => match score { Some(s) => format!("mem: {} score:{:.1}", key, s), None => format!("mem: {}", key), }, NodeBody::Dmn(_) => "dmn".into(), NodeBody::Log(t) => format!("log: {}", truncate_preview(t, 60)), }, } } } impl Ast for AstNode { fn render(&self) -> String { match self { Self::Leaf(leaf) => leaf.body.render(), Self::Branch { role, children } => render_branch(*role, children), } } fn token_ids(&self) -> Vec { match self { Self::Leaf(leaf) => leaf.token_ids.clone(), Self::Branch { role, children } => tokenizer::encode(&render_branch(*role, children)), } } fn tokens(&self) -> usize { match self { Self::Leaf(leaf) => leaf.tokens(), Self::Branch { children, .. } => children.iter().map(|c| c.tokens()).sum(), } } } fn truncate_preview(s: &str, max: usize) -> String { let preview: String = s.chars().take(max).collect(); let preview = preview.replace('\n', " "); if s.len() > max { format!("{}...", preview) } else { preview } } fn render_branch(role: Role, children: &[AstNode]) -> String { let mut s = format!("<|im_start|>{}\n", role.as_str()); for child in children { s.push_str(&child.render()); } s.push_str("<|im_end|>\n"); s } fn format_tool_call_xml(name: &str, args_json: &str) -> String { let args: serde_json::Value = serde_json::from_str(args_json) .unwrap_or(serde_json::Value::Object(Default::default())); let mut xml = format!("\n", name); if let Some(obj) = args.as_object() { for (key, value) in obj { let val_str = match value { serde_json::Value::String(s) => s.clone(), other => other.to_string(), }; xml.push_str(&format!("\n{}\n\n", key, val_str)); } } xml.push_str(""); xml } fn normalize_xml_tags(text: &str) -> String { let mut result = String::with_capacity(text.len()); let mut chars = text.chars().peekable(); while let Some(ch) = chars.next() { if ch == '<' { let mut tag = String::from('<'); for inner in chars.by_ref() { if inner == '>' { tag.push('>'); break; } else if inner.is_whitespace() { // Skip whitespace inside tags } else { tag.push(inner); } } result.push_str(&tag); } else { result.push(ch); } } result } fn parse_qwen_tag<'a>(s: &'a str, tag: &str) -> Option<(&'a str, &'a str, &'a str)> { let open = format!("<{}=", tag); let close = format!("", tag); let start = s.find(&open)? + open.len(); let name_end = start + s[start..].find('>')?; let body_start = name_end + 1; let body_end = body_start + s[body_start..].find(&close)?; Some(( s[start..name_end].trim(), s[body_start..body_end].trim(), &s[body_end + close.len()..], )) } fn parse_tool_call_body(body: &str) -> Option<(String, String)> { let normalized = normalize_xml_tags(body); let body = normalized.trim(); parse_xml_tool_call(body) .or_else(|| parse_json_tool_call(body)) } fn parse_xml_tool_call(body: &str) -> Option<(String, String)> { let (func_name, func_body, _) = parse_qwen_tag(body, "function")?; let mut args = serde_json::Map::new(); let mut rest = func_body; while let Some((key, val, remainder)) = parse_qwen_tag(rest, "parameter") { args.insert(key.to_string(), serde_json::Value::String(val.to_string())); rest = remainder; } Some((func_name.to_string(), serde_json::to_string(&args).unwrap_or_default())) } fn parse_json_tool_call(body: &str) -> Option<(String, String)> { let v: serde_json::Value = serde_json::from_str(body).ok()?; let name = v["name"].as_str()?; let arguments = &v["arguments"]; Some((name.to_string(), serde_json::to_string(arguments).unwrap_or_default())) } impl ResponseParser { pub fn new() -> Self { Self { buf: String::new(), content_parts: Vec::new(), children: Vec::new(), in_think: false, think_buf: String::new(), in_tool_call: false, tool_call_buf: String::new(), } } /// Feed a text chunk. Returns newly completed tool call nodes /// (for immediate dispatch). pub fn feed(&mut self, text: &str) -> Vec { let mut new_calls = vec![]; self.buf.push_str(text); loop { if self.in_think { match self.buf.find("") { Some(end) => { self.think_buf.push_str(&self.buf[..end]); self.buf = self.buf[end + 8..].to_string(); self.in_think = false; self.children.push(AstNode::thinking(&self.think_buf)); self.think_buf.clear(); continue; } None => { // Keep last 8 chars ("".len()) as lookahead let safe = self.buf.len().saturating_sub(8); if safe > 0 { let safe = self.buf.floor_char_boundary(safe); self.think_buf.push_str(&self.buf[..safe]); self.buf = self.buf[safe..].to_string(); } break; } } } if self.in_tool_call { match self.buf.find("") { Some(end) => { self.tool_call_buf.push_str(&self.buf[..end]); self.buf = self.buf[end + 12..].to_string(); self.in_tool_call = false; if let Some((name, args)) = parse_tool_call_body(&self.tool_call_buf) { let node = AstNode::tool_call(name, args); new_calls.push(node.clone()); self.flush_content(); self.children.push(node); } self.tool_call_buf.clear(); continue; } None => { // Keep last 12 chars ("".len()) as lookahead let safe = self.buf.len().saturating_sub(12); if safe > 0 { let safe = self.buf.floor_char_boundary(safe); self.tool_call_buf.push_str(&self.buf[..safe]); self.buf = self.buf[safe..].to_string(); } break; } } } let think_pos = self.buf.find(""); let tool_pos = self.buf.find(""); let next_tag = match (think_pos, tool_pos) { (Some(a), Some(b)) => Some(a.min(b)), (Some(a), None) => Some(a), (None, Some(b)) => Some(b), (None, None) => None, }; match next_tag { Some(pos) => { if pos > 0 { self.content_parts.push(self.buf[..pos].to_string()); } if self.buf[pos..].starts_with("") { self.buf = self.buf[pos + 7..].to_string(); self.flush_content(); self.in_think = true; } else { self.buf = self.buf[pos + 11..].to_string(); self.flush_content(); self.in_tool_call = true; } continue; } None => { let safe = self.buf.len().saturating_sub(11); if safe > 0 { let safe = self.buf.floor_char_boundary(safe); self.content_parts.push(self.buf[..safe].to_string()); self.buf = self.buf[safe..].to_string(); } break; } } } new_calls } fn flush_content(&mut self) { if !self.content_parts.is_empty() { let text: String = self.content_parts.drain(..).collect(); if !text.is_empty() { self.children.push(AstNode::content(text)); } } } /// Finalize the parse. Returns the completed assistant AstNode. pub fn finish(mut self) -> AstNode { if !self.buf.is_empty() { self.content_parts.push(std::mem::take(&mut self.buf)); } self.flush_content(); AstNode::branch(Role::Assistant, self.children) } /// Get the current display text (for streaming to UI). pub fn display_content(&self) -> String { self.content_parts.join("") } } impl ContextState { pub fn new() -> Self { Self { system: Vec::new(), identity: Vec::new(), journal: Vec::new(), conversation: Vec::new(), } } // -- Read access ---------------------------------------------------------- pub fn system(&self) -> &[AstNode] { &self.system } pub fn identity(&self) -> &[AstNode] { &self.identity } pub fn journal(&self) -> &[AstNode] { &self.journal } pub fn conversation(&self) -> &[AstNode] { &self.conversation } fn sections(&self) -> [&Vec; 4] { [&self.system, &self.identity, &self.journal, &self.conversation] } } impl Ast for ContextState { fn render(&self) -> String { let mut s = String::new(); for section in self.sections() { for node in section { s.push_str(&node.render()); } } s } fn token_ids(&self) -> Vec { let mut ids = Vec::new(); for section in self.sections() { for node in section { ids.extend(node.token_ids()); } } ids } fn tokens(&self) -> usize { self.sections().iter() .flat_map(|s| s.iter()) .map(|n| n.tokens()) .sum() } } impl ContextState { fn section_mut(&mut self, section: Section) -> &mut Vec { match section { Section::System => &mut self.system, Section::Identity => &mut self.identity, Section::Journal => &mut self.journal, Section::Conversation => &mut self.conversation, } } pub fn push(&mut self, section: Section, node: AstNode) { self.section_mut(section).push(node); } /// Replace the body of a leaf at `index` in `section`. /// Re-tokenizes to maintain the invariant. pub fn set_message(&mut self, section: Section, index: usize, body: NodeBody) { let nodes = self.section_mut(section); let node = &mut nodes[index]; match node { AstNode::Leaf(leaf) => { let token_ids = if body.is_prompt_visible() { tokenizer::encode(&body.render()) } else { vec![] }; leaf.body = body; leaf.token_ids = token_ids; } AstNode::Branch { .. } => panic!("set_message on branch node"), } } /// Set the memory score on a Memory leaf at `index` in `section`. pub fn set_score(&mut self, section: Section, index: usize, score: Option) { let node = &mut self.section_mut(section)[index]; match node { AstNode::Leaf(leaf) => match &mut leaf.body { NodeBody::Memory { score: s, .. } => *s = score, _ => panic!("set_score on non-memory node"), }, _ => panic!("set_score on branch node"), } } /// Remove a node at `index` from `section`. pub fn del(&mut self, section: Section, index: usize) -> AstNode { self.section_mut(section).remove(index) } } pub fn context_window() -> usize { crate::config::get().api_context_window } pub fn context_budget_tokens() -> usize { context_window() * 80 / 100 } pub fn is_context_overflow(err: &anyhow::Error) -> bool { let msg = err.to_string().to_lowercase(); msg.contains("context length") || msg.contains("token limit") || msg.contains("too many tokens") || msg.contains("maximum context") || msg.contains("prompt is too long") || msg.contains("request too large") || msg.contains("input validation error") || msg.contains("content length limit") || (msg.contains("400") && msg.contains("tokens")) } pub fn is_stream_error(err: &anyhow::Error) -> bool { err.to_string().contains("model stream error") } #[cfg(test)] mod tests { use super::*; // -- Helpers for inspecting parse results ---------------------------------- /// Extract child bodies from an Assistant branch node. fn child_bodies(node: &AstNode) -> Vec<&NodeBody> { match node { AstNode::Branch { children, .. } => children.iter().filter_map(|c| c.leaf()).map(|l| l.body()).collect(), _ => panic!("expected branch"), } } fn assert_content(body: &NodeBody, expected: &str) { match body { NodeBody::Content(t) => assert_eq!(t, expected), other => panic!("expected Content, got {:?}", other), } } fn assert_thinking(body: &NodeBody, expected: &str) { match body { NodeBody::Thinking(t) => assert_eq!(t, expected), other => panic!("expected Thinking, got {:?}", other), } } fn assert_tool_call<'a>(body: &'a NodeBody, expected_name: &str) -> &'a str { match body { NodeBody::ToolCall { name, arguments } => { assert_eq!(name, expected_name); arguments } other => panic!("expected ToolCall, got {:?}", other), } } // -- XML parsing tests ---------------------------------------------------- #[test] fn test_tool_call_xml_parse_clean() { let body = "\npoc-memory used core-personality\n"; let (name, args) = parse_tool_call_body(body).unwrap(); assert_eq!(name, "bash"); let args: serde_json::Value = serde_json::from_str(&args).unwrap(); assert_eq!(args["command"], "poc-memory used core-personality"); } #[test] fn test_tool_call_xml_parse_streamed_whitespace() { let body = "<\nfunction\n=\nbash\n>\n<\nparameter\n=\ncommand\n>pwd\n"; let (name, args) = parse_tool_call_body(body).unwrap(); assert_eq!(name, "bash"); let args: serde_json::Value = serde_json::from_str(&args).unwrap(); assert_eq!(args["command"], "pwd"); } #[test] fn test_tool_call_json_parse() { let body = r#"{"name": "bash", "arguments": {"command": "ls"}}"#; let (name, args) = parse_tool_call_body(body).unwrap(); assert_eq!(name, "bash"); let args: serde_json::Value = serde_json::from_str(&args).unwrap(); assert_eq!(args["command"], "ls"); } #[test] fn test_normalize_preserves_content() { let text = "\necho hello world\n"; let normalized = normalize_xml_tags(text); assert_eq!(normalized, text); } #[test] fn test_normalize_strips_tag_internal_whitespace() { assert_eq!(normalize_xml_tags("<\nfunction\n=\nbash\n>"), ""); } // -- ResponseParser tests ------------------------------------------------- #[test] fn test_parser_plain_text() { let mut p = ResponseParser::new(); p.feed("hello world"); let node = p.finish(); let bodies = child_bodies(&node); assert_eq!(bodies.len(), 1); assert_content(bodies[0], "hello world"); } #[test] fn test_parser_thinking_then_content() { let mut p = ResponseParser::new(); p.feed("reasoninganswer"); let node = p.finish(); let bodies = child_bodies(&node); assert_eq!(bodies.len(), 2); assert_thinking(bodies[0], "reasoning"); assert_content(bodies[1], "answer"); } #[test] fn test_parser_tool_call() { let mut p = ResponseParser::new(); let calls = p.feed("\n\nls\n\n"); assert_eq!(calls.len(), 1); // returned for immediate dispatch let node = p.finish(); let bodies = child_bodies(&node); assert_eq!(bodies.len(), 1); let args = assert_tool_call(bodies[0], "bash"); let args: serde_json::Value = serde_json::from_str(args).unwrap(); assert_eq!(args["command"], "ls"); } #[test] fn test_parser_content_then_tool_call_then_content() { let mut p = ResponseParser::new(); p.feed("before"); p.feed("\n\npwd\n\n"); p.feed("after"); let node = p.finish(); let bodies = child_bodies(&node); assert_eq!(bodies.len(), 3); assert_content(bodies[0], "before"); assert_tool_call(bodies[1], "bash"); assert_content(bodies[2], "after"); } #[test] fn test_parser_incremental_feed() { // Feed the response one character at a time let text = "thoughtresponse"; let mut p = ResponseParser::new(); for ch in text.chars() { p.feed(&ch.to_string()); } let node = p.finish(); let bodies = child_bodies(&node); assert_eq!(bodies.len(), 2); assert_thinking(bodies[0], "thought"); assert_content(bodies[1], "response"); } #[test] fn test_parser_incremental_tool_call() { let text = "text\n\nls\n\nmore"; let mut p = ResponseParser::new(); let mut total_calls = 0; for ch in text.chars() { total_calls += p.feed(&ch.to_string()).len(); } assert_eq!(total_calls, 1); // exactly one tool call dispatched let node = p.finish(); let bodies = child_bodies(&node); assert_eq!(bodies.len(), 3); assert_content(bodies[0], "text"); assert_tool_call(bodies[1], "bash"); assert_content(bodies[2], "more"); } #[test] fn test_parser_thinking_tool_call_content() { let mut p = ResponseParser::new(); p.feed("let me think"); p.feed("\n\n/etc/hosts\n\n"); p.feed("here's what I found"); let node = p.finish(); let bodies = child_bodies(&node); assert_eq!(bodies.len(), 3); assert_thinking(bodies[0], "let me think"); assert_tool_call(bodies[1], "read"); assert_content(bodies[2], "here's what I found"); } #[test] fn test_parser_finish_produces_assistant_branch() { let mut p = ResponseParser::new(); p.feed("hello"); let node = p.finish(); match &node { AstNode::Branch { role, .. } => assert_eq!(*role, Role::Assistant), _ => panic!("expected branch"), } } // -- Round-trip rendering tests ------------------------------------------- #[test] fn test_render_system_msg() { let node = AstNode::system_msg("you are helpful"); assert_eq!(node.render(), "<|im_start|>system\nyou are helpful<|im_end|>\n"); } #[test] fn test_render_user_msg() { let node = AstNode::user_msg("hello"); assert_eq!(node.render(), "<|im_start|>user\nhello<|im_end|>\n"); } #[test] fn test_render_assistant_with_thinking_and_content() { let node = AstNode::branch(Role::Assistant, vec![ AstNode::thinking("hmm"), AstNode::content("answer"), ]); // Thinking renders as empty, content renders as-is assert_eq!(node.render(), "<|im_start|>assistant\nanswer<|im_end|>\n"); } #[test] fn test_render_tool_result() { let node = AstNode::tool_result("output here"); assert_eq!(node.render(), "<|im_start|>tool\noutput here<|im_end|>\n"); } #[test] fn test_render_memory() { let node = AstNode::memory("identity", "I am Proof of Concept"); assert_eq!(node.render(), "<|im_start|>memory\nI am Proof of Concept<|im_end|>\n"); } #[test] fn test_render_dmn() { let node = AstNode::dmn("subconscious prompt"); assert_eq!(node.render(), "<|im_start|>dmn\nsubconscious prompt<|im_end|>\n"); } #[test] fn test_render_tool_call() { let node = AstNode::tool_call("bash", r#"{"command":"ls"}"#); let rendered = node.render(); assert!(rendered.contains("")); assert!(rendered.contains("")); assert!(rendered.contains("")); assert!(rendered.contains("ls")); assert!(rendered.contains("")); } // -- Tokenizer round-trip tests ------------------------------------------- // These require the tokenizer file; skipped if not present. fn init_tokenizer() -> bool { let path = format!("{}/.consciousness/tokenizer-qwen35.json", std::env::var("HOME").unwrap_or_default()); if std::path::Path::new(&path).exists() { tokenizer::init(&path); true } else { false } } /// token_ids() must equal encode(render()) for all node types fn assert_token_roundtrip(node: &AstNode) { let rendered = node.render(); let expected = tokenizer::encode(&rendered); let actual = node.token_ids(); assert_eq!(actual, expected, "token_ids mismatch for rendered: {:?}", rendered); } #[test] fn test_tokenize_roundtrip_leaf_types() { if !init_tokenizer() { return; } assert_token_roundtrip(&AstNode::system_msg("you are a helpful assistant")); assert_token_roundtrip(&AstNode::user_msg("what is 2+2?")); assert_token_roundtrip(&AstNode::tool_result("4")); assert_token_roundtrip(&AstNode::memory("identity", "I am Proof of Concept")); assert_token_roundtrip(&AstNode::dmn("check the memory store")); assert_token_roundtrip(&AstNode::tool_call("bash", r#"{"command":"ls -la"}"#)); } #[test] fn test_tokenize_roundtrip_assistant_branch() { if !init_tokenizer() { return; } let node = AstNode::branch(Role::Assistant, vec![ AstNode::content("here's what I found:\n"), AstNode::tool_call("bash", r#"{"command":"pwd"}"#), AstNode::content("\nthat's the current directory"), ]); assert_token_roundtrip(&node); } #[test] fn test_tokenize_invisible_nodes_are_zero() { if !init_tokenizer() { return; } assert_eq!(AstNode::thinking("deep thoughts").tokens(), 0); assert_eq!(AstNode::log("debug info").tokens(), 0); } #[test] fn test_tokenize_decode_roundtrip() { if !init_tokenizer() { return; } // Content without special tokens round-trips through decode let text = "hello world, this is a test"; let ids = tokenizer::encode(text); let decoded = tokenizer::decode(&ids); assert_eq!(decoded, text); } #[test] fn test_tokenize_context_state_matches_concatenation() { if !init_tokenizer() { return; } let mut ctx = ContextState::new(); ctx.push(Section::System, AstNode::system_msg("you are helpful")); ctx.push(Section::Identity, AstNode::memory("name", "Proof of Concept")); ctx.push(Section::Conversation, AstNode::user_msg("hi")); let rendered = ctx.render(); let expected = tokenizer::encode(&rendered); let actual = ctx.token_ids(); assert_eq!(actual, expected); } #[test] fn test_parser_roundtrip_through_tokenizer() { if !init_tokenizer() { return; } // Parse a response, render it, verify it matches the expected format let mut p = ResponseParser::new(); p.feed("I'll check that for you"); p.feed("\n\nls\n\n"); let node = p.finish(); // The assistant branch should tokenize to the same as encoding its render assert_token_roundtrip(&node); // Token count should be nonzero (thinking is invisible but content + tool call are) assert!(node.tokens() > 0); } }