2026-04-08 12:46:44 -04:00
|
|
|
// context.rs — Context window as an AST
|
|
|
|
|
//
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
// The context window is a tree of AstNodes. Each node is either a leaf
|
|
|
|
|
// (typed content with cached token IDs) or a branch (role + children).
|
|
|
|
|
// The full prompt is a depth-first traversal of the sections in ContextState.
|
|
|
|
|
// Streaming responses are parsed into new nodes by the ResponseParser.
|
2026-04-08 12:46:44 -04:00
|
|
|
//
|
|
|
|
|
// Grammar (EBNF):
|
|
|
|
|
//
|
|
|
|
|
// context = section* ;
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
// section = (message | leaf)* ;
|
|
|
|
|
// message = IM_START role "\n" element* IM_END "\n" ;
|
|
|
|
|
// role = "system" | "user" | "assistant" ;
|
2026-04-08 12:46:44 -04:00
|
|
|
// element = thinking | tool_call | content ;
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
// thinking = "<think>" TEXT "</think>" ;
|
2026-04-08 12:46:44 -04:00
|
|
|
// tool_call = "<tool_call>\n" tool_xml "\n</tool_call>" ;
|
|
|
|
|
// tool_xml = "<function=" NAME ">\n" param* "</function>" ;
|
|
|
|
|
// param = "<parameter=" NAME ">\n" VALUE "\n</parameter>\n" ;
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
// content = TEXT ;
|
2026-04-08 12:46:44 -04:00
|
|
|
//
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
// Self-wrapping leaves (not inside a message branch):
|
|
|
|
|
// dmn = IM_START "dmn\n" TEXT IM_END "\n" ;
|
|
|
|
|
// memory = IM_START "memory\n" TEXT IM_END "\n" ;
|
|
|
|
|
// tool_result = IM_START "tool\n" TEXT IM_END "\n" ;
|
|
|
|
|
//
|
|
|
|
|
// Non-visible leaves (not in prompt):
|
|
|
|
|
// log = TEXT ;
|
|
|
|
|
//
|
|
|
|
|
// Role is only for branch (interior) nodes. Leaf type is determined by
|
|
|
|
|
// the NodeBody variant. Grammar constraints enforced by construction.
|
2026-04-08 12:46:44 -04:00
|
|
|
|
|
|
|
|
use chrono::{DateTime, Utc};
|
|
|
|
|
use super::tokenizer;
|
|
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
// Types
|
|
|
|
|
// ---------------------------------------------------------------------------
|
|
|
|
|
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
/// Role carried by a message branch.
///
/// The variants correspond one-to-one to the `role` production of the
/// prompt grammar (`"system" | "user" | "assistant"`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Role {
    /// The `system` role.
    System,
    /// The `user` role.
    User,
    /// The `assistant` role.
    Assistant,
}
|
|
|
|
|
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
/// Typed payload of a leaf node; the variant decides how the leaf renders.
#[derive(Debug, Clone)]
pub enum NodeBody {
    // -- Children of message branches: rendered without im_start/im_end ------
    /// Plain text.
    Content(String),
    /// Thinking text.
    Thinking(String),
    /// A tool invocation: the function name and its arguments as a string.
    ToolCall {
        name: String,
        arguments: String,
    },

    // -- Self-wrapping leaves: render their own im_start/im_end --------------
    /// Text of a tool result.
    ToolResult(String),
    /// A memory entry: a key, its text, and an optional score.
    Memory {
        key: String,
        text: String,
        score: Option<f64>,
    },
    /// Text of a `dmn` leaf.
    Dmn(String),

    // -- Non-visible (0 tokens in prompt) -----------------------------------
    /// Log text.
    Log(String),
}
|
|
|
|
|
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
/// A leaf node: typed content with cached token IDs.
|
2026-04-08 12:46:44 -04:00
|
|
|
#[derive(Debug, Clone)]
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
pub struct NodeLeaf {
|
2026-04-08 12:58:59 -04:00
|
|
|
body: NodeBody,
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
token_ids: Vec<u32>,
|
2026-04-08 12:58:59 -04:00
|
|
|
timestamp: Option<DateTime<Utc>>,
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// A node in the context AST.
|
|
|
|
|
#[derive(Debug, Clone)]
|
|
|
|
|
pub enum AstNode {
|
|
|
|
|
Leaf(NodeLeaf),
|
|
|
|
|
Branch { role: Role, children: Vec<AstNode> },
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// The context window: four sections as Vec<AstNode>.
|
|
|
|
|
/// All mutation goes through ContextState methods to maintain the invariant
|
|
|
|
|
/// that token_ids on every leaf matches its rendered text.
|
|
|
|
|
pub struct ContextState {
|
|
|
|
|
system: Vec<AstNode>,
|
|
|
|
|
identity: Vec<AstNode>,
|
|
|
|
|
journal: Vec<AstNode>,
|
|
|
|
|
conversation: Vec<AstNode>,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Names a context section, for use by mutation methods.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Section {
    /// The system section.
    System,
    /// The identity section.
    Identity,
    /// The journal section.
    Journal,
    /// The conversation section.
    Conversation,
}
|
|
|
|
|
|
2026-04-08 13:38:00 -04:00
|
|
|
/// Common read-only interface for values in the context AST.
pub trait Ast {
    /// Renders `self` to an owned string.
    fn render(&self) -> String;

    /// Returns the token IDs for `self` as an owned vector.
    fn token_ids(&self) -> Vec<u32>;

    /// Returns a token count for `self`.
    // NOTE(review): presumably equal to `token_ids().len()` — confirm against
    // the implementors.
    fn tokens(&self) -> usize;
}
|
|
|
|
|
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
/// State machine for parsing a streaming assistant response into an AstNode.
|
|
|
|
|
/// Feed text chunks as they arrive; completed tool calls are returned for
|
|
|
|
|
/// immediate dispatch.
|
|
|
|
|
pub struct ResponseParser {
|
|
|
|
|
buf: String,
|
|
|
|
|
content_parts: Vec<String>,
|
|
|
|
|
children: Vec<AstNode>,
|
|
|
|
|
in_think: bool,
|
|
|
|
|
think_buf: String,
|
|
|
|
|
in_tool_call: bool,
|
|
|
|
|
tool_call_buf: String,
|
2026-04-08 12:46:44 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl Role {
|
|
|
|
|
pub fn as_str(&self) -> &'static str {
|
|
|
|
|
match self {
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
Self::System => "system",
|
|
|
|
|
Self::User => "user",
|
2026-04-08 12:46:44 -04:00
|
|
|
Self::Assistant => "assistant",
|
|
|
|
|
}
|
|
|
|
|
}
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
}
|
2026-04-08 12:46:44 -04:00
|
|
|
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
impl NodeBody {
|
|
|
|
|
/// Render this leaf body to text for the prompt.
|
|
|
|
|
fn render(&self) -> String {
|
|
|
|
|
match self {
|
|
|
|
|
Self::Content(text) => text.clone(),
|
|
|
|
|
Self::Thinking(_) => String::new(),
|
|
|
|
|
Self::Log(_) => String::new(),
|
|
|
|
|
Self::ToolCall { name, arguments } => {
|
|
|
|
|
let xml = format_tool_call_xml(name, arguments);
|
|
|
|
|
format!("<tool_call>\n{}\n</tool_call>\n", xml)
|
|
|
|
|
}
|
|
|
|
|
Self::ToolResult(text) =>
|
|
|
|
|
format!("<|im_start|>tool\n{}<|im_end|>\n", text),
|
|
|
|
|
Self::Memory { text, .. } =>
|
|
|
|
|
format!("<|im_start|>memory\n{}<|im_end|>\n", text),
|
|
|
|
|
Self::Dmn(text) =>
|
|
|
|
|
format!("<|im_start|>dmn\n{}<|im_end|>\n", text),
|
|
|
|
|
}
|
2026-04-08 12:46:44 -04:00
|
|
|
}
|
|
|
|
|
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
/// Whether this leaf contributes tokens to the prompt.
|
|
|
|
|
fn is_prompt_visible(&self) -> bool {
|
|
|
|
|
!matches!(self, Self::Thinking(_) | Self::Log(_))
|
2026-04-08 12:46:44 -04:00
|
|
|
}
|
|
|
|
|
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
/// The text content of this leaf (for display, not rendering).
|
|
|
|
|
pub fn text(&self) -> &str {
|
|
|
|
|
match self {
|
|
|
|
|
Self::Content(t) | Self::Thinking(t) | Self::Log(t)
|
|
|
|
|
| Self::ToolResult(t) | Self::Dmn(t) => t,
|
|
|
|
|
Self::ToolCall { name, .. } => name,
|
|
|
|
|
Self::Memory { text, .. } => text,
|
|
|
|
|
}
|
2026-04-08 12:46:44 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
impl NodeLeaf {
|
|
|
|
|
fn new(body: NodeBody) -> Self {
|
|
|
|
|
let token_ids = if body.is_prompt_visible() {
|
|
|
|
|
tokenizer::encode(&body.render())
|
|
|
|
|
} else {
|
|
|
|
|
vec![]
|
|
|
|
|
};
|
|
|
|
|
Self { body, token_ids, timestamp: None }
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn with_timestamp(mut self, ts: DateTime<Utc>) -> Self {
|
|
|
|
|
self.timestamp = Some(ts);
|
|
|
|
|
self
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn body(&self) -> &NodeBody { &self.body }
|
|
|
|
|
pub fn token_ids(&self) -> &[u32] { &self.token_ids }
|
|
|
|
|
pub fn tokens(&self) -> usize { self.token_ids.len() }
|
|
|
|
|
pub fn timestamp(&self) -> Option<DateTime<Utc>> { self.timestamp }
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-08 12:46:44 -04:00
|
|
|
impl AstNode {
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
// -- Leaf constructors ----------------------------------------------------
|
|
|
|
|
|
|
|
|
|
pub fn content(text: impl Into<String>) -> Self {
|
|
|
|
|
Self::Leaf(NodeLeaf::new(NodeBody::Content(text.into())))
|
2026-04-08 12:46:44 -04:00
|
|
|
}
|
|
|
|
|
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
pub fn thinking(text: impl Into<String>) -> Self {
|
|
|
|
|
Self::Leaf(NodeLeaf::new(NodeBody::Thinking(text.into())))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn tool_call(name: impl Into<String>, arguments: impl Into<String>) -> Self {
|
|
|
|
|
Self::Leaf(NodeLeaf::new(NodeBody::ToolCall {
|
|
|
|
|
name: name.into(),
|
|
|
|
|
arguments: arguments.into(),
|
|
|
|
|
}))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn tool_result(text: impl Into<String>) -> Self {
|
|
|
|
|
Self::Leaf(NodeLeaf::new(NodeBody::ToolResult(text.into())))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn memory(key: impl Into<String>, text: impl Into<String>) -> Self {
|
|
|
|
|
Self::Leaf(NodeLeaf::new(NodeBody::Memory {
|
|
|
|
|
key: key.into(),
|
|
|
|
|
text: text.into(),
|
|
|
|
|
score: None,
|
|
|
|
|
}))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn dmn(text: impl Into<String>) -> Self {
|
|
|
|
|
Self::Leaf(NodeLeaf::new(NodeBody::Dmn(text.into())))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn log(text: impl Into<String>) -> Self {
|
|
|
|
|
Self::Leaf(NodeLeaf::new(NodeBody::Log(text.into())))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// -- Branch constructors --------------------------------------------------
|
|
|
|
|
|
2026-04-08 12:46:44 -04:00
|
|
|
pub fn branch(role: Role, children: Vec<AstNode>) -> Self {
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
Self::Branch { role, children }
|
2026-04-08 12:46:44 -04:00
|
|
|
}
|
|
|
|
|
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
pub fn system_msg(text: impl Into<String>) -> Self {
|
|
|
|
|
Self::Branch {
|
|
|
|
|
role: Role::System,
|
|
|
|
|
children: vec![Self::content(text)],
|
|
|
|
|
}
|
2026-04-08 12:46:44 -04:00
|
|
|
}
|
|
|
|
|
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
pub fn user_msg(text: impl Into<String>) -> Self {
|
|
|
|
|
Self::Branch {
|
|
|
|
|
role: Role::User,
|
|
|
|
|
children: vec![Self::content(text)],
|
|
|
|
|
}
|
2026-04-08 12:46:44 -04:00
|
|
|
}
|
|
|
|
|
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
// -- Builder --------------------------------------------------------------
|
|
|
|
|
|
2026-04-08 12:46:44 -04:00
|
|
|
pub fn with_timestamp(mut self, ts: DateTime<Utc>) -> Self {
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
match &mut self {
|
|
|
|
|
Self::Leaf(leaf) => leaf.timestamp = Some(ts),
|
|
|
|
|
Self::Branch { .. } => {}
|
|
|
|
|
}
|
2026-04-08 12:46:44 -04:00
|
|
|
self
|
|
|
|
|
}
|
|
|
|
|
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
pub fn children(&self) -> &[AstNode] {
|
|
|
|
|
match self {
|
|
|
|
|
Self::Branch { children, .. } => children,
|
|
|
|
|
Self::Leaf(_) => &[],
|
2026-04-08 12:46:44 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
pub fn leaf(&self) -> Option<&NodeLeaf> {
|
|
|
|
|
match self {
|
|
|
|
|
Self::Leaf(l) => Some(l),
|
|
|
|
|
_ => None,
|
2026-04-08 12:46:44 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Short label for the UI.
|
|
|
|
|
pub fn label(&self) -> String {
|
|
|
|
|
let cfg = crate::config::get();
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
match self {
|
|
|
|
|
Self::Branch { role, children } => {
|
|
|
|
|
let preview = children.first()
|
|
|
|
|
.and_then(|c| c.leaf())
|
|
|
|
|
.map(|l| truncate_preview(l.body.text(), 60))
|
|
|
|
|
.unwrap_or_default();
|
|
|
|
|
match role {
|
|
|
|
|
Role::System => "system".into(),
|
|
|
|
|
Role::User => format!("{}: {}", cfg.user_name, preview),
|
|
|
|
|
Role::Assistant => format!("{}: {}", cfg.assistant_name, preview),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
Self::Leaf(leaf) => match &leaf.body {
|
|
|
|
|
NodeBody::Content(t) => truncate_preview(t, 60),
|
|
|
|
|
NodeBody::Thinking(t) => format!("thinking: {}", truncate_preview(t, 60)),
|
|
|
|
|
NodeBody::ToolCall { name, .. } => format!("tool_call: {}", name),
|
|
|
|
|
NodeBody::ToolResult(_) => "tool_result".into(),
|
|
|
|
|
NodeBody::Memory { key, score, .. } => match score {
|
|
|
|
|
Some(s) => format!("mem: {} score:{:.1}", key, s),
|
|
|
|
|
None => format!("mem: {}", key),
|
|
|
|
|
},
|
|
|
|
|
NodeBody::Dmn(_) => "dmn".into(),
|
|
|
|
|
NodeBody::Log(t) => format!("log: {}", truncate_preview(t, 60)),
|
2026-04-08 12:46:44 -04:00
|
|
|
},
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-08 13:38:00 -04:00
|
|
|
impl Ast for AstNode {
|
|
|
|
|
fn render(&self) -> String {
|
|
|
|
|
match self {
|
|
|
|
|
Self::Leaf(leaf) => leaf.body.render(),
|
|
|
|
|
Self::Branch { role, children } =>
|
|
|
|
|
render_branch(*role, children),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn token_ids(&self) -> Vec<u32> {
|
|
|
|
|
match self {
|
|
|
|
|
Self::Leaf(leaf) => leaf.token_ids.clone(),
|
|
|
|
|
Self::Branch { role, children } =>
|
|
|
|
|
tokenizer::encode(&render_branch(*role, children)),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn tokens(&self) -> usize {
|
|
|
|
|
match self {
|
|
|
|
|
Self::Leaf(leaf) => leaf.tokens(),
|
|
|
|
|
Self::Branch { children, .. } =>
|
|
|
|
|
children.iter().map(|c| c.tokens()).sum(),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-08 12:46:44 -04:00
|
|
|
/// Build a single-line preview of `s`, at most `max` characters long.
///
/// Newlines in the kept portion are replaced with spaces. `"..."` is
/// appended only when `s` really has more than `max` characters; the check
/// counts characters rather than bytes, so multi-byte text that fits is not
/// marked as truncated.
fn truncate_preview(s: &str, max: usize) -> String {
    let mut chars = s.chars();
    let mut preview: String = chars
        .by_ref()
        .take(max)
        .map(|c| if c == '\n' { ' ' } else { c })
        .collect();
    // `take` stops after `max` items, so anything still left in `chars`
    // is text that was cut off.
    if chars.next().is_some() {
        preview.push_str("...");
    }
    preview
}
|
|
|
|
|
|
|
|
|
|
/// Render a branch: `<|im_start|>{role}\n`, then every child's rendering in
/// order, then `<|im_end|>\n`.
fn render_branch(role: Role, children: &[AstNode]) -> String {
    let header = format!("<|im_start|>{}\n", role.as_str());
    let mut out = children.iter().fold(header, |mut acc, node| {
        acc.push_str(&node.render());
        acc
    });
    out.push_str("<|im_end|>\n");
    out
}
|
|
|
|
|
|
|
|
|
|
fn format_tool_call_xml(name: &str, args_json: &str) -> String {
|
|
|
|
|
let args: serde_json::Value = serde_json::from_str(args_json)
|
|
|
|
|
.unwrap_or(serde_json::Value::Object(Default::default()));
|
|
|
|
|
let mut xml = format!("<function={}>\n", name);
|
|
|
|
|
if let Some(obj) = args.as_object() {
|
|
|
|
|
for (key, value) in obj {
|
|
|
|
|
let val_str = match value {
|
|
|
|
|
serde_json::Value::String(s) => s.clone(),
|
|
|
|
|
other => other.to_string(),
|
|
|
|
|
};
|
|
|
|
|
xml.push_str(&format!("<parameter={}>\n{}\n</parameter>\n", key, val_str));
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
xml.push_str("</function>");
|
|
|
|
|
xml
|
|
|
|
|
}
|
|
|
|
|
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
/// Remove whitespace that appears inside `<...>` tags, leaving all text
/// outside tags untouched.
///
/// A tag runs from a `<` to the next `>`. A `<` that is never closed is not
/// a tag: the remainder of the input is copied verbatim, so a stray `<`
/// no longer strips whitespace from everything that follows it.
fn normalize_xml_tags(text: &str) -> String {
    let mut result = String::with_capacity(text.len());
    let mut rest = text;

    while let Some(open) = rest.find('<') {
        // Text before the tag is copied unchanged.
        result.push_str(&rest[..open]);
        let from_open = &rest[open..];

        let Some(close) = from_open.find('>') else {
            // Unterminated '<': keep the tail exactly as written.
            result.push_str(from_open);
            return result;
        };

        // Copy the tag (including both delimiters) minus any whitespace.
        result.extend(from_open[..=close].chars().filter(|c| !c.is_whitespace()));
        rest = &from_open[close + 1..];
    }

    result.push_str(rest);
    result
}
|
2026-04-08 12:46:44 -04:00
|
|
|
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
/// Parse the first `<TAG=name>body</TAG>` element found in `s`.
///
/// Returns `(name, body, remainder)`, where `name` and `body` are trimmed
/// and `remainder` is everything after the closing tag. Returns `None` when
/// the opening `<TAG=`, the `>` that ends the name, or the closing `</TAG>`
/// is missing.
fn parse_qwen_tag<'a>(s: &'a str, tag: &str) -> Option<(&'a str, &'a str, &'a str)> {
    let opener = format!("<{}=", tag);
    let closer = format!("</{}>", tag);

    let (_, after_open) = s.split_once(opener.as_str())?;
    let (name, after_name) = after_open.split_once('>')?;
    let (body, remainder) = after_name.split_once(closer.as_str())?;

    Some((name.trim(), body.trim(), remainder))
}
|
|
|
|
|
|
|
|
|
|
fn parse_tool_call_body(body: &str) -> Option<(String, String)> {
|
|
|
|
|
let normalized = normalize_xml_tags(body);
|
|
|
|
|
let body = normalized.trim();
|
|
|
|
|
parse_xml_tool_call(body)
|
|
|
|
|
.or_else(|| parse_json_tool_call(body))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn parse_xml_tool_call(body: &str) -> Option<(String, String)> {
|
|
|
|
|
let (func_name, func_body, _) = parse_qwen_tag(body, "function")?;
|
|
|
|
|
let mut args = serde_json::Map::new();
|
|
|
|
|
let mut rest = func_body;
|
|
|
|
|
while let Some((key, val, remainder)) = parse_qwen_tag(rest, "parameter") {
|
|
|
|
|
args.insert(key.to_string(), serde_json::Value::String(val.to_string()));
|
|
|
|
|
rest = remainder;
|
|
|
|
|
}
|
|
|
|
|
Some((func_name.to_string(), serde_json::to_string(&args).unwrap_or_default()))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn parse_json_tool_call(body: &str) -> Option<(String, String)> {
|
|
|
|
|
let v: serde_json::Value = serde_json::from_str(body).ok()?;
|
|
|
|
|
let name = v["name"].as_str()?;
|
|
|
|
|
let arguments = &v["arguments"];
|
|
|
|
|
Some((name.to_string(), serde_json::to_string(arguments).unwrap_or_default()))
|
2026-04-08 12:46:44 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
impl ResponseParser {
|
|
|
|
|
pub fn new() -> Self {
|
|
|
|
|
Self {
|
|
|
|
|
buf: String::new(),
|
|
|
|
|
content_parts: Vec::new(),
|
|
|
|
|
children: Vec::new(),
|
|
|
|
|
in_think: false,
|
|
|
|
|
think_buf: String::new(),
|
|
|
|
|
in_tool_call: false,
|
|
|
|
|
tool_call_buf: String::new(),
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Feed a text chunk. Returns newly completed tool call nodes
|
|
|
|
|
/// (for immediate dispatch).
|
|
|
|
|
pub fn feed(&mut self, text: &str) -> Vec<AstNode> {
|
|
|
|
|
let mut new_calls = vec![];
|
|
|
|
|
self.buf.push_str(text);
|
|
|
|
|
|
|
|
|
|
loop {
|
|
|
|
|
if self.in_think {
|
|
|
|
|
match self.buf.find("</think>") {
|
|
|
|
|
Some(end) => {
|
|
|
|
|
self.think_buf.push_str(&self.buf[..end]);
|
|
|
|
|
self.buf = self.buf[end + 8..].to_string();
|
|
|
|
|
self.in_think = false;
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
self.children.push(AstNode::thinking(&self.think_buf));
|
2026-04-08 12:46:44 -04:00
|
|
|
self.think_buf.clear();
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
None => {
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
// Keep last 8 chars ("</think>".len()) as lookahead
|
|
|
|
|
let safe = self.buf.len().saturating_sub(8);
|
|
|
|
|
if safe > 0 {
|
|
|
|
|
let safe = self.buf.floor_char_boundary(safe);
|
|
|
|
|
self.think_buf.push_str(&self.buf[..safe]);
|
|
|
|
|
self.buf = self.buf[safe..].to_string();
|
|
|
|
|
}
|
2026-04-08 12:46:44 -04:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if self.in_tool_call {
|
|
|
|
|
match self.buf.find("</tool_call>") {
|
|
|
|
|
Some(end) => {
|
|
|
|
|
self.tool_call_buf.push_str(&self.buf[..end]);
|
|
|
|
|
self.buf = self.buf[end + 12..].to_string();
|
|
|
|
|
self.in_tool_call = false;
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
if let Some((name, args)) = parse_tool_call_body(&self.tool_call_buf) {
|
|
|
|
|
let node = AstNode::tool_call(name, args);
|
2026-04-08 12:46:44 -04:00
|
|
|
new_calls.push(node.clone());
|
|
|
|
|
self.flush_content();
|
|
|
|
|
self.children.push(node);
|
|
|
|
|
}
|
|
|
|
|
self.tool_call_buf.clear();
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
None => {
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
// Keep last 12 chars ("</tool_call>".len()) as lookahead
|
|
|
|
|
let safe = self.buf.len().saturating_sub(12);
|
|
|
|
|
if safe > 0 {
|
|
|
|
|
let safe = self.buf.floor_char_boundary(safe);
|
|
|
|
|
self.tool_call_buf.push_str(&self.buf[..safe]);
|
|
|
|
|
self.buf = self.buf[safe..].to_string();
|
|
|
|
|
}
|
2026-04-08 12:46:44 -04:00
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let think_pos = self.buf.find("<think>");
|
|
|
|
|
let tool_pos = self.buf.find("<tool_call>");
|
|
|
|
|
let next_tag = match (think_pos, tool_pos) {
|
|
|
|
|
(Some(a), Some(b)) => Some(a.min(b)),
|
|
|
|
|
(Some(a), None) => Some(a),
|
|
|
|
|
(None, Some(b)) => Some(b),
|
|
|
|
|
(None, None) => None,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
match next_tag {
|
|
|
|
|
Some(pos) => {
|
|
|
|
|
if pos > 0 {
|
|
|
|
|
self.content_parts.push(self.buf[..pos].to_string());
|
|
|
|
|
}
|
|
|
|
|
if self.buf[pos..].starts_with("<think>") {
|
|
|
|
|
self.buf = self.buf[pos + 7..].to_string();
|
|
|
|
|
self.flush_content();
|
|
|
|
|
self.in_think = true;
|
|
|
|
|
} else {
|
|
|
|
|
self.buf = self.buf[pos + 11..].to_string();
|
|
|
|
|
self.flush_content();
|
|
|
|
|
self.in_tool_call = true;
|
|
|
|
|
}
|
|
|
|
|
continue;
|
|
|
|
|
}
|
|
|
|
|
None => {
|
|
|
|
|
let safe = self.buf.len().saturating_sub(11);
|
|
|
|
|
if safe > 0 {
|
|
|
|
|
let safe = self.buf.floor_char_boundary(safe);
|
|
|
|
|
self.content_parts.push(self.buf[..safe].to_string());
|
|
|
|
|
self.buf = self.buf[safe..].to_string();
|
|
|
|
|
}
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
new_calls
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn flush_content(&mut self) {
|
|
|
|
|
if !self.content_parts.is_empty() {
|
|
|
|
|
let text: String = self.content_parts.drain(..).collect();
|
|
|
|
|
if !text.is_empty() {
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
self.children.push(AstNode::content(text));
|
2026-04-08 12:46:44 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Finalize the parse. Returns the completed assistant AstNode.
|
|
|
|
|
pub fn finish(mut self) -> AstNode {
|
|
|
|
|
if !self.buf.is_empty() {
|
|
|
|
|
self.content_parts.push(std::mem::take(&mut self.buf));
|
|
|
|
|
}
|
|
|
|
|
self.flush_content();
|
|
|
|
|
AstNode::branch(Role::Assistant, self.children)
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Get the current display text (for streaming to UI).
|
|
|
|
|
pub fn display_content(&self) -> String {
|
|
|
|
|
self.content_parts.join("")
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-08 12:58:59 -04:00
|
|
|
impl ContextState {
|
|
|
|
|
pub fn new() -> Self {
|
|
|
|
|
Self {
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
system: Vec::new(),
|
|
|
|
|
identity: Vec::new(),
|
|
|
|
|
journal: Vec::new(),
|
|
|
|
|
conversation: Vec::new(),
|
2026-04-08 12:58:59 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// -- Read access ----------------------------------------------------------
|
|
|
|
|
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
pub fn system(&self) -> &[AstNode] { &self.system }
|
|
|
|
|
pub fn identity(&self) -> &[AstNode] { &self.identity }
|
|
|
|
|
pub fn journal(&self) -> &[AstNode] { &self.journal }
|
|
|
|
|
pub fn conversation(&self) -> &[AstNode] { &self.conversation }
|
2026-04-08 12:58:59 -04:00
|
|
|
|
2026-04-08 13:38:00 -04:00
|
|
|
fn sections(&self) -> [&Vec<AstNode>; 4] {
|
|
|
|
|
[&self.system, &self.identity, &self.journal, &self.conversation]
|
2026-04-08 12:58:59 -04:00
|
|
|
}
|
2026-04-08 13:38:00 -04:00
|
|
|
}
|
2026-04-08 12:58:59 -04:00
|
|
|
|
2026-04-08 13:38:00 -04:00
|
|
|
impl Ast for ContextState {
    /// Concatenate the rendering of every node, section by section.
    fn render(&self) -> String {
        self.sections()
            .iter()
            .flat_map(|section| section.iter())
            .map(|node| node.render())
            .collect()
    }

    /// Concatenate the token IDs of every node, section by section.
    fn token_ids(&self) -> Vec<u32> {
        self.sections()
            .iter()
            .flat_map(|section| section.iter())
            .flat_map(|node| node.token_ids())
            .collect()
    }

    /// Total token count over every node in every section.
    fn tokens(&self) -> usize {
        let mut total = 0usize;
        for section in self.sections() {
            for node in section {
                total += node.tokens();
            }
        }
        total
    }
}
|
2026-04-08 12:58:59 -04:00
|
|
|
|
2026-04-08 13:38:00 -04:00
|
|
|
impl ContextState {
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
fn section_mut(&mut self, section: Section) -> &mut Vec<AstNode> {
|
|
|
|
|
match section {
|
|
|
|
|
Section::System => &mut self.system,
|
|
|
|
|
Section::Identity => &mut self.identity,
|
|
|
|
|
Section::Journal => &mut self.journal,
|
|
|
|
|
Section::Conversation => &mut self.conversation,
|
2026-04-08 12:58:59 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
/// Append `node` to the end of `section`.
pub fn push(&mut self, section: Section, node: AstNode) {
    let target = self.section_mut(section);
    target.push(node);
}
|
|
|
|
|
|
|
|
|
|
/// Replace the body of a leaf at `index` in `section`.
|
|
|
|
|
/// Re-tokenizes to maintain the invariant.
|
|
|
|
|
pub fn set_message(&mut self, section: Section, index: usize, body: NodeBody) {
|
|
|
|
|
let nodes = self.section_mut(section);
|
|
|
|
|
let node = &mut nodes[index];
|
|
|
|
|
match node {
|
|
|
|
|
AstNode::Leaf(leaf) => {
|
|
|
|
|
let token_ids = if body.is_prompt_visible() {
|
|
|
|
|
tokenizer::encode(&body.render())
|
|
|
|
|
} else {
|
|
|
|
|
vec![]
|
|
|
|
|
};
|
|
|
|
|
leaf.body = body;
|
|
|
|
|
leaf.token_ids = token_ids;
|
|
|
|
|
}
|
|
|
|
|
AstNode::Branch { .. } => panic!("set_message on branch node"),
|
2026-04-08 12:58:59 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
/// Set the memory score on a Memory leaf at `index` in `section`.
|
|
|
|
|
pub fn set_score(&mut self, section: Section, index: usize, score: Option<f64>) {
|
|
|
|
|
let node = &mut self.section_mut(section)[index];
|
|
|
|
|
match node {
|
|
|
|
|
AstNode::Leaf(leaf) => match &mut leaf.body {
|
|
|
|
|
NodeBody::Memory { score: s, .. } => *s = score,
|
|
|
|
|
_ => panic!("set_score on non-memory node"),
|
|
|
|
|
},
|
|
|
|
|
_ => panic!("set_score on branch node"),
|
|
|
|
|
}
|
2026-04-08 12:58:59 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Remove a node at `index` from `section`.
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
/// The removed node is handed back to the caller.
///
/// # Panics
///
/// Panics if `index` is out of bounds for the section.
pub fn del(&mut self, section: Section, index: usize) -> AstNode {
    let nodes = self.section_mut(section);
    nodes.remove(index)
}
|
|
|
|
|
}
|
|
|
|
|
|
2026-04-08 12:46:44 -04:00
|
|
|
pub fn context_window() -> usize {
|
|
|
|
|
crate::config::get().api_context_window
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn context_budget_tokens() -> usize {
|
|
|
|
|
context_window() * 80 / 100
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn is_context_overflow(err: &anyhow::Error) -> bool {
|
|
|
|
|
let msg = err.to_string().to_lowercase();
|
|
|
|
|
msg.contains("context length")
|
|
|
|
|
|| msg.contains("token limit")
|
|
|
|
|
|| msg.contains("too many tokens")
|
|
|
|
|
|| msg.contains("maximum context")
|
|
|
|
|
|| msg.contains("prompt is too long")
|
|
|
|
|
|| msg.contains("request too large")
|
|
|
|
|
|| msg.contains("input validation error")
|
|
|
|
|
|| msg.contains("content length limit")
|
|
|
|
|
|| (msg.contains("400") && msg.contains("tokens"))
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn is_stream_error(err: &anyhow::Error) -> bool {
|
|
|
|
|
err.to_string().contains("model stream error")
|
|
|
|
|
}
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
|
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    // -- Helpers for inspecting parse results ----------------------------------

    /// Collect the bodies of all leaf children of a branch node.
    /// Children that are not leaves are skipped; passing a non-branch fails the test.
    fn child_bodies(node: &AstNode) -> Vec<&NodeBody> {
        let children = match node {
            AstNode::Branch { children, .. } => children,
            _ => panic!("expected branch"),
        };
        children
            .iter()
            .filter_map(|child| child.leaf())
            .map(|leaf| leaf.body())
            .collect()
    }

    /// Fail unless `body` is `Content` holding exactly `expected`.
    fn assert_content(body: &NodeBody, expected: &str) {
        if let NodeBody::Content(text) = body {
            assert_eq!(text, expected);
        } else {
            panic!("expected Content, got {:?}", body);
        }
    }

    /// Fail unless `body` is `Thinking` holding exactly `expected`.
    fn assert_thinking(body: &NodeBody, expected: &str) {
        if let NodeBody::Thinking(text) = body {
            assert_eq!(text, expected);
        } else {
            panic!("expected Thinking, got {:?}", body);
        }
    }

    /// Fail unless `body` is a `ToolCall` named `expected_name`;
    /// on success return its raw argument string for further checks.
    fn assert_tool_call<'a>(body: &'a NodeBody, expected_name: &str) -> &'a str {
        if let NodeBody::ToolCall { name, arguments } = body {
            assert_eq!(name, expected_name);
            return arguments;
        }
        panic!("expected ToolCall, got {:?}", body)
    }

    /// Decode a tool-call argument string as JSON, failing the test on bad input.
    #[track_caller]
    fn json(text: &str) -> serde_json::Value {
        serde_json::from_str(text).unwrap()
    }

    /// Feed each chunk to a fresh parser in order and return the finished node.
    fn parse_chunks(chunks: &[&str]) -> AstNode {
        let mut parser = ResponseParser::new();
        for &chunk in chunks {
            parser.feed(chunk);
        }
        parser.finish()
    }

    // -- XML parsing tests ----------------------------------------------------

    #[test]
    fn test_tool_call_xml_parse_clean() {
        let body = "<function=bash>\n<parameter=command>poc-memory used core-personality</parameter>\n</function>";
        let (name, raw_args) = parse_tool_call_body(body).unwrap();
        assert_eq!(name, "bash");
        let parsed = json(&raw_args);
        assert_eq!(parsed["command"], "poc-memory used core-personality");
    }

    #[test]
    fn test_tool_call_xml_parse_streamed_whitespace() {
        let body = "<\nfunction\n=\nbash\n>\n<\nparameter\n=\ncommand\n>pwd</\nparameter\n>\n</\nfunction\n>";
        let (name, raw_args) = parse_tool_call_body(body).unwrap();
        assert_eq!(name, "bash");
        let parsed = json(&raw_args);
        assert_eq!(parsed["command"], "pwd");
    }

    #[test]
    fn test_tool_call_json_parse() {
        let body = r#"{"name": "bash", "arguments": {"command": "ls"}}"#;
        let (name, raw_args) = parse_tool_call_body(body).unwrap();
        assert_eq!(name, "bash");
        let parsed = json(&raw_args);
        assert_eq!(parsed["command"], "ls");
    }

    #[test]
    fn test_normalize_preserves_content() {
        // Already-clean markup must come back unchanged.
        let text = "<function=bash>\n<parameter=command>echo hello world</parameter>\n</function>";
        assert_eq!(normalize_xml_tags(text), text);
    }

    #[test]
    fn test_normalize_strips_tag_internal_whitespace() {
        let normalized = normalize_xml_tags("<\nfunction\n=\nbash\n>");
        assert_eq!(normalized, "<function=bash>");
    }

    // -- ResponseParser tests -------------------------------------------------

    #[test]
    fn test_parser_plain_text() {
        let node = parse_chunks(&["hello world"]);
        let bodies = child_bodies(&node);
        assert_eq!(bodies.len(), 1);
        assert_content(bodies[0], "hello world");
    }

    #[test]
    fn test_parser_thinking_then_content() {
        let node = parse_chunks(&["<think>reasoning</think>answer"]);
        let bodies = child_bodies(&node);
        assert_eq!(bodies.len(), 2);
        assert_thinking(bodies[0], "reasoning");
        assert_content(bodies[1], "answer");
    }

    #[test]
    fn test_parser_tool_call() {
        let mut parser = ResponseParser::new();
        let calls = parser.feed("<tool_call>\n<function=bash>\n<parameter=command>ls</parameter>\n</function>\n</tool_call>");
        assert_eq!(calls.len(), 1); // returned for immediate dispatch
        let node = parser.finish();
        let bodies = child_bodies(&node);
        assert_eq!(bodies.len(), 1);
        let raw_args = assert_tool_call(bodies[0], "bash");
        let parsed = json(raw_args);
        assert_eq!(parsed["command"], "ls");
    }

    #[test]
    fn test_parser_content_then_tool_call_then_content() {
        let node = parse_chunks(&[
            "before",
            "<tool_call>\n<function=bash>\n<parameter=command>pwd</parameter>\n</function>\n</tool_call>",
            "after",
        ]);
        let bodies = child_bodies(&node);
        assert_eq!(bodies.len(), 3);
        assert_content(bodies[0], "before");
        assert_tool_call(bodies[1], "bash");
        assert_content(bodies[2], "after");
    }

    #[test]
    fn test_parser_incremental_feed() {
        // Stream the response into the parser a single character per feed.
        let text = "<think>thought</think>response";
        let mut parser = ResponseParser::new();
        text.chars().for_each(|ch| {
            parser.feed(&ch.to_string());
        });
        let node = parser.finish();
        let bodies = child_bodies(&node);
        assert_eq!(bodies.len(), 2);
        assert_thinking(bodies[0], "thought");
        assert_content(bodies[1], "response");
    }

    #[test]
    fn test_parser_incremental_tool_call() {
        let text = "text<tool_call>\n<function=bash>\n<parameter=command>ls</parameter>\n</function>\n</tool_call>more";
        let mut parser = ResponseParser::new();
        let total_calls: usize = text
            .chars()
            .map(|ch| parser.feed(&ch.to_string()).len())
            .sum();
        assert_eq!(total_calls, 1); // exactly one tool call dispatched
        let node = parser.finish();
        let bodies = child_bodies(&node);
        assert_eq!(bodies.len(), 3);
        assert_content(bodies[0], "text");
        assert_tool_call(bodies[1], "bash");
        assert_content(bodies[2], "more");
    }

    #[test]
    fn test_parser_thinking_tool_call_content() {
        let node = parse_chunks(&[
            "<think>let me think</think>",
            "<tool_call>\n<function=read>\n<parameter=path>/etc/hosts</parameter>\n</function>\n</tool_call>",
            "here's what I found",
        ]);
        let bodies = child_bodies(&node);
        assert_eq!(bodies.len(), 3);
        assert_thinking(bodies[0], "let me think");
        assert_tool_call(bodies[1], "read");
        assert_content(bodies[2], "here's what I found");
    }

    #[test]
    fn test_parser_finish_produces_assistant_branch() {
        let node = parse_chunks(&["hello"]);
        if let AstNode::Branch { role, .. } = &node {
            assert_eq!(*role, Role::Assistant);
        } else {
            panic!("expected branch");
        }
    }

    // -- Round-trip rendering tests -------------------------------------------

    #[test]
    fn test_render_system_msg() {
        let rendered = AstNode::system_msg("you are helpful").render();
        assert_eq!(rendered, "<|im_start|>system\nyou are helpful<|im_end|>\n");
    }

    #[test]
    fn test_render_user_msg() {
        let rendered = AstNode::user_msg("hello").render();
        assert_eq!(rendered, "<|im_start|>user\nhello<|im_end|>\n");
    }

    #[test]
    fn test_render_assistant_with_thinking_and_content() {
        let children = vec![AstNode::thinking("hmm"), AstNode::content("answer")];
        let node = AstNode::branch(Role::Assistant, children);
        // The thinking child contributes nothing; only the content text appears.
        assert_eq!(node.render(), "<|im_start|>assistant\nanswer<|im_end|>\n");
    }

    #[test]
    fn test_render_tool_result() {
        let rendered = AstNode::tool_result("output here").render();
        assert_eq!(rendered, "<|im_start|>tool\noutput here<|im_end|>\n");
    }

    #[test]
    fn test_render_memory() {
        let rendered = AstNode::memory("identity", "I am Proof of Concept").render();
        assert_eq!(rendered, "<|im_start|>memory\nI am Proof of Concept<|im_end|>\n");
    }

    #[test]
    fn test_render_dmn() {
        let rendered = AstNode::dmn("subconscious prompt").render();
        assert_eq!(rendered, "<|im_start|>dmn\nsubconscious prompt<|im_end|>\n");
    }

    #[test]
    fn test_render_tool_call() {
        let rendered = AstNode::tool_call("bash", r#"{"command":"ls"}"#).render();
        assert!(rendered.contains("<tool_call>"));
        assert!(rendered.contains("<function=bash>"));
        assert!(rendered.contains("<parameter=command>"));
        assert!(rendered.contains("ls"));
        assert!(rendered.contains("</tool_call>"));
    }

    // -- Tokenizer round-trip tests -------------------------------------------
    // These require the tokenizer file; skipped if not present.

    /// Initialize the tokenizer from `$HOME/.consciousness/tokenizer-qwen35.json`
    /// when that file exists. Returns whether it was found (and so initialized).
    fn init_tokenizer() -> bool {
        let home = std::env::var("HOME").unwrap_or_default();
        let path = format!("{}/.consciousness/tokenizer-qwen35.json", home);
        let present = std::path::Path::new(&path).exists();
        if present {
            tokenizer::init(&path);
        }
        present
    }

    /// token_ids() must equal encode(render()) for all node types
    fn assert_token_roundtrip(node: &AstNode) {
        let rendered = node.render();
        let from_render = tokenizer::encode(&rendered);
        let cached = node.token_ids();
        assert_eq!(cached, from_render,
            "token_ids mismatch for rendered: {:?}", rendered);
    }

    #[test]
    fn test_tokenize_roundtrip_leaf_types() {
        if !init_tokenizer() { return; }

        let leaves = [
            AstNode::system_msg("you are a helpful assistant"),
            AstNode::user_msg("what is 2+2?"),
            AstNode::tool_result("4"),
            AstNode::memory("identity", "I am Proof of Concept"),
            AstNode::dmn("check the memory store"),
            AstNode::tool_call("bash", r#"{"command":"ls -la"}"#),
        ];
        for leaf in &leaves {
            assert_token_roundtrip(leaf);
        }
    }

    #[test]
    fn test_tokenize_roundtrip_assistant_branch() {
        if !init_tokenizer() { return; }

        let children = vec![
            AstNode::content("here's what I found:\n"),
            AstNode::tool_call("bash", r#"{"command":"pwd"}"#),
            AstNode::content("\nthat's the current directory"),
        ];
        assert_token_roundtrip(&AstNode::branch(Role::Assistant, children));
    }

    #[test]
    fn test_tokenize_invisible_nodes_are_zero() {
        if !init_tokenizer() { return; }

        let thinking = AstNode::thinking("deep thoughts");
        assert_eq!(thinking.tokens(), 0);
        let log = AstNode::log("debug info");
        assert_eq!(log.tokens(), 0);
    }

    #[test]
    fn test_tokenize_decode_roundtrip() {
        if !init_tokenizer() { return; }

        // Plain text with no special tokens should survive encode -> decode.
        let text = "hello world, this is a test";
        let ids = tokenizer::encode(text);
        assert_eq!(tokenizer::decode(&ids), text);
    }

    #[test]
    fn test_tokenize_context_state_matches_concatenation() {
        if !init_tokenizer() { return; }

        let mut ctx = ContextState::new();
        ctx.push(Section::System, AstNode::system_msg("you are helpful"));
        ctx.push(Section::Identity, AstNode::memory("name", "Proof of Concept"));
        ctx.push(Section::Conversation, AstNode::user_msg("hi"));

        // The state's token IDs must match encoding its full render in one go.
        let rendered = ctx.render();
        let from_render = tokenizer::encode(&rendered);
        let cached = ctx.token_ids();
        assert_eq!(cached, from_render);
    }

    #[test]
    fn test_parser_roundtrip_through_tokenizer() {
        if !init_tokenizer() { return; }

        // Parse a response made of plain content followed by a tool call.
        let node = parse_chunks(&[
            "I'll check that for you",
            "<tool_call>\n<function=bash>\n<parameter=command>ls</parameter>\n</function>\n</tool_call>",
        ]);

        // The assistant branch should tokenize to the same as encoding its render
        assert_token_roundtrip(&node);

        // The parsed content and tool call must yield a nonzero token count.
        assert!(node.tokens() > 0);
    }
}
|