2026-04-08 11:20:03 -04:00
|
|
|
// tokenizer.rs — Qwen tokenizer for direct token generation
|
|
|
|
|
//
|
|
|
|
|
// Loads the HuggingFace tokenizer.json for the target model and provides
|
|
|
|
|
// tokenization for context entries. The tokenizer is loaded once globally
|
|
|
|
|
// and shared across all callers.
|
|
|
|
|
//
|
|
|
|
|
// Token IDs include the chat template wrapping:
|
|
|
|
|
// <|im_start|>role\ncontent<|im_end|>\n
|
|
|
|
|
// so concatenating token_ids across entries produces a ready-to-send prompt.
|
|
|
|
|
|
|
|
|
|
use std::sync::OnceLock;
|
|
|
|
|
use tokenizers::Tokenizer;
|
|
|
|
|
|
|
|
|
|
static TOKENIZER: OnceLock<Tokenizer> = OnceLock::new();
|
|
|
|
|
|
|
|
|
|
/// Special token IDs for Qwen 3.5.
///
/// These delimit one chat-template message:
/// `<|im_start|>role\ncontent<|im_end|>\n`
pub const IM_START: u32 = 248045;
pub const IM_END: u32 = 248046;
|
|
|
|
|
|
|
|
|
|
/// Initialize the global tokenizer from a file path.
|
|
|
|
|
/// Call once at startup. Panics if the file can't be loaded.
|
|
|
|
|
pub fn init(path: &str) {
|
|
|
|
|
let t = Tokenizer::from_file(path)
|
|
|
|
|
.unwrap_or_else(|e| panic!("failed to load tokenizer from {}: {}", path, e));
|
|
|
|
|
TOKENIZER.set(t).ok();
|
|
|
|
|
}
|
|
|
|
|
|
// NOTE: encode() deliberately returns an empty vec (rather than panicking)
// when the tokenizer is uninitialized, so tests can run without the
// tokenizer file on disk.
|
|
|
/// Get the global tokenizer. Returns None if not initialized.
|
|
|
|
|
fn get() -> Option<&'static Tokenizer> {
|
|
|
|
|
TOKENIZER.get()
|
2026-04-08 11:20:03 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Tokenize a raw string, returning token IDs.
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
/// Returns empty vec if the tokenizer is not initialized.
|
2026-04-08 11:20:03 -04:00
|
|
|
pub fn encode(text: &str) -> Vec<u32> {
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
match get() {
|
|
|
|
|
Some(t) => t.encode(text, false)
|
|
|
|
|
.unwrap_or_else(|e| panic!("tokenization failed: {}", e))
|
|
|
|
|
.get_ids()
|
|
|
|
|
.to_vec(),
|
|
|
|
|
None => vec![],
|
|
|
|
|
}
|
2026-04-08 11:20:03 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Tokenize a chat entry with template wrapping:
|
|
|
|
|
/// <|im_start|>role\ncontent<|im_end|>\n
|
|
|
|
|
/// Returns the complete token ID sequence for this entry.
|
|
|
|
|
pub fn tokenize_entry(role: &str, content: &str) -> Vec<u32> {
|
|
|
|
|
let mut ids = Vec::new();
|
|
|
|
|
ids.push(IM_START);
|
|
|
|
|
ids.extend(encode(role));
|
|
|
|
|
ids.extend(encode("\n"));
|
|
|
|
|
ids.extend(encode(content));
|
|
|
|
|
ids.push(IM_END);
|
|
|
|
|
ids.extend(encode("\n"));
|
|
|
|
|
ids
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Count tokens for a string (convenience for budget checks).
|
|
|
|
|
pub fn count(text: &str) -> usize {
|
|
|
|
|
encode(text).len()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Decode token IDs back to text.
|
|
|
|
|
pub fn decode(ids: &[u32]) -> String {
|
Redesign context AST: typed NodeBody, Role as grammar roles, tests
Role is now just System/User/Assistant — maps 1:1 to the grammar.
Leaf types are NodeBody variants: Content, Thinking, ToolCall,
ToolResult, Memory, Dmn, Log. Each variant renders itself; no Role
needed on leaves. AstNode is Leaf(NodeLeaf) | Branch{role, children}.
ContextState holds four Vec<AstNode> sections directly.
Moved tool call XML parsing from api/parsing.rs into context_new.rs
so all grammar knowledge lives in one place.
Tokenizer encode() now returns empty vec when uninitialized instead
of panicking, so tests work without the tokenizer file.
26 tests: XML parsing, incremental streaming (char-by-char feeds
found and fixed a lookahead bug), rendering for all node types,
tokenizer round-trip verification.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-08 13:35:04 -04:00
|
|
|
match get() {
|
|
|
|
|
Some(t) => t.decode(ids, true)
|
|
|
|
|
.unwrap_or_else(|e| panic!("detokenization failed: {}", e)),
|
|
|
|
|
None => String::new(),
|
|
|
|
|
}
|
2026-04-08 11:20:03 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Check if the tokenizer is initialized.
|
|
|
|
|
pub fn is_initialized() -> bool {
|
|
|
|
|
TOKENIZER.get().is_some()
|
|
|
|
|
}
|
|
|
|
|
|