// tokenizer.rs — Qwen tokenizer for direct token generation // // Loads the HuggingFace tokenizer.json for the target model and provides // tokenization for context entries. The tokenizer is loaded once globally // and shared across all callers. // // Token IDs include the chat template wrapping: // <|im_start|>role\ncontent<|im_end|>\n // so concatenating token_ids across entries produces a ready-to-send prompt. use std::sync::OnceLock; use tokenizers::Tokenizer; static TOKENIZER: OnceLock = OnceLock::new(); /// Special token IDs for Qwen 3.5 pub const IM_START: u32 = 248045; pub const IM_END: u32 = 248046; /// Initialize the global tokenizer from a file path. /// Call once at startup. Panics if the file can't be loaded. pub fn init(path: &str) { let t = Tokenizer::from_file(path) .unwrap_or_else(|e| panic!("failed to load tokenizer from {}: {}", path, e)); TOKENIZER.set(t).ok(); } /// Get the global tokenizer. Panics if not initialized. fn get() -> &'static Tokenizer { TOKENIZER.get().expect("tokenizer not initialized — call tokenizer::init() first") } /// Tokenize a raw string, returning token IDs. pub fn encode(text: &str) -> Vec { get().encode(text, false) .unwrap_or_else(|e| panic!("tokenization failed: {}", e)) .get_ids() .to_vec() } /// Tokenize a chat entry with template wrapping: /// <|im_start|>role\ncontent<|im_end|>\n /// Returns the complete token ID sequence for this entry. pub fn tokenize_entry(role: &str, content: &str) -> Vec { let mut ids = Vec::new(); ids.push(IM_START); ids.extend(encode(role)); ids.extend(encode("\n")); ids.extend(encode(content)); ids.push(IM_END); ids.extend(encode("\n")); ids } /// Count tokens for a string (convenience for budget checks). pub fn count(text: &str) -> usize { encode(text).len() } /// Decode token IDs back to text. pub fn decode(ids: &[u32]) -> String { get().decode(ids, true) .unwrap_or_else(|e| panic!("detokenization failed: {}", e)) } /// Check if the tokenizer is initialized. pub fn is_initialized() -> bool { TOKENIZER.get().is_some() } /// Tokenize a ConversationEntry with its role and content. pub fn tokenize_conv_entry(entry: &super::context::ConversationEntry) -> Vec { use super::context::ConversationEntry; match entry { ConversationEntry::System(m) => tokenize_entry("system", m.content_text()), ConversationEntry::Message(m) => tokenize_entry(m.role_str(), m.content_text()), ConversationEntry::Memory { message, .. } => tokenize_entry("memory", message.content_text()), ConversationEntry::Dmn(m) => tokenize_entry("dmn", m.content_text()), ConversationEntry::Thinking(text) => tokenize_entry("thinking", text), ConversationEntry::Log(_) => vec![], // logs don't consume tokens } }