// tokenizer.rs — Qwen tokenizer for direct token generation // // Loads the HuggingFace tokenizer.json for the target model and provides // tokenization for context entries. The tokenizer is loaded once globally // and shared across all callers. // // Token IDs include the chat template wrapping: // <|im_start|>role\ncontent<|im_end|>\n // so concatenating token_ids across entries produces a ready-to-send prompt. use std::sync::OnceLock; use tokenizers::Tokenizer; static TOKENIZER: OnceLock = OnceLock::new(); /// Special token IDs for Qwen 3.5 pub const IM_START: u32 = 248045; pub const IM_END: u32 = 248046; /// Initialize the global tokenizer from a file path. /// Call once at startup. Panics if the file can't be loaded. pub fn init(path: &str) { let t = Tokenizer::from_file(path) .unwrap_or_else(|e| panic!("failed to load tokenizer from {}: {}", path, e)); TOKENIZER.set(t).ok(); } /// Get the global tokenizer. Returns None if not initialized. fn get() -> Option<&'static Tokenizer> { TOKENIZER.get() } /// Tokenize a raw string, returning token IDs. /// Returns empty vec if the tokenizer is not initialized. pub fn encode(text: &str) -> Vec { match get() { Some(t) => t.encode(text, false) .unwrap_or_else(|e| panic!("tokenization failed: {}", e)) .get_ids() .to_vec(), None => vec![], } } /// Tokenize a chat entry with template wrapping: /// <|im_start|>role\ncontent<|im_end|>\n /// Returns the complete token ID sequence for this entry. pub fn tokenize_entry(role: &str, content: &str) -> Vec { let mut ids = Vec::new(); ids.push(IM_START); ids.extend(encode(role)); ids.extend(encode("\n")); ids.extend(encode(content)); ids.push(IM_END); ids.extend(encode("\n")); ids } /// Count tokens for a string (convenience for budget checks). pub fn count(text: &str) -> usize { encode(text).len() } /// Decode token IDs back to text. pub fn decode(ids: &[u32]) -> String { match get() { Some(t) => t.decode(ids, true) .unwrap_or_else(|e| panic!("detokenization failed: {}", e)), None => String::new(), } } /// Check if the tokenizer is initialized. pub fn is_initialized() -> bool { TOKENIZER.get().is_some() }