Replace token counting with token generation via HuggingFace tokenizer

Add agent/tokenizer.rs with a global Qwen 3.5 tokenizer that generates
actual token IDs including chat template wrapping. ContextEntry now
stores token_ids: Vec<u32> instead of tokens: usize — the count is
derived from the length.
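A sketch of the reworked struct (only token_ids is confirmed by this
message; the other field and the accessor are illustrative):

    // Sketch — only `token_ids` is named in this commit message;
    // the surrounding names are assumptions for illustration.
    pub struct ContextEntry {
        entry: ConversationEntry,
        token_ids: Vec<u32>, // was `tokens: usize`
    }

    impl ContextEntry {
        /// The count is derived from the IDs, never stored separately.
        pub fn token_count(&self) -> usize {
            self.token_ids.len()
        }
    }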

ContextEntry::new() tokenizes automatically via the global tokenizer.
ContextSection::push_entry() takes a raw ConversationEntry and
tokenizes it. set_message() re-tokenizes without needing an external
tokenizer parameter.
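Roughly, continuing the sketch above (method bodies are illustrative,
not the actual diff):

    impl ContextEntry {
        /// Tokenizes on construction via the global tokenizer.
        pub fn new(entry: ConversationEntry) -> Self {
            let token_ids = tokenizer::tokenize_conv_entry(&entry);
            Self { entry, token_ids }
        }

        /// Re-tokenizes in place; no external tokenizer parameter.
        pub fn set_message(&mut self, entry: ConversationEntry) {
            self.token_ids = tokenizer::tokenize_conv_entry(&entry);
            self.entry = entry;
        }
    }

    impl ContextSection {
        /// Takes a raw ConversationEntry and tokenizes it.
        pub fn push_entry(&mut self, entry: ConversationEntry) {
            self.entries.push(ContextEntry::new(entry));
        }
    }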

Token IDs include the full chat template: <|im_start|>role\ncontent
<|im_end|>\n — so concatenating token_ids across entries produces a
ready-to-send prompt for vLLM's /v1/completions endpoint.
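Prompt assembly is then flattening plus an assistant header; a sketch
(build_prompt and the entries iteration are illustrative; vLLM's
/v1/completions accepts a list of token IDs as the prompt, which skips
server-side tokenization):

    // Sketch: flatten per-entry token IDs into one prompt and append
    // the "<|im_start|>assistant\n" header so the model generates the
    // assistant turn.
    fn build_prompt(entries: &[ContextEntry]) -> Vec<u32> {
        let mut prompt: Vec<u32> = entries
            .iter()
            .flat_map(|e| e.token_ids.iter().copied())
            .collect();
        prompt.push(tokenizer::IM_START);
        prompt.extend(tokenizer::encode("assistant"));
        prompt.extend(tokenizer::encode("\n"));
        prompt
    }

The resulting Vec<u32> goes directly into the request's "prompt" field.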

The old tiktoken CoreBPE is now unused on Agent (will be removed in
a followup). Token counts are now exact for Qwen 3.5 instead of the
~85-90% approximation from cl100k_base.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>

src/agent/tokenizer.rs (new file)

@@ -0,0 +1,82 @@
// tokenizer.rs — Qwen tokenizer for direct token generation
//
// Loads the HuggingFace tokenizer.json for the target model and provides
// tokenization for context entries. The tokenizer is loaded once globally
// and shared across all callers.
//
// Token IDs include the chat template wrapping:
//   <|im_start|>role\ncontent<|im_end|>\n
// so concatenating token_ids across entries produces a ready-to-send prompt.

use std::sync::OnceLock;
use tokenizers::Tokenizer;

static TOKENIZER: OnceLock<Tokenizer> = OnceLock::new();

/// Special token IDs for Qwen 3.5
pub const IM_START: u32 = 248045;
pub const IM_END: u32 = 248046;

/// Initialize the global tokenizer from a file path.
/// Call once at startup. Panics if the file can't be loaded.
pub fn init(path: &str) {
    let t = Tokenizer::from_file(path)
        .unwrap_or_else(|e| panic!("failed to load tokenizer from {}: {}", path, e));
    TOKENIZER.set(t).ok();
}

/// Get the global tokenizer. Panics if not initialized.
fn get() -> &'static Tokenizer {
    TOKENIZER.get().expect("tokenizer not initialized — call tokenizer::init() first")
}

/// Tokenize a raw string, returning token IDs.
pub fn encode(text: &str) -> Vec<u32> {
    get().encode(text, false)
        .unwrap_or_else(|e| panic!("tokenization failed: {}", e))
        .get_ids()
        .to_vec()
}

/// Tokenize a chat entry with template wrapping:
///   <|im_start|>role\ncontent<|im_end|>\n
/// Returns the complete token ID sequence for this entry.
pub fn tokenize_entry(role: &str, content: &str) -> Vec<u32> {
    let mut ids = Vec::new();
    ids.push(IM_START);
    ids.extend(encode(role));
    ids.extend(encode("\n"));
    ids.extend(encode(content));
    ids.push(IM_END);
    ids.extend(encode("\n"));
    ids
}

/// Count tokens for a string (convenience for budget checks).
pub fn count(text: &str) -> usize {
    encode(text).len()
}

/// Decode token IDs back to text.
pub fn decode(ids: &[u32]) -> String {
    get().decode(ids, true)
        .unwrap_or_else(|e| panic!("detokenization failed: {}", e))
}

/// Check if the tokenizer is initialized.
pub fn is_initialized() -> bool {
    TOKENIZER.get().is_some()
}

/// Tokenize a ConversationEntry with its role and content.
pub fn tokenize_conv_entry(entry: &super::context::ConversationEntry) -> Vec<u32> {
    use super::context::ConversationEntry;

    match entry {
        ConversationEntry::System(m) => tokenize_entry("system", m.content_text()),
        ConversationEntry::Message(m) => tokenize_entry(m.role_str(), m.content_text()),
        ConversationEntry::Memory { message, .. } => tokenize_entry("memory", message.content_text()),
        ConversationEntry::Dmn(m) => tokenize_entry("dmn", m.content_text()),
        ConversationEntry::Thinking(text) => tokenize_entry("thinking", text),
        ConversationEntry::Log(_) => vec![], // logs don't consume tokens
    }
}
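A hypothetical call sequence tying the pieces together (path and
content are illustrative; the round-trip note assumes the im_start and
im_end markers are registered as special tokens in tokenizer.json,
since decode() passes skip_special_tokens = true):

    // Hypothetical usage, not part of the diff.
    tokenizer::init("/path/to/qwen/tokenizer.json");
    assert!(tokenizer::is_initialized());

    let ids = tokenizer::tokenize_entry("user", "hello");
    assert_eq!(ids[0], tokenizer::IM_START);
    assert_eq!(tokenizer::count("hello"), tokenizer::encode("hello").len());

    // With skip_special_tokens = true the markers are stripped, so
    // this prints roughly "user\nhello\n".
    println!("{}", tokenizer::decode(&ids));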