consciousness/src/agent/tokenizer.rs

83 lines
2.8 KiB
Rust
Raw Normal View History

// tokenizer.rs — Qwen tokenizer for direct token generation
//
// Loads the HuggingFace tokenizer.json for the target model and provides
// tokenization for context entries. The tokenizer is loaded once globally
// and shared across all callers.
//
// Token IDs include the chat template wrapping:
// <|im_start|>role\ncontent<|im_end|>\n
// so concatenating token_ids across entries produces a ready-to-send prompt.
use std::sync::OnceLock;
use tokenizers::Tokenizer;
/// Process-wide tokenizer slot. Starts empty; `init` stores the loaded
/// tokenizer here and `get` hands out `'static` references to it.
static TOKENIZER: OnceLock<Tokenizer> = OnceLock::new();
/// Special token IDs for Qwen 3.5
///
/// `IM_START` is the ID `tokenize_entry` emits first for every entry, i.e. the
/// `<|im_start|>` marker of the chat template.
// NOTE(review): these IDs are hard-coded; presumably they must match the
// tokenizer.json passed to `init` — confirm when switching models.
pub const IM_START: u32 = 248045;
/// ID `tokenize_entry` emits right after an entry's content, i.e. the
/// `<|im_end|>` marker of the chat template.
pub const IM_END: u32 = 248046;
/// Load the tokenizer definition at `path` and install it as the global
/// tokenizer.
///
/// Intended to be called once during startup. If a tokenizer has already been
/// installed, the freshly loaded one is discarded and the existing one stays.
///
/// # Panics
///
/// Panics if the file at `path` cannot be loaded as a tokenizer.
pub fn init(path: &str) {
    let loaded = match Tokenizer::from_file(path) {
        Ok(tokenizer) => tokenizer,
        Err(err) => panic!("failed to load tokenizer from {}: {}", path, err),
    };

    // `set` fails only when the slot is already filled; that outcome is
    // ignored on purpose so a repeated call is a no-op.
    let _ = TOKENIZER.set(loaded);
}
/// Borrow the global tokenizer for the rest of the program's lifetime.
///
/// # Panics
///
/// Panics if `init` has not stored a tokenizer yet.
fn get() -> &'static Tokenizer {
    let Some(tokenizer) = TOKENIZER.get() else {
        panic!("tokenizer not initialized — call tokenizer::init() first");
    };
    tokenizer
}
/// Convert raw text into token IDs using the global tokenizer.
///
/// The tokenizer is invoked with its second argument set to `false`; no chat
/// template wrapping is added here (see `tokenize_entry` for that).
///
/// # Panics
///
/// Panics if the tokenizer is not initialized or if encoding fails.
pub fn encode(text: &str) -> Vec<u32> {
    let encoding = match get().encode(text, false) {
        Ok(enc) => enc,
        Err(err) => panic!("tokenization failed: {}", err),
    };
    encoding.get_ids().to_vec()
}
/// Build the token sequence for one chat entry, including the template
/// wrapping:
/// <|im_start|>role\ncontent<|im_end|>\n
///
/// `role`, `content` and the two newlines are each encoded separately; the
/// start/end markers are inserted directly as `IM_START` / `IM_END`.
///
/// # Panics
///
/// Panics under the same conditions as `encode`.
pub fn tokenize_entry(role: &str, content: &str) -> Vec<u32> {
    // Encode the pieces in template order, then stitch them together.
    let role_ids = encode(role);
    let header_newline = encode("\n");
    let content_ids = encode(content);
    let trailing_newline = encode("\n");

    std::iter::once(IM_START)
        .chain(role_ids)
        .chain(header_newline)
        .chain(content_ids)
        .chain(std::iter::once(IM_END))
        .chain(trailing_newline)
        .collect()
}
/// Number of tokens `text` encodes to (handy for budget checks).
///
/// Encodes the text via `encode` and reports the length of the result, so it
/// panics under the same conditions as `encode`.
pub fn count(text: &str) -> usize {
    let ids = encode(text);
    ids.len()
}
/// Turn a slice of token IDs back into text using the global tokenizer.
///
/// The tokenizer's decode is called with its second argument set to `true`.
///
/// # Panics
///
/// Panics if the tokenizer is not initialized or if decoding fails.
pub fn decode(ids: &[u32]) -> String {
    match get().decode(ids, true) {
        Ok(text) => text,
        Err(err) => panic!("detokenization failed: {}", err),
    }
}
/// Report whether a global tokenizer has been installed.
///
/// Never panics; returns `false` until `init` has stored a tokenizer.
pub fn is_initialized() -> bool {
    let slot = TOKENIZER.get();
    slot.is_some()
}
/// Tokenize a `ConversationEntry`, choosing the chat role from its variant.
///
/// Roles used: `"system"`, the message's own `role_str()`, `"memory"`,
/// `"dmn"` and `"thinking"`. `Log` entries yield an empty vector.
pub fn tokenize_conv_entry(entry: &super::context::ConversationEntry) -> Vec<u32> {
    use super::context::ConversationEntry as Entry;

    match entry {
        // Log entries produce no tokens at all.
        Entry::Log(_) => Vec::new(),
        Entry::Thinking(thought) => tokenize_entry("thinking", thought),
        Entry::Dmn(msg) => tokenize_entry("dmn", msg.content_text()),
        Entry::Memory { message: msg, .. } => tokenize_entry("memory", msg.content_text()),
        Entry::Message(msg) => tokenize_entry(msg.role_str(), msg.content_text()),
        Entry::System(msg) => tokenize_entry("system", msg.content_text()),
    }
}