Replace token counting with token generation via HuggingFace tokenizer
Add `agent/tokenizer.rs` with a global Qwen 3.5 tokenizer that generates actual token IDs, including chat template wrapping. `ContextEntry` now stores `token_ids: Vec<u32>` instead of `tokens: usize` — the count is derived from the length.

`ContextEntry::new()` tokenizes automatically via the global tokenizer. `ContextSection::push_entry()` takes a raw `ConversationEntry` and tokenizes it. `set_message()` re-tokenizes without needing an external tokenizer parameter.

Token IDs include the full chat template, `<|im_start|>role\ncontent<|im_end|>\n`, so concatenating `token_ids` across entries produces a ready-to-send prompt for vLLM's `/v1/completions` endpoint.

The old tiktoken `CoreBPE` is now unused on `Agent` (it will be removed in a follow-up). Token counts are now exact for Qwen 3.5 instead of the ~85-90% approximation from `cl100k_base`.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
parent
70ee7abea5
commit
5e4067c04f
10 changed files with 540 additions and 97 deletions
108
src/agent/mod.rs
108
src/agent/mod.rs
|
|
@ -16,6 +16,7 @@
|
|||
pub mod api;
|
||||
pub mod context;
|
||||
pub mod oneshot;
|
||||
pub mod tokenizer;
|
||||
pub mod tools;
|
||||
|
||||
use std::sync::Arc;
|
||||
|
|
@ -196,19 +197,12 @@ impl Agent {
|
|||
.expect("failed to load cl100k_base tokenizer");
|
||||
|
||||
let mut system = ContextSection::new("System prompt");
|
||||
system.push(ContextEntry {
|
||||
entry: ConversationEntry::System(Message::system(&system_prompt)),
|
||||
tokens: context::msg_token_count(&tokenizer, &Message::system(&system_prompt)),
|
||||
timestamp: None,
|
||||
});
|
||||
system.push(ContextEntry::new(
|
||||
ConversationEntry::System(Message::system(&system_prompt)), None));
|
||||
let mut identity = ContextSection::new("Identity");
|
||||
for (_name, content) in &personality {
|
||||
let msg = Message::user(content);
|
||||
identity.push(ContextEntry {
|
||||
tokens: context::msg_token_count(&tokenizer, &msg),
|
||||
entry: ConversationEntry::Message(msg),
|
||||
timestamp: None,
|
||||
});
|
||||
identity.push(ContextEntry::new(
|
||||
ConversationEntry::Message(Message::user(content)), None));
|
||||
}
|
||||
let context = ContextState {
|
||||
system,
|
||||
|
|
@ -324,12 +318,8 @@ impl Agent {
|
|||
eprintln!("warning: failed to log entry: {:#}", e);
|
||||
}
|
||||
}
|
||||
let tokens = if entry.is_log() || entry.is_thinking() { 0 } else {
|
||||
context::msg_token_count(&self.tokenizer, entry.api_message())
|
||||
};
|
||||
self.context.conversation.push(ContextEntry {
|
||||
entry, tokens, timestamp: Some(chrono::Utc::now()),
|
||||
});
|
||||
self.context.conversation.push(ContextEntry::new(
|
||||
entry, Some(chrono::Utc::now())));
|
||||
|
||||
self.changed.notify_one();
|
||||
}
|
||||
|
|
@ -348,22 +338,19 @@ impl Agent {
|
|||
if let Some(idx) = self.streaming_index() {
|
||||
let mut msg = self.context.conversation.entries()[idx].entry.message().clone();
|
||||
msg.append_content(text);
|
||||
self.context.conversation.set_message(idx, &self.tokenizer, msg);
|
||||
self.context.conversation.set_message(idx, msg);
|
||||
} else {
|
||||
let msg = Message {
|
||||
role: Role::Assistant,
|
||||
content: Some(MessageContent::Text(text.to_string())),
|
||||
tool_calls: None,
|
||||
tool_call_id: None,
|
||||
name: None,
|
||||
timestamp: None,
|
||||
};
|
||||
let tokens = context::msg_token_count(&self.tokenizer, &msg);
|
||||
self.context.conversation.push(ContextEntry {
|
||||
entry: ConversationEntry::Message(msg),
|
||||
tokens,
|
||||
timestamp: None,
|
||||
});
|
||||
self.context.conversation.push(ContextEntry::new(
|
||||
ConversationEntry::Message(Message {
|
||||
role: Role::Assistant,
|
||||
content: Some(MessageContent::Text(text.to_string())),
|
||||
tool_calls: None,
|
||||
tool_call_id: None,
|
||||
name: None,
|
||||
timestamp: None,
|
||||
}),
|
||||
None,
|
||||
));
|
||||
}
|
||||
|
||||
self.changed.notify_one();
|
||||
|
|
@ -375,12 +362,10 @@ impl Agent {
|
|||
if let Some(i) = self.streaming_index() {
|
||||
let mut stamped = msg.clone();
|
||||
stamped.stamp();
|
||||
let tokens = context::msg_token_count(&self.tokenizer, &stamped);
|
||||
self.context.conversation.set(i, ContextEntry {
|
||||
entry: ConversationEntry::Message(stamped),
|
||||
tokens,
|
||||
timestamp: Some(chrono::Utc::now()),
|
||||
});
|
||||
self.context.conversation.set(i, ContextEntry::new(
|
||||
ConversationEntry::Message(stamped),
|
||||
Some(chrono::Utc::now()),
|
||||
));
|
||||
} else {
|
||||
self.push_message(msg.clone());
|
||||
}
|
||||
|
|
@ -770,16 +755,15 @@ impl Agent {
|
|||
|
||||
for node in journal_nodes[..cutoff_idx].iter().rev() {
|
||||
let msg = Message::user(&node.content);
|
||||
let tokens = context::msg_token_count(&self.tokenizer, &msg);
|
||||
if total_tokens + tokens > journal_budget && !journal_entries.is_empty() {
|
||||
let ce = ContextEntry::new(
|
||||
ConversationEntry::Message(msg),
|
||||
chrono::DateTime::from_timestamp(node.created_at, 0),
|
||||
);
|
||||
if total_tokens + ce.tokens() > journal_budget && !journal_entries.is_empty() {
|
||||
break;
|
||||
}
|
||||
journal_entries.push(ContextEntry {
|
||||
entry: ConversationEntry::Message(msg),
|
||||
tokens,
|
||||
timestamp: chrono::DateTime::from_timestamp(node.created_at, 0),
|
||||
});
|
||||
total_tokens += tokens;
|
||||
total_tokens += ce.tokens();
|
||||
journal_entries.push(ce);
|
||||
}
|
||||
journal_entries.reverse();
|
||||
dbg_log!("[journal] loaded {} entries, {} tokens", journal_entries.len(), total_tokens);
|
||||
|
|
@ -842,12 +826,10 @@ impl Agent {
|
|||
}
|
||||
let mut new_msg = msg.clone();
|
||||
new_msg.content = Some(MessageContent::Text(replacement));
|
||||
let tokens = context::msg_token_count(&self.tokenizer, &new_msg);
|
||||
self.context.conversation.set(i, ContextEntry {
|
||||
entry: ConversationEntry::Message(new_msg),
|
||||
tokens,
|
||||
timestamp: old.timestamp,
|
||||
});
|
||||
self.context.conversation.set(i, ContextEntry::new(
|
||||
ConversationEntry::Message(new_msg),
|
||||
old.timestamp,
|
||||
));
|
||||
}
|
||||
}
|
||||
self.generation += 1;
|
||||
|
|
@ -866,19 +848,12 @@ impl Agent {
|
|||
match crate::config::reload_for_model(&self.app_config, &self.prompt_file) {
|
||||
Ok((system_prompt, personality)) => {
|
||||
self.context.system.clear();
|
||||
self.context.system.push(ContextEntry {
|
||||
entry: ConversationEntry::System(Message::system(&system_prompt)),
|
||||
tokens: context::msg_token_count(&self.tokenizer, &Message::system(&system_prompt)),
|
||||
timestamp: None,
|
||||
});
|
||||
self.context.system.push(ContextEntry::new(
|
||||
ConversationEntry::System(Message::system(&system_prompt)), None));
|
||||
self.context.identity.clear();
|
||||
for (_name, content) in &personality {
|
||||
let msg = Message::user(content);
|
||||
self.context.identity.push(ContextEntry {
|
||||
tokens: context::msg_token_count(&self.tokenizer, &msg),
|
||||
entry: ConversationEntry::Message(msg),
|
||||
timestamp: None,
|
||||
});
|
||||
self.context.identity.push(ContextEntry::new(
|
||||
ConversationEntry::Message(Message::user(content)), None));
|
||||
}
|
||||
}
|
||||
Err(e) => {
|
||||
|
|
@ -932,16 +907,13 @@ impl Agent {
|
|||
let all: Vec<ContextEntry> = entries.into_iter()
|
||||
.filter(|e| !e.is_log() && !e.is_thinking() && e.message().role != Role::System)
|
||||
.map(|e| {
|
||||
let tokens = if e.is_log() { 0 } else {
|
||||
context::msg_token_count(&self.tokenizer, e.api_message())
|
||||
};
|
||||
let timestamp = if e.is_log() { None } else {
|
||||
let timestamp = if e.is_log() || e.is_thinking() { None } else {
|
||||
e.message().timestamp.as_ref().and_then(|ts| {
|
||||
chrono::DateTime::parse_from_rfc3339(ts).ok()
|
||||
.map(|dt| dt.with_timezone(&chrono::Utc))
|
||||
})
|
||||
};
|
||||
ContextEntry { entry: e, tokens, timestamp }
|
||||
ContextEntry::new(e, timestamp)
|
||||
})
|
||||
.collect();
|
||||
let mem_count = all.iter().filter(|e| e.entry.is_memory()).count();
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue