learn: nanosecond timestamps, token ranges for /score
Two related changes to the learn subsystem:

1. AST node timestamps are now non-optional — both Leaf and Branch variants carry a DateTime<Utc>. UNIX_EPOCH means "unset" (old entries deserialized from on-disk conversation logs). Training uses timestamps as unique keys for dedup, so we promote to nanosecond precision: node_timestamp_ns(), TrainData.timestamp_ns, FinetuneCandidate.timestamp_ns, mark_trained(ns).

2. build_token_ids() now also returns token-position ranges of assistant messages. These are passed to vLLM's /score endpoint via the new score_ranges field so only scored-position logprobs are returned — cuts bandwidth/compute when scoring small windows.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
parent
5d9d3ffc5b
commit
2b632d568b
5 changed files with 130 additions and 44 deletions
|
|
@ -55,15 +55,15 @@ impl ConversationLog {
|
|||
}
|
||||
|
||||
pub fn oldest_timestamp(&self) -> Option<chrono::DateTime<chrono::Utc>> {
|
||||
// Read forward from the start to find first timestamp
|
||||
// Read forward from the start to find first non-epoch timestamp
|
||||
let file = File::open(&self.path).ok()?;
|
||||
let mmap = unsafe { Mmap::map(&file).ok()? };
|
||||
// Find first { ... } and parse
|
||||
for line in mmap.split(|&b| b == b'\n') {
|
||||
if line.is_empty() { continue; }
|
||||
if let Ok(node) = serde_json::from_slice::<AstNode>(line) {
|
||||
if let Some(leaf) = node.leaf() {
|
||||
if let Some(ts) = leaf.timestamp() {
|
||||
let ts = leaf.timestamp();
|
||||
if ts != chrono::DateTime::UNIX_EPOCH {
|
||||
return Some(ts);
|
||||
}
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue