context: tighten timestamp schema; every AstNode has one
Previously NodeLeaf.timestamp and AstNode::Branch.timestamp accepted null or missing via a deserialize_timestamp_or_epoch fallback — legacy entries in conversation.jsonl from before Branch timestamps existed (and from before chrono serialization was wired up) would load with UNIX_EPOCH as a sentinel. Downstream, node_timestamp_ns() returned Option<i64> and callers had to handle None as "old entry, skip." That second filter was silently dropping every candidate in score_finetune_candidates when scoring an older session — the F6 screen showed "0 above threshold" even when max_divergence was orders of magnitude above the threshold, because every entry was failing the None check, not the divergence check. The fix, in three parts: 1. src/bin/fix-timestamps.rs — one-off migration tool that walks a conversation.jsonl, linearly interpolates timestamps for entries stuck at UNIX_EPOCH (using surrounding real timestamps as anchors), propagates to child leaves with per-sibling ns offsets, and bumps any collisions by 1 ns for uniqueness. Ran against the current session's log: 11887 entries, 72289 ns bumps, all unique. 2. context.rs — drop default_timestamp and deserialize_timestamp_or_epoch. NodeLeaf and Branch now require a present non-null timestamp on deserialize. Tests flip from "missing/null → UNIX_EPOCH" to "missing/null → Err." 3. subconscious/learn.rs — node_timestamp_ns now returns i64, not Option<i64>. The matching caller in score_finetune_candidates collapses from a Some/None match to a single trained-set check. mind/log.rs's oldest_timestamp no longer filters UNIX_EPOCH. Every line currently on disk has already been migrated. Going forward, new AstNodes always carry real timestamps (Utc::now() at construction time), so the strict schema is the invariant, not an aspiration. Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
parent
77822992c8
commit
080b4f9084
4 changed files with 210 additions and 71 deletions
180
src/bin/fix-timestamps.rs
Normal file
180
src/bin/fix-timestamps.rs
Normal file
|
|
@ -0,0 +1,180 @@
|
|||
// fix-timestamps: One-off migration for ~/.consciousness/agent-sessions/
|
||||
// conversation.jsonl.
|
||||
//
|
||||
// Before Branch nodes carried their own timestamps, early entries were
|
||||
// serialized with missing/null timestamp fields — they deserialize as
|
||||
// UNIX_EPOCH via the (now-to-be-removed) deserialize_timestamp_or_epoch
|
||||
// fallback. Training needs every entry to have a unique timestamp to
|
||||
// dedup already-trained responses.
|
||||
//
|
||||
// Walks the file, synthesizes timestamps for any entry stuck at
|
||||
// UNIX_EPOCH by linear interpolation between surrounding real
|
||||
// timestamps. For child leaves inside a Branch, derives timestamps
|
||||
// from the parent with a tiny per-child offset.
|
||||
//
|
||||
// SAFETY: reads from argv[1], writes to argv[1].tmp, renames into
|
||||
// place. Keep a .bak copy before running.
|
||||
//
|
||||
// Usage: fix-timestamps <path-to-conversation.jsonl>
|
||||
|
||||
use std::io::{BufRead, BufReader, BufWriter, Write};
|
||||
use std::path::PathBuf;
|
||||
|
||||
use anyhow::{Context, Result};
|
||||
use chrono::{DateTime, Duration, Utc};
|
||||
|
||||
use consciousness::agent::context::AstNode;
|
||||
|
||||
fn main() -> Result<()> {
|
||||
let path: PathBuf = std::env::args().nth(1)
|
||||
.context("usage: fix-timestamps <path>")?.into();
|
||||
|
||||
let f = std::fs::File::open(&path)
|
||||
.with_context(|| format!("open {}", path.display()))?;
|
||||
let reader = BufReader::new(f);
|
||||
|
||||
let mut nodes: Vec<AstNode> = Vec::new();
|
||||
for (i, line) in reader.lines().enumerate() {
|
||||
let line = line?;
|
||||
if line.trim().is_empty() { continue; }
|
||||
let node: AstNode = serde_json::from_str(&line)
|
||||
.with_context(|| format!("line {}: parse", i + 1))?;
|
||||
nodes.push(node);
|
||||
}
|
||||
println!("read {} entries", nodes.len());
|
||||
|
||||
fix_top_level_timestamps(&mut nodes);
|
||||
for node in &mut nodes {
|
||||
propagate_to_children(node);
|
||||
}
|
||||
|
||||
// Ensure uniqueness — real timestamps can collide when two entries
|
||||
// were written in the same ns; synthesized ones can also overlap.
|
||||
// Bump colliding ns by 1 until unique.
|
||||
let mut seen = std::collections::HashSet::new();
|
||||
let mut bumps = 0usize;
|
||||
for (i, node) in nodes.iter_mut().enumerate() {
|
||||
let ts = top_ts(node);
|
||||
assert!(ts > DateTime::<Utc>::UNIX_EPOCH,
|
||||
"entry {}: still UNIX_EPOCH", i);
|
||||
let mut ns = ts.timestamp_nanos_opt().expect("ts in i64 ns range");
|
||||
let mut bumped = false;
|
||||
while !seen.insert(ns) {
|
||||
ns += 1;
|
||||
bumped = true;
|
||||
bumps += 1;
|
||||
}
|
||||
if bumped {
|
||||
set_top_ts(node, DateTime::<Utc>::from_timestamp_nanos(ns));
|
||||
}
|
||||
}
|
||||
println!("all {} timestamps real and unique ({} ns bumps)",
|
||||
nodes.len(), bumps);
|
||||
|
||||
let tmp = path.with_extension("jsonl.tmp");
|
||||
{
|
||||
let f = std::fs::File::create(&tmp)
|
||||
.with_context(|| format!("create {}", tmp.display()))?;
|
||||
let mut w = BufWriter::new(f);
|
||||
for node in &nodes {
|
||||
serde_json::to_writer(&mut w, node)?;
|
||||
w.write_all(b"\n")?;
|
||||
}
|
||||
w.flush()?;
|
||||
}
|
||||
std::fs::rename(&tmp, &path)
|
||||
.with_context(|| format!("rename {} -> {}", tmp.display(), path.display()))?;
|
||||
println!("wrote {}", path.display());
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn top_ts(node: &AstNode) -> DateTime<Utc> {
|
||||
match node {
|
||||
AstNode::Leaf(leaf) => leaf.timestamp(),
|
||||
AstNode::Branch { timestamp, .. } => *timestamp,
|
||||
}
|
||||
}
|
||||
|
||||
fn set_top_ts(node: &mut AstNode, ts: DateTime<Utc>) {
|
||||
match node {
|
||||
AstNode::Leaf(leaf) => *leaf = leaf.clone().with_timestamp(ts),
|
||||
AstNode::Branch { timestamp, .. } => *timestamp = ts,
|
||||
}
|
||||
}
|
||||
|
||||
/// Fill in missing top-level timestamps. Strategy:
|
||||
/// - If two real timestamps bracket a run of missing ones, linearly
|
||||
/// interpolate between them.
|
||||
/// - If missing ones precede the first real one, back-fill using
|
||||
/// (first_real - N·1µs).
|
||||
/// - If missing ones follow the last real one, forward-fill.
|
||||
/// - If no real timestamps exist at all, synthesize from now() going
|
||||
/// backwards.
|
||||
fn fix_top_level_timestamps(nodes: &mut [AstNode]) {
|
||||
let real: Vec<(usize, DateTime<Utc>)> = nodes.iter().enumerate()
|
||||
.filter(|(_, n)| top_ts(n) > DateTime::<Utc>::UNIX_EPOCH)
|
||||
.map(|(i, n)| (i, top_ts(n)))
|
||||
.collect();
|
||||
|
||||
if real.is_empty() {
|
||||
let now = Utc::now();
|
||||
let len = nodes.len();
|
||||
for (i, node) in nodes.iter_mut().enumerate() {
|
||||
let ts = now - Duration::microseconds((len - i) as i64);
|
||||
set_top_ts(node, ts);
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
// Helper: bisect real[] for the nearest real entries around idx.
|
||||
let find_bracket = |idx: usize| -> (Option<(usize, DateTime<Utc>)>,
|
||||
Option<(usize, DateTime<Utc>)>) {
|
||||
let pos = real.binary_search_by_key(&idx, |(i, _)| *i);
|
||||
let (prior_pos, next_pos) = match pos {
|
||||
Ok(p) => (Some(p), Some(p)),
|
||||
Err(p) => (
|
||||
if p == 0 { None } else { Some(p - 1) },
|
||||
if p >= real.len() { None } else { Some(p) },
|
||||
),
|
||||
};
|
||||
(prior_pos.map(|p| real[p]), next_pos.map(|p| real[p]))
|
||||
};
|
||||
|
||||
for i in 0..nodes.len() {
|
||||
if top_ts(&nodes[i]) > DateTime::<Utc>::UNIX_EPOCH {
|
||||
continue;
|
||||
}
|
||||
let (prior, next) = find_bracket(i);
|
||||
let new_ts = match (prior, next) {
|
||||
(Some((pi, pt)), Some((ni, nt))) if pi != ni => {
|
||||
// Linear interpolate.
|
||||
let span_ns = (nt - pt).num_nanoseconds().unwrap_or(0);
|
||||
let offset_ns = span_ns * (i - pi) as i64 / (ni - pi) as i64;
|
||||
pt + Duration::nanoseconds(offset_ns)
|
||||
}
|
||||
(Some((pi, pt)), _) => {
|
||||
pt + Duration::microseconds((i - pi) as i64)
|
||||
}
|
||||
(None, Some((ni, nt))) => {
|
||||
nt - Duration::microseconds((ni - i) as i64)
|
||||
}
|
||||
(None, None) => unreachable!(),
|
||||
};
|
||||
set_top_ts(&mut nodes[i], new_ts);
|
||||
}
|
||||
}
|
||||
|
||||
/// For every Branch, ensure each child Leaf has a timestamp. If missing,
|
||||
/// use parent.ts + child_idx·1ns so siblings stay unique but close.
|
||||
fn propagate_to_children(node: &mut AstNode) {
|
||||
if let AstNode::Branch { timestamp, children, .. } = node {
|
||||
let parent_ts = *timestamp;
|
||||
for (ci, child) in children.iter_mut().enumerate() {
|
||||
if top_ts(child) <= DateTime::<Utc>::UNIX_EPOCH {
|
||||
set_top_ts(child, parent_ts + Duration::nanoseconds(ci as i64));
|
||||
}
|
||||
propagate_to_children(child);
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue