From 080b4f90844a0ced0d1b1b90fea3f22efc2eebda Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 16 Apr 2026 12:35:16 -0400 Subject: [PATCH] context: tighten timestamp schema; every AstNode has one MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously NodeLeaf.timestamp and AstNode::Branch.timestamp accepted null or missing via a deserialize_timestamp_or_epoch fallback — legacy entries in conversation.jsonl from before Branch timestamps existed (and from before chrono serialization was wired up) would load with UNIX_EPOCH as a sentinel. Downstream, node_timestamp_ns() returned Option<i64> and callers had to handle None as "old entry, skip." That second filter was silently dropping every candidate in score_finetune_candidates when scoring an older session — the F6 screen showed "0 above threshold" even when max_divergence was orders of magnitude above the threshold, because every entry was failing the None check, not the divergence check. The fix, in three parts: 1. src/bin/fix-timestamps.rs — one-off migration tool that walks a conversation.jsonl, linearly interpolates timestamps for entries stuck at UNIX_EPOCH (using surrounding real timestamps as anchors), propagates to child leaves with per-sibling ns offsets, and bumps any collisions by 1 ns for uniqueness. Ran against the current session's log: 11887 entries, 72289 ns bumps, all unique. 2. context.rs — drop default_timestamp and deserialize_timestamp_or_epoch. NodeLeaf and Branch now require a present non-null timestamp on deserialize. Tests flip from "missing/null → UNIX_EPOCH" to "missing/null → Err." 3. subconscious/learn.rs — node_timestamp_ns now returns i64, not Option<i64>. The matching caller in score_finetune_candidates collapses from a Some/None match to a single trained-set check. mind/log.rs's oldest_timestamp no longer filters UNIX_EPOCH. Every line currently on disk has already been migrated. 
Going forward, new AstNodes always carry real timestamps (Utc::now() at construction time), so the strict schema is the invariant, not an aspiration. Co-Authored-By: Proof of Concept --- src/agent/context.rs | 67 ++++---------- src/bin/fix-timestamps.rs | 180 ++++++++++++++++++++++++++++++++++++++ src/mind/log.rs | 6 +- src/subconscious/learn.rs | 28 +++--- 4 files changed, 210 insertions(+), 71 deletions(-) create mode 100644 src/bin/fix-timestamps.rs diff --git a/src/agent/context.rs b/src/agent/context.rs index 5a90ad0..cc8044a 100644 --- a/src/agent/context.rs +++ b/src/agent/context.rs @@ -85,19 +85,6 @@ pub enum NodeBody { Log(String), } -fn default_timestamp() -> DateTime<Utc> { - DateTime::UNIX_EPOCH -} - -/// Deserialize timestamp, treating both missing and null as UNIX_EPOCH. -fn deserialize_timestamp_or_epoch<'de, D>(deserializer: D) -> Result<DateTime<Utc>, D::Error> -where - D: serde::Deserializer<'de>, -{ - let opt: Option<DateTime<Utc>> = Option::deserialize(deserializer)?; - Ok(opt.unwrap_or(DateTime::UNIX_EPOCH)) -} - /// A leaf node: typed content with cached token IDs. /// Token IDs are not serialized — they're recomputed on deserialization. #[derive(Debug, Clone, Serialize)] @@ -113,7 +100,6 @@ impl<'de> Deserialize<'de> for NodeLeaf { #[derive(Deserialize)] struct Raw { body: NodeBody, - #[serde(default = "default_timestamp", deserialize_with = "deserialize_timestamp_or_epoch")] timestamp: DateTime<Utc>, } let raw = Raw::deserialize(deserializer)?; @@ -133,7 +119,6 @@ pub enum AstNode { Branch { role: Role, children: Vec<AstNode>, - #[serde(default = "default_timestamp", deserialize_with = "deserialize_timestamp_or_epoch")] timestamp: DateTime<Utc>, /// Per-response memory attribution from full scoring matrix. /// Maps memory key → divergence score for this response. 
@@ -1363,45 +1348,31 @@ mod tests { // -- Timestamp deserialization tests ------------------------------------------ #[test] - fn test_timestamp_null_becomes_epoch() { - // Old conversation.jsonl entries have "timestamp":null - // serde(default) only handles missing fields, not explicit nulls. - // We need to verify our deserialize handles this correctly. + fn test_timestamp_null_rejected() { + // Missing/null timestamps used to be accepted via a lenient + // deserialize fallback. Post-migration the schema is strict. let json = r#"{"Leaf":{"body":{"Content":"hello"},"timestamp":null}}"#; - let node: AstNode = serde_json::from_str(json).unwrap(); - let leaf = node.leaf().unwrap(); - assert_eq!(leaf.timestamp(), DateTime::<Utc>::UNIX_EPOCH); + assert!(serde_json::from_str::<AstNode>(json).is_err()); } #[test] - fn test_timestamp_missing_becomes_epoch() { + fn test_timestamp_missing_rejected() { let json = r#"{"Leaf":{"body":{"Content":"hello"}}}"#; + assert!(serde_json::from_str::<AstNode>(json).is_err()); + } + + #[test] + fn test_branch_timestamp_missing_rejected() { + let json = r#"{"Branch":{"role":"User","children":[]}}"#; + assert!(serde_json::from_str::<AstNode>(json).is_err()); + } + + #[test] + fn test_timestamp_present_accepted() { + let json = r#"{"Leaf":{"body":{"Content":"hi"},"timestamp":"2026-04-16T12:00:00Z"}}"#; let node: AstNode = serde_json::from_str(json).unwrap(); let leaf = node.leaf().unwrap(); - assert_eq!(leaf.timestamp(), DateTime::<Utc>::UNIX_EPOCH); - } - - #[test] - fn test_branch_timestamp_null_becomes_epoch() { - let json = r#"{"Branch":{"role":"User","children":[{"Leaf":{"body":{"Content":"hi"}}}],"timestamp":null}}"#; - let node: AstNode = serde_json::from_str(json).unwrap(); - match node { - AstNode::Branch { timestamp, .. 
} => { - assert_eq!(timestamp, DateTime::<Utc>::UNIX_EPOCH); - } - _ => panic!("expected Branch"), - } - } - - #[test] - fn test_branch_timestamp_missing_becomes_epoch() { - let json = r#"{"Branch":{"role":"User","children":[{"Leaf":{"body":{"Content":"hi"}}}]}}"#; - let node: AstNode = serde_json::from_str(json).unwrap(); - match node { - AstNode::Branch { timestamp, .. } => { - assert_eq!(timestamp, DateTime::<Utc>::UNIX_EPOCH); - } - _ => panic!("expected Branch"), - } + assert_eq!(leaf.timestamp().to_rfc3339(), + "2026-04-16T12:00:00+00:00"); } } diff --git a/src/bin/fix-timestamps.rs b/src/bin/fix-timestamps.rs new file mode 100644 index 0000000..31a8788 --- /dev/null +++ b/src/bin/fix-timestamps.rs @@ -0,0 +1,180 @@ +// fix-timestamps: One-off migration for ~/.consciousness/agent-sessions/ +// conversation.jsonl. +// +// Before Branch nodes carried their own timestamps, early entries were +// serialized with missing/null timestamp fields — they deserialize as +// UNIX_EPOCH via the (now-to-be-removed) deserialize_timestamp_or_epoch +// fallback. Training needs every entry to have a unique timestamp to +// dedup already-trained responses. +// +// Walks the file, synthesizes timestamps for any entry stuck at +// UNIX_EPOCH by linear interpolation between surrounding real +// timestamps. For child leaves inside a Branch, derives timestamps +// from the parent with a tiny per-child offset. +// +// SAFETY: reads from argv[1], writes to argv[1].tmp, renames into +// place. Keep a .bak copy before running. 
+// +// Usage: fix-timestamps + +use std::io::{BufRead, BufReader, BufWriter, Write}; +use std::path::PathBuf; + +use anyhow::{Context, Result}; +use chrono::{DateTime, Duration, Utc}; + +use consciousness::agent::context::AstNode; + +fn main() -> Result<()> { + let path: PathBuf = std::env::args().nth(1) + .context("usage: fix-timestamps ")?.into(); + + let f = std::fs::File::open(&path) + .with_context(|| format!("open {}", path.display()))?; + let reader = BufReader::new(f); + + let mut nodes: Vec<AstNode> = Vec::new(); + for (i, line) in reader.lines().enumerate() { + let line = line?; + if line.trim().is_empty() { continue; } + let node: AstNode = serde_json::from_str(&line) + .with_context(|| format!("line {}: parse", i + 1))?; + nodes.push(node); + } + println!("read {} entries", nodes.len()); + + fix_top_level_timestamps(&mut nodes); + for node in &mut nodes { + propagate_to_children(node); + } + + // Ensure uniqueness — real timestamps can collide when two entries + // were written in the same ns; synthesized ones can also overlap. + // Bump colliding ns by 1 until unique. 
+ let mut seen = std::collections::HashSet::new(); + let mut bumps = 0usize; + for (i, node) in nodes.iter_mut().enumerate() { + let ts = top_ts(node); + assert!(ts > DateTime::<Utc>::UNIX_EPOCH, + "entry {}: still UNIX_EPOCH", i); + let mut ns = ts.timestamp_nanos_opt().expect("ts in i64 ns range"); + let mut bumped = false; + while !seen.insert(ns) { + ns += 1; + bumped = true; + bumps += 1; + } + if bumped { + set_top_ts(node, DateTime::<Utc>::from_timestamp_nanos(ns)); + } + } + println!("all {} timestamps real and unique ({} ns bumps)", + nodes.len(), bumps); + + let tmp = path.with_extension("jsonl.tmp"); + { + let f = std::fs::File::create(&tmp) + .with_context(|| format!("create {}", tmp.display()))?; + let mut w = BufWriter::new(f); + for node in &nodes { + serde_json::to_writer(&mut w, node)?; + w.write_all(b"\n")?; + } + w.flush()?; + } + std::fs::rename(&tmp, &path) + .with_context(|| format!("rename {} -> {}", tmp.display(), path.display()))?; + println!("wrote {}", path.display()); + + Ok(()) +} + +fn top_ts(node: &AstNode) -> DateTime<Utc> { + match node { + AstNode::Leaf(leaf) => leaf.timestamp(), + AstNode::Branch { timestamp, .. } => *timestamp, + } +} + +fn set_top_ts(node: &mut AstNode, ts: DateTime<Utc>) { + match node { + AstNode::Leaf(leaf) => *leaf = leaf.clone().with_timestamp(ts), + AstNode::Branch { timestamp, .. } => *timestamp = ts, + } +} + +/// Fill in missing top-level timestamps. Strategy: +/// - If two real timestamps bracket a run of missing ones, linearly +/// interpolate between them. +/// - If missing ones precede the first real one, back-fill using +/// (first_real - N·1µs). +/// - If missing ones follow the last real one, forward-fill. +/// - If no real timestamps exist at all, synthesize from now() going +/// backwards. 
+fn fix_top_level_timestamps(nodes: &mut [AstNode]) { + let real: Vec<(usize, DateTime<Utc>)> = nodes.iter().enumerate() + .filter(|(_, n)| top_ts(n) > DateTime::<Utc>::UNIX_EPOCH) + .map(|(i, n)| (i, top_ts(n))) + .collect(); + + if real.is_empty() { + let now = Utc::now(); + let len = nodes.len(); + for (i, node) in nodes.iter_mut().enumerate() { + let ts = now - Duration::microseconds((len - i) as i64); + set_top_ts(node, ts); + } + return; + } + + // Helper: bisect real[] for the nearest real entries around idx. + let find_bracket = |idx: usize| -> (Option<(usize, DateTime<Utc>)>, + Option<(usize, DateTime<Utc>)>) { + let pos = real.binary_search_by_key(&idx, |(i, _)| *i); + let (prior_pos, next_pos) = match pos { + Ok(p) => (Some(p), Some(p)), + Err(p) => ( + if p == 0 { None } else { Some(p - 1) }, + if p >= real.len() { None } else { Some(p) }, + ), + }; + (prior_pos.map(|p| real[p]), next_pos.map(|p| real[p])) + }; + + for i in 0..nodes.len() { + if top_ts(&nodes[i]) > DateTime::<Utc>::UNIX_EPOCH { + continue; + } + let (prior, next) = find_bracket(i); + let new_ts = match (prior, next) { + (Some((pi, pt)), Some((ni, nt))) if pi != ni => { + // Linear interpolate. + let span_ns = (nt - pt).num_nanoseconds().unwrap_or(0); + let offset_ns = span_ns * (i - pi) as i64 / (ni - pi) as i64; + pt + Duration::nanoseconds(offset_ns) + } + (Some((pi, pt)), _) => { + pt + Duration::microseconds((i - pi) as i64) + } + (None, Some((ni, nt))) => { + nt - Duration::microseconds((ni - i) as i64) + } + (None, None) => unreachable!(), + }; + set_top_ts(&mut nodes[i], new_ts); + } +} + +/// For every Branch, ensure each child Leaf has a timestamp. If missing, +/// use parent.ts + child_idx·1ns so siblings stay unique but close. +fn propagate_to_children(node: &mut AstNode) { + if let AstNode::Branch { timestamp, children, .. 
} = node { + let parent_ts = *timestamp; + for (ci, child) in children.iter_mut().enumerate() { + if top_ts(child) <= DateTime::<Utc>::UNIX_EPOCH { + set_top_ts(child, parent_ts + Duration::nanoseconds(ci as i64)); + } + propagate_to_children(child); + } + } +} diff --git a/src/mind/log.rs b/src/mind/log.rs index 03e349c..7ac0d79 100644 --- a/src/mind/log.rs +++ b/src/mind/log.rs @@ -55,17 +55,13 @@ impl ConversationLog { } pub fn oldest_timestamp(&self) -> Option<chrono::DateTime<chrono::Utc>> { - // Read forward from the start to find first non-epoch timestamp let file = File::open(&self.path).ok()?; let mmap = unsafe { Mmap::map(&file).ok()? }; for line in mmap.split(|&b| b == b'\n') { if line.is_empty() { continue; } if let Ok(node) = serde_json::from_slice::<AstNode>(line) { if let Some(leaf) = node.leaf() { - let ts = leaf.timestamp(); - if ts != chrono::DateTime::UNIX_EPOCH { - return Some(ts); - } + return Some(leaf.timestamp()); } } } diff --git a/src/subconscious/learn.rs b/src/subconscious/learn.rs index 9b65f2a..3c12efc 100644 --- a/src/subconscious/learn.rs +++ b/src/subconscious/learn.rs @@ -516,16 +516,11 @@ pub async fn score_finetune_candidates( let node = &entries[entry_idx]; - // Get timestamp and skip if already trained - let timestamp_ns = match node_timestamp_ns(node) { - Some(ts) => { - if trained.contains(&ts) { - continue; // Already trained, skip - } - ts - } - None => continue, // No timestamp, skip - }; + // Skip if already trained on. + let timestamp_ns = node_timestamp_ns(node); + if trained.contains(&timestamp_ns) { + continue; + } // Extract response text let response_text = match node { @@ -661,18 +656,15 @@ pub fn mark_trained(timestamp_ns: i64) { } /// Get timestamp in nanoseconds from an AstNode. -/// Returns None for entries with default UNIX_EPOCH timestamp (old data) -/// or timestamps outside the representable nano range (pre-1677 or post-2262). 
-pub fn node_timestamp_ns(node: &AstNode) -> Option<i64> { +/// i64-ns representation covers 1677..2262 via chrono; timestamps +/// outside that window would be bugs we'd want to surface, hence panic. +pub fn node_timestamp_ns(node: &AstNode) -> i64 { let ts = match node { AstNode::Leaf(leaf) => leaf.timestamp(), AstNode::Branch { timestamp, .. } => *timestamp, }; - if ts == chrono::DateTime::UNIX_EPOCH { - None // Old entry without real timestamp - } else { - ts.timestamp_nanos_opt() - } + ts.timestamp_nanos_opt() + .expect("timestamp outside i64-ns representable range (1677..2262)") } // ── Training API ────────────────────────────────────────────────