From c7e7cfb7af74a9d465dd88d46637f34e49468090 Mon Sep 17 00:00:00 2001 From: ProofOfConcept Date: Sun, 1 Mar 2026 05:46:35 -0500 Subject: [PATCH] store: always replay from capnp log, remove stale cache optimization The mtime-based cache (state.bin) was causing data loss under concurrent writes. Multiple processes (dream loop journal writes, link audit agents, journal enrichment agents) would each: 1. Load state.bin (stale - missing other processes' recent writes) 2. Make their own changes 3. Save state.bin, overwriting entries from other processes This caused 48 nodes to be lost from tonight's dream session - entries were in the append-only capnp log but invisible to the index because a later writer's state.bin overwrote the version that contained them. Fix: always replay from the capnp log (the source of truth). Cost: ~10ms extra at 2K nodes (36ms vs 26ms). The cache saved 10ms but introduced a correctness bug that lost real data. The append-only log design was correct - the cache layer violated its invariant by allowing stale reads to silently discard writes. 
--- src/capnp_store.rs | 48 ++++++++---------------------------------------- 1 file changed, 8 insertions(+), 40 deletions(-) diff --git a/src/capnp_store.rs b/src/capnp_store.rs index 8578f10..2c503fa 100644 --- a/src/capnp_store.rs +++ b/src/capnp_store.rs @@ -283,47 +283,15 @@ pub struct Store { impl Store { - /// Load store: try state.json cache first, rebuild from capnp logs if stale + /// Load store: always rebuild by replaying the capnp logs (source of truth) pub fn load() -> Result { - let state = state_path(); let nodes_p = nodes_path(); let rels_p = relations_path(); - // Check if cache is up to date - let cache_fresh = state.exists() && { - let cache_mtime = fs::metadata(&state).ok() - .and_then(|m| m.modified().ok()) - .unwrap_or(UNIX_EPOCH); - let nodes_mtime = fs::metadata(&nodes_p).ok() - .and_then(|m| m.modified().ok()) - .unwrap_or(UNIX_EPOCH); - let rels_mtime = fs::metadata(&rels_p).ok() - .and_then(|m| m.modified().ok()) - .unwrap_or(UNIX_EPOCH); - cache_mtime >= nodes_mtime && cache_mtime >= rels_mtime - }; - - if cache_fresh { - let data = fs::read(&state) - .map_err(|e| format!("read state.bin: {}", e))?; - let mut store: Store = bincode::deserialize(&data) - .map_err(|e| format!("parse state.bin: {}", e))?; - store.rebuild_uuid_index(); - return Ok(store); - } - - // Try legacy JSON cache for migration - let json_state = state_json_path(); - if json_state.exists() { - let data = fs::read_to_string(&json_state) - .map_err(|e| format!("read state.json: {}", e))?; - if let Ok(mut store) = serde_json::from_str::<Store>(&data) { - store.rebuild_uuid_index(); - // Migrate to bincode - store.save()?; - return Ok(store); - } - } - - // Rebuild from capnp logs + // Always rebuild from capnp logs (source of truth). + // The mtime-based cache was causing data loss: concurrent + // writers (dream loop, link audit, journal enrichment) would + // load stale state.bin, make changes, and save — overwriting + // entries from other processes. Replaying from the append-only + // log costs ~10ms extra at 2K nodes and is always correct. 
let mut store = Store::default(); if nodes_p.exists() { @@ -339,7 +307,7 @@ impl Store { store.nodes.contains_key(&r.target_key) ); - // Save cache + // Save cache (still useful for tools that read state.bin directly) store.save()?; Ok(store) }