From c7e7cfb7af74a9d465dd88d46637f34e49468090 Mon Sep 17 00:00:00 2001 From: ProofOfConcept Date: Sun, 1 Mar 2026 05:46:35 -0500 Subject: [PATCH] store: always replay from capnp log, remove stale cache optimization The mtime-based cache (state.bin) was causing data loss under concurrent writes. Multiple processes (dream loop journal writes, link audit agents, journal enrichment agents) would each: 1. Load state.bin (stale - missing other processes' recent writes) 2. Make their own changes 3. Save state.bin, overwriting entries from other processes This caused 48 nodes to be lost from tonight's dream session - entries were in the append-only capnp log but invisible to the index because a later writer's state.bin overwrote the version that contained them. Fix: always replay from the capnp log (the source of truth). Cost: ~10ms extra at 2K nodes (36ms vs 26ms). The cache saved 10ms but introduced a correctness bug that lost real data. The append-only log design was correct - the cache layer violated its invariant by allowing stale reads to silently discard writes. 
--- src/capnp_store.rs | 48 ++++++++---------------------------------------- 1 file changed, 8 insertions(+), 40 deletions(-) diff --git a/src/capnp_store.rs b/src/capnp_store.rs index 8578f10..2c503fa 100644 --- a/src/capnp_store.rs +++ b/src/capnp_store.rs @@ -283,47 +283,15 @@ pub struct Store { impl Store { - /// Load store: try state.json cache first, rebuild from capnp logs if stale + /// Load store: always rebuild by replaying the capnp logs (source of truth) pub fn load() -> Result { - let state = state_path(); let nodes_p = nodes_path(); let rels_p = relations_path(); - // Check if cache is up to date - let cache_fresh = state.exists() && { - let cache_mtime = fs::metadata(&state).ok() - .and_then(|m| m.modified().ok()) - .unwrap_or(UNIX_EPOCH); - let nodes_mtime = fs::metadata(&nodes_p).ok() - .and_then(|m| m.modified().ok()) - .unwrap_or(UNIX_EPOCH); - let rels_mtime = fs::metadata(&rels_p).ok() - .and_then(|m| m.modified().ok()) - .unwrap_or(UNIX_EPOCH); - cache_mtime >= nodes_mtime && cache_mtime >= rels_mtime - }; - - if cache_fresh { - let data = fs::read(&state) - .map_err(|e| format!("read state.bin: {}", e))?; - let mut store: Store = bincode::deserialize(&data) - .map_err(|e| format!("parse state.bin: {}", e))?; - store.rebuild_uuid_index(); - return Ok(store); - } - - // Try legacy JSON cache for migration - let json_state = state_json_path(); - if json_state.exists() { - let data = fs::read_to_string(&json_state) - .map_err(|e| format!("read state.json: {}", e))?; - if let Ok(mut store) = serde_json::from_str::<Store>(&data) { - store.rebuild_uuid_index(); - // Migrate to bincode - store.save()?; - return Ok(store); - } - } - - // Rebuild from capnp logs + // Always rebuild from capnp logs (source of truth). + // The mtime-based cache was causing data loss: concurrent + // writers (dream loop, link audit, journal enrichment) would + // load stale state.bin, make changes, and save — overwriting + // entries from other processes. Replaying from the append-only + // log costs ~10ms extra at 2K nodes and is always correct. 
let mut store = Store::default(); if nodes_p.exists() { @@ -339,7 +307,7 @@ impl Store { store.nodes.contains_key(&r.target_key) ); - // Save cache + // Save cache (still useful for tools that read state.bin directly) store.save()?; Ok(store) }