forked from kent/consciousness
- KEY_TO_UUID now stores weight (30 bytes: uuid+type+ts+deleted+weight) - UUID_OFFSETS changed to composite key for O(log n) max-offset lookup - Add NODES_BY_TYPE index for efficient type+date range queries - Add for_each_key_weight() to StoreView for index-only iteration - match_seeds uses index-only path when content not needed - Fix transaction consistency in ops (single txn for related updates) - rebuild() now records all uuid→offset mappings for version history - Backwards compatible: old index formats decoded with default weight Co-Authored-By: Proof of Concept <poc@bcachefs.org> Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
584 lines
21 KiB
Rust
584 lines
21 KiB
Rust
// Cap'n Proto serialization and persistence
|
|
//
|
|
// capnp logs are the source of truth; redb provides indexed access.
|
|
// This module contains:
|
|
// - Serialization macros (capnp_enum!, capnp_message!)
|
|
// - Load/replay from capnp logs
|
|
// - Append to capnp logs
|
|
// - fsck (corruption repair)
|
|
|
|
use super::{index, types::*};
|
|
use crate::memory_capnp;
|
|
use super::Store;
|
|
|
|
use anyhow::{anyhow, Context, Result};
|
|
use capnp::message;
|
|
use capnp::serialize;
|
|
|
|
use std::collections::HashMap;
|
|
use std::fs;
|
|
use std::io::{BufReader, Seek};
|
|
use std::path::Path;
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Capnp serialization macros
|
|
//
|
|
// Declarative mapping between Rust types and capnp generated types.
|
|
// Adding a field to the schema means adding it in one place below;
|
|
// both read and write are generated from the same declaration.
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Implements the two-way mapping between a Rust enum and its capnp-generated
/// counterpart.
///
/// Every variant in the list must exist under the same name on both types.
/// The expansion adds two `pub(crate)` inherent methods to the Rust enum:
/// `to_capnp(&self)` and `from_capnp(v)`. The generated matches have no
/// catch-all arm.
macro_rules! capnp_enum {
    ($local:ident, $wire:path, [$($name:ident),+ $(,)?]) => {
        impl $local {
            #[allow(clippy::wrong_self_convention, dead_code)]
            pub(crate) fn to_capnp(&self) -> $wire {
                match self {
                    $(Self::$name => <$wire>::$name,)+
                }
            }

            pub(crate) fn from_capnp(v: $wire) -> Self {
                match v {
                    $(<$wire>::$name => Self::$name,)+
                }
            }
        }
    };
}
|
|
|
|
/// Implements `from_capnp` / `to_capnp` for a struct whose fields mirror a
/// capnp message.
///
/// The caller names the generated reader and builder types, then lists each
/// struct field under the bucket describing how it is encoded:
///   text - capnp Text fields (String in Rust), decoded with `read_text`
///   uuid - capnp Data fields ([u8; 16] in Rust), decoded with `read_uuid`
///   prim - copy types (u32, f32, f64, bool), passed through getters/setters
///   enm  - enums, written `field: Type`, converted through the enum's own
///          to_capnp/from_capnp methods
///   skip - Rust-only fields: never written, `Default::default()` on read
///
/// Accessor names are built with `paste` as `get_<field>` / `set_<field>`.
/// On read, a failing enum getter is reported as the error "bad <field>".
macro_rules! capnp_message {
    (
        $target:ident,
        reader: $rd:ty,
        builder: $bd:ty,
        text: [$($txt:ident),* $(,)?],
        uuid: [$($id:ident),* $(,)?],
        prim: [$($val:ident),* $(,)?],
        enm: [$($tag:ident: $tag_ty:ident),* $(,)?],
        skip: [$($local:ident),* $(,)?] $(,)?
    ) => {
        impl $target {
            pub fn from_capnp(src: $rd) -> Result<Self> {
                paste::paste! {
                    Ok(Self {
                        $($txt: read_text(src.[<get_ $txt>]()),)*
                        $($id: read_uuid(src.[<get_ $id>]()),)*
                        $($val: src.[<get_ $val>](),)*
                        $($tag: $tag_ty::from_capnp(
                            src.[<get_ $tag>]().map_err(|_| anyhow!(concat!("bad ", stringify!($tag))))?
                        ),)*
                        $($local: Default::default(),)*
                    })
                }
            }

            pub fn to_capnp(&self, mut dst: $bd) {
                paste::paste! {
                    $(dst.[<set_ $txt>](&self.$txt);)*
                    $(dst.[<set_ $id>](&self.$id);)*
                    $(dst.[<set_ $val>](self.$val);)*
                    $(dst.[<set_ $tag>](self.$tag.to_capnp());)*
                }
            }
        }
    };
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Capnp helpers
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Read a capnp text field, returning empty string on any error
|
|
fn read_text(result: capnp::Result<capnp::text::Reader>) -> String {
|
|
result.ok()
|
|
.and_then(|t| t.to_str().ok())
|
|
.unwrap_or("")
|
|
.to_string()
|
|
}
|
|
|
|
/// Read a capnp data field as [u8; 16], zero-padded
|
|
fn read_uuid(result: capnp::Result<&[u8]>) -> [u8; 16] {
|
|
let mut out = [0u8; 16];
|
|
if let Ok(data) = result
|
|
&& data.len() >= 16 {
|
|
out.copy_from_slice(&data[..16]);
|
|
}
|
|
out
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Type-to-capnp mappings
|
|
// ---------------------------------------------------------------------------
|
|
|
|
capnp_enum!(NodeType, memory_capnp::NodeType,
|
|
[EpisodicSession, EpisodicDaily, EpisodicWeekly, Semantic, EpisodicMonthly]);
|
|
|
|
capnp_enum!(RelationType, memory_capnp::RelationType,
|
|
[Link, Causal, Auto]);
|
|
|
|
capnp_message!(Node,
|
|
reader: memory_capnp::content_node::Reader<'_>,
|
|
builder: memory_capnp::content_node::Builder<'_>,
|
|
text: [key, content, source_ref, provenance],
|
|
uuid: [uuid],
|
|
prim: [version, timestamp, weight, emotion, deleted,
|
|
retrievals, uses, wrongs, last_replayed,
|
|
spaced_repetition_interval, created_at, last_scored],
|
|
enm: [node_type: NodeType],
|
|
skip: [community_id, clustering_coefficient, degree],
|
|
);
|
|
|
|
capnp_message!(Relation,
|
|
reader: memory_capnp::relation::Reader<'_>,
|
|
builder: memory_capnp::relation::Builder<'_>,
|
|
text: [source_key, target_key, provenance],
|
|
uuid: [uuid, source, target],
|
|
prim: [version, timestamp, strength, deleted],
|
|
enm: [rel_type: RelationType],
|
|
skip: [],
|
|
);
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Migration helpers (legacy provenance enum → string)
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Convert legacy capnp provenance enum to string label.
|
|
fn legacy_provenance_label(p: memory_capnp::Provenance) -> &'static str {
|
|
use memory_capnp::Provenance::*;
|
|
match p {
|
|
Manual => "manual",
|
|
Journal => "journal",
|
|
Agent => "agent",
|
|
Dream => "dream",
|
|
Derived => "derived",
|
|
AgentExperienceMine => "agent:experience-mine",
|
|
AgentKnowledgeObservation => "agent:knowledge-observation",
|
|
AgentKnowledgePattern => "agent:knowledge-pattern",
|
|
AgentKnowledgeConnector => "agent:knowledge-connector",
|
|
AgentKnowledgeChallenger => "agent:knowledge-challenger",
|
|
AgentConsolidate => "agent:consolidate",
|
|
AgentDigest => "agent:digest",
|
|
AgentFactMine => "agent:fact-mine",
|
|
AgentDecay => "agent:decay",
|
|
}
|
|
}
|
|
|
|
impl Node {
|
|
/// Read from capnp with migration: if the new provenance text field
|
|
/// is empty (old record), fall back to the deprecated provenanceOld enum.
|
|
pub fn from_capnp_migrate(r: memory_capnp::content_node::Reader<'_>) -> Result<Self> {
|
|
let mut node = Self::from_capnp(r)?;
|
|
if node.provenance.is_empty()
|
|
&& let Ok(old) = r.get_provenance_old() {
|
|
node.provenance = legacy_provenance_label(old).to_string();
|
|
}
|
|
// Sanitize timestamps: old capnp records have raw offsets instead
|
|
// of unix epoch. Anything past year 2100 (~4102444800) is bogus.
|
|
const MAX_SANE_EPOCH: i64 = 4_102_444_800;
|
|
if node.timestamp > MAX_SANE_EPOCH || node.timestamp < 0 {
|
|
node.timestamp = node.created_at;
|
|
}
|
|
if node.created_at > MAX_SANE_EPOCH || node.created_at < 0 {
|
|
node.created_at = node.timestamp.min(MAX_SANE_EPOCH);
|
|
}
|
|
Ok(node)
|
|
}
|
|
}
|
|
|
|
impl Relation {
|
|
pub fn from_capnp_migrate(r: memory_capnp::relation::Reader<'_>) -> Result<Self> {
|
|
let mut rel = Self::from_capnp(r)?;
|
|
if rel.provenance.is_empty()
|
|
&& let Ok(old) = r.get_provenance_old() {
|
|
rel.provenance = legacy_provenance_label(old).to_string();
|
|
}
|
|
Ok(rel)
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Direct node access
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Read a single node at the given offset in the capnp log.
|
|
/// The offset must point to a valid message containing the node.
|
|
/// Read a node at a given offset. If `target_key` is provided, find that specific
|
|
/// node in the message (handles batch writes where multiple nodes share an offset).
|
|
pub fn read_node_at_offset_for_key(offset: u64, target_key: Option<&str>) -> Result<Node> {
|
|
let path = nodes_path();
|
|
let mut file = fs::File::open(&path)
|
|
.with_context(|| format!("open {}", path.display()))?;
|
|
|
|
use std::io::{Seek, SeekFrom};
|
|
file.seek(SeekFrom::Start(offset))?;
|
|
|
|
let mut reader = BufReader::new(file);
|
|
let msg = serialize::read_message(&mut reader, message::ReaderOptions::new())
|
|
.with_context(|| format!("read message at offset {}", offset))?;
|
|
|
|
let log = msg.get_root::<memory_capnp::node_log::Reader>()
|
|
.with_context(|| "read node log")?;
|
|
let nodes = log.get_nodes()
|
|
.with_context(|| "get nodes")?;
|
|
|
|
if nodes.is_empty() {
|
|
anyhow::bail!("no nodes in message at offset {}", offset);
|
|
}
|
|
|
|
// If target_key specified, find that specific node
|
|
if let Some(key) = target_key {
|
|
for node_reader in nodes.iter() {
|
|
let node = Node::from_capnp_migrate(node_reader)?;
|
|
if node.key == key {
|
|
return Ok(node);
|
|
}
|
|
}
|
|
anyhow::bail!("node '{}' not found in message at offset {}", key, offset);
|
|
}
|
|
|
|
// No target key - return first non-deleted, or first if all deleted
|
|
for node_reader in nodes.iter() {
|
|
let node = Node::from_capnp_migrate(node_reader)?;
|
|
if !node.deleted {
|
|
return Ok(node);
|
|
}
|
|
}
|
|
|
|
Node::from_capnp_migrate(nodes.get(0))
|
|
}
|
|
|
|
/// Read a node at offset (legacy, no key filtering)
|
|
pub fn read_node_at_offset(offset: u64) -> Result<Node> {
|
|
read_node_at_offset_for_key(offset, None)
|
|
}
|
|
|
|
/// Iterate over all nodes in the capnp log, yielding (offset, Node) pairs.
|
|
/// Nodes are yielded in log order (oldest first).
|
|
/// Multiple nodes in the same message share the same offset.
|
|
pub fn iter_nodes() -> Result<Vec<(u64, Node)>> {
|
|
let path = nodes_path();
|
|
if !path.exists() {
|
|
return Ok(Vec::new());
|
|
}
|
|
|
|
let file = fs::File::open(&path)
|
|
.with_context(|| format!("open {}", path.display()))?;
|
|
let mut reader = BufReader::new(file);
|
|
let mut results = Vec::new();
|
|
|
|
loop {
|
|
let offset = reader.stream_position()?;
|
|
let msg = match serialize::read_message(&mut reader, message::ReaderOptions::new()) {
|
|
Ok(m) => m,
|
|
Err(_) => break, // EOF or corrupt
|
|
};
|
|
|
|
let log = match msg.get_root::<memory_capnp::node_log::Reader>() {
|
|
Ok(l) => l,
|
|
Err(_) => continue,
|
|
};
|
|
|
|
let nodes = match log.get_nodes() {
|
|
Ok(n) => n,
|
|
Err(_) => continue,
|
|
};
|
|
|
|
for node_reader in nodes {
|
|
if let Ok(node) = Node::from_capnp_migrate(node_reader) {
|
|
results.push((offset, node));
|
|
}
|
|
}
|
|
}
|
|
|
|
Ok(results)
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Store persistence methods
|
|
// ---------------------------------------------------------------------------
|
|
|
|
impl Store {
|
|
/// Load store by opening redb index and replaying relations.
|
|
pub fn load() -> Result<Store> {
|
|
let nodes_p = nodes_path();
|
|
let rels_p = relations_path();
|
|
|
|
let mut store = Store::default();
|
|
|
|
// Open redb index (rebuilds from capnp if needed)
|
|
let db_p = db_path();
|
|
store.db = Some(index::open_or_rebuild(&db_p)?);
|
|
|
|
// Replay relations
|
|
if rels_p.exists() {
|
|
store.replay_relations(&rels_p)?;
|
|
}
|
|
|
|
// Record log sizes
|
|
use std::sync::atomic::Ordering;
|
|
store.loaded_nodes_size.store(
|
|
fs::metadata(&nodes_p).map(|m| m.len()).unwrap_or(0),
|
|
Ordering::Relaxed
|
|
);
|
|
store.loaded_rels_size.store(
|
|
fs::metadata(&rels_p).map(|m| m.len()).unwrap_or(0),
|
|
Ordering::Relaxed
|
|
);
|
|
|
|
Ok(store)
|
|
}
|
|
|
|
/// Replay relation log, keeping latest version per UUID
|
|
fn replay_relations(&mut self, path: &Path) -> Result<()> {
|
|
let file = fs::File::open(path)
|
|
.with_context(|| format!("open {}", path.display()))?;
|
|
let mut reader = BufReader::new(file);
|
|
|
|
// Collect all, then deduplicate by UUID keeping latest version
|
|
let mut by_uuid: HashMap<[u8; 16], Relation> = HashMap::new();
|
|
|
|
while let Ok(msg) = serialize::read_message(&mut reader, message::ReaderOptions::new()) {
|
|
let log = msg.get_root::<memory_capnp::relation_log::Reader>()
|
|
.with_context(|| format!("read relation log"))?;
|
|
for rel_reader in log.get_relations()
|
|
.with_context(|| format!("get relations"))? {
|
|
let rel = Relation::from_capnp_migrate(rel_reader)?;
|
|
let existing_version = by_uuid.get(&rel.uuid)
|
|
.map(|r| r.version)
|
|
.unwrap_or(0);
|
|
if rel.version >= existing_version {
|
|
by_uuid.insert(rel.uuid, rel);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Index relations directly (single transaction)
|
|
if let Some(db) = &self.db {
|
|
let txn = db.begin_write()?;
|
|
for rel in by_uuid.into_values() {
|
|
if rel.deleted { continue; }
|
|
index::index_relation(&txn, &rel.source, &rel.target, rel.strength, rel.rel_type as u8)?;
|
|
}
|
|
txn.commit()?;
|
|
}
|
|
Ok(())
|
|
}
|
|
|
|
/// Find all duplicate keys: keys with multiple live UUIDs in the log.
|
|
/// Returns a map from key → vec of all live Node versions (one per UUID).
|
|
pub fn find_duplicates(&self) -> Result<HashMap<String, Vec<Node>>> {
|
|
let path = nodes_path();
|
|
if !path.exists() { return Ok(HashMap::new()); }
|
|
|
|
let file = fs::File::open(&path)
|
|
.with_context(|| format!("open {}", path.display()))?;
|
|
let mut reader = BufReader::new(file);
|
|
|
|
// Track latest version of each UUID
|
|
let mut by_uuid: HashMap<[u8; 16], Node> = HashMap::new();
|
|
|
|
while let Ok(msg) = serialize::read_message(&mut reader, message::ReaderOptions::new()) {
|
|
let log = msg.get_root::<memory_capnp::node_log::Reader>()
|
|
.with_context(|| format!("read node log"))?;
|
|
for node_reader in log.get_nodes()
|
|
.with_context(|| format!("get nodes"))? {
|
|
let node = Node::from_capnp_migrate(node_reader)?;
|
|
let dominated = by_uuid.get(&node.uuid)
|
|
.map(|n| node.version >= n.version)
|
|
.unwrap_or(true);
|
|
if dominated {
|
|
by_uuid.insert(node.uuid, node);
|
|
}
|
|
}
|
|
}
|
|
|
|
// Group live (non-deleted) nodes by key
|
|
let mut by_key: HashMap<String, Vec<Node>> = HashMap::new();
|
|
for node in by_uuid.into_values() {
|
|
if !node.deleted {
|
|
by_key.entry(node.key.clone()).or_default().push(node);
|
|
}
|
|
}
|
|
|
|
// Keep only duplicates
|
|
by_key.retain(|_, nodes| nodes.len() > 1);
|
|
Ok(by_key)
|
|
}
|
|
|
|
/// Append nodes to the log file. Returns the offset where the message was written.
|
|
pub fn append_nodes(&self, nodes: &[Node]) -> Result<u64> {
|
|
use std::sync::atomic::Ordering;
|
|
|
|
let mut msg = message::Builder::new_default();
|
|
{
|
|
let log = msg.init_root::<memory_capnp::node_log::Builder>();
|
|
let mut list = log.init_nodes(nodes.len() as u32);
|
|
for (i, node) in nodes.iter().enumerate() {
|
|
node.to_capnp(list.reborrow().get(i as u32));
|
|
}
|
|
}
|
|
let mut buf = Vec::new();
|
|
serialize::write_message(&mut buf, &msg)
|
|
.with_context(|| format!("serialize nodes"))?;
|
|
|
|
// Lock for file append
|
|
let _guard = self.append_lock.lock().unwrap();
|
|
|
|
let path = nodes_path();
|
|
let file = fs::OpenOptions::new()
|
|
.create(true).append(true).open(&path)
|
|
.with_context(|| format!("open {}", path.display()))?;
|
|
|
|
// Get offset before writing
|
|
let offset = file.metadata().map(|m| m.len()).unwrap_or(0);
|
|
|
|
use std::io::Write;
|
|
(&file).write_all(&buf)
|
|
.with_context(|| format!("write nodes"))?;
|
|
|
|
self.loaded_nodes_size.store(
|
|
file.metadata().map(|m| m.len()).unwrap_or(0),
|
|
Ordering::Relaxed
|
|
);
|
|
Ok(offset)
|
|
}
|
|
|
|
/// Append relations to the log file.
|
|
pub fn append_relations(&self, relations: &[Relation]) -> Result<()> {
|
|
use std::sync::atomic::Ordering;
|
|
|
|
let mut msg = message::Builder::new_default();
|
|
{
|
|
let log = msg.init_root::<memory_capnp::relation_log::Builder>();
|
|
let mut list = log.init_relations(relations.len() as u32);
|
|
for (i, rel) in relations.iter().enumerate() {
|
|
rel.to_capnp(list.reborrow().get(i as u32));
|
|
}
|
|
}
|
|
let mut buf = Vec::new();
|
|
serialize::write_message(&mut buf, &msg)
|
|
.with_context(|| format!("serialize relations"))?;
|
|
|
|
// Lock for file append
|
|
let _guard = self.append_lock.lock().unwrap();
|
|
|
|
let path = relations_path();
|
|
let file = fs::OpenOptions::new()
|
|
.create(true).append(true).open(&path)
|
|
.with_context(|| format!("open {}", path.display()))?;
|
|
use std::io::Write;
|
|
(&file).write_all(&buf)
|
|
.with_context(|| format!("write relations"))?;
|
|
|
|
self.loaded_rels_size.store(
|
|
file.metadata().map(|m| m.len()).unwrap_or(0),
|
|
Ordering::Relaxed
|
|
);
|
|
Ok(())
|
|
}
|
|
|
|
/// Placeholder - indices will be updated on write with redb.
|
|
pub fn save(&self) -> Result<()> {
|
|
Ok(())
|
|
}
|
|
}
|
|
|
|
/// Check and repair corrupt capnp log files.
|
|
///
|
|
/// Reads each message sequentially, tracking file position. On the first
|
|
/// corrupt message, truncates the file to the last good position. Also
|
|
/// removes stale caches so the next load replays from the repaired log.
|
|
pub fn fsck() -> Result<()> {
|
|
let mut any_corrupt = false;
|
|
|
|
for (path, kind) in [
|
|
(nodes_path(), "node"),
|
|
(relations_path(), "relation"),
|
|
] {
|
|
if !path.exists() { continue; }
|
|
|
|
let file = fs::File::open(&path)
|
|
.with_context(|| format!("open {}", path.display()))?;
|
|
let file_len = file.metadata()
|
|
.with_context(|| format!("stat {}", path.display()))?.len();
|
|
let mut reader = BufReader::new(file);
|
|
|
|
let mut good_messages = 0u64;
|
|
let mut last_good_pos = 0u64;
|
|
|
|
loop {
|
|
let pos = reader.stream_position()
|
|
.with_context(|| format!("tell {}", path.display()))?;
|
|
|
|
let msg = match serialize::read_message(&mut reader, message::ReaderOptions::new()) {
|
|
Ok(m) => m,
|
|
Err(_) => {
|
|
// read_message fails at EOF (normal) or on corrupt framing
|
|
if pos < file_len {
|
|
// Not at EOF — corrupt framing
|
|
eprintln!("{}: corrupt message at offset {}, truncating", kind, pos);
|
|
any_corrupt = true;
|
|
drop(reader);
|
|
let file = fs::OpenOptions::new().write(true).open(&path)
|
|
.with_context(|| format!("open for truncate"))?;
|
|
file.set_len(pos)
|
|
.with_context(|| format!("truncate {}", path.display()))?;
|
|
eprintln!("{}: truncated from {} to {} bytes ({} good messages)",
|
|
kind, file_len, pos, good_messages);
|
|
}
|
|
break;
|
|
}
|
|
};
|
|
|
|
// Validate the message content too
|
|
let valid = if kind == "node" {
|
|
msg.get_root::<memory_capnp::node_log::Reader>()
|
|
.and_then(|l| l.get_nodes().map(|_| ()))
|
|
.is_ok()
|
|
} else {
|
|
msg.get_root::<memory_capnp::relation_log::Reader>()
|
|
.and_then(|l| l.get_relations().map(|_| ()))
|
|
.is_ok()
|
|
};
|
|
|
|
if valid {
|
|
good_messages += 1;
|
|
last_good_pos = reader.stream_position()
|
|
.with_context(|| format!("tell {}", path.display()))?;
|
|
} else {
|
|
eprintln!("{}: corrupt message content at offset {}, truncating to {}",
|
|
kind, pos, last_good_pos);
|
|
any_corrupt = true;
|
|
drop(reader);
|
|
let file = fs::OpenOptions::new().write(true).open(&path)
|
|
.with_context(|| format!("open for truncate"))?;
|
|
file.set_len(last_good_pos)
|
|
.with_context(|| format!("truncate {}", path.display()))?;
|
|
eprintln!("{}: truncated from {} to {} bytes ({} good messages)",
|
|
kind, file_len, last_good_pos, good_messages);
|
|
break;
|
|
}
|
|
}
|
|
|
|
if !any_corrupt {
|
|
eprintln!("{}: {} messages, all clean", kind, good_messages);
|
|
}
|
|
}
|
|
|
|
if any_corrupt {
|
|
eprintln!("repair complete — run `poc-memory status` to verify");
|
|
} else {
|
|
eprintln!("store is clean");
|
|
}
|
|
|
|
Ok(())
|
|
}
|
|
|