fsck: add store integrity check and repair command
Reads each capnp log message sequentially, validates framing and content. On first corrupt message, truncates to last good position and removes stale caches so next load replays from repaired log. Wired up as `poc-memory fsck`.
This commit is contained in:
parent
d12c28ebcd
commit
63910e987c
3 changed files with 153 additions and 15 deletions
|
|
@ -29,6 +29,7 @@ mod ops;
|
|||
pub use types::*;
|
||||
pub use parse::{MemoryUnit, parse_units};
|
||||
pub use view::{StoreView, AnyView};
|
||||
pub use persist::fsck;
|
||||
|
||||
use crate::graph::{self, Graph};
|
||||
|
||||
|
|
|
|||
|
|
@ -16,7 +16,7 @@ use capnp::serialize;
|
|||
|
||||
use std::collections::HashMap;
|
||||
use std::fs;
|
||||
use std::io::{BufReader, BufWriter, Write as IoWrite};
|
||||
use std::io::{BufReader, BufWriter, Seek, Write as IoWrite};
|
||||
use std::path::Path;
|
||||
|
||||
impl Store {
|
||||
|
|
@ -339,3 +339,102 @@ impl Store {
|
|||
Ok(Some(store))
|
||||
}
|
||||
}
|
||||
|
||||
/// Check and repair corrupt capnp log files.
|
||||
///
|
||||
/// Reads each message sequentially, tracking file position. On the first
|
||||
/// corrupt message, truncates the file to the last good position. Also
|
||||
/// removes stale caches so the next load replays from the repaired log.
|
||||
pub fn fsck() -> Result<(), String> {
|
||||
let mut any_corrupt = false;
|
||||
|
||||
for (path, kind) in [
|
||||
(nodes_path(), "node"),
|
||||
(relations_path(), "relation"),
|
||||
] {
|
||||
if !path.exists() { continue; }
|
||||
|
||||
let file = fs::File::open(&path)
|
||||
.map_err(|e| format!("open {}: {}", path.display(), e))?;
|
||||
let file_len = file.metadata()
|
||||
.map_err(|e| format!("stat {}: {}", path.display(), e))?.len();
|
||||
let mut reader = BufReader::new(file);
|
||||
|
||||
let mut good_messages = 0u64;
|
||||
let mut last_good_pos = 0u64;
|
||||
|
||||
loop {
|
||||
let pos = reader.stream_position()
|
||||
.map_err(|e| format!("tell {}: {}", path.display(), e))?;
|
||||
|
||||
let msg = match serialize::read_message(&mut reader, message::ReaderOptions::new()) {
|
||||
Ok(m) => m,
|
||||
Err(_) => {
|
||||
// read_message fails at EOF (normal) or on corrupt framing
|
||||
if pos < file_len {
|
||||
// Not at EOF — corrupt framing
|
||||
eprintln!("{}: corrupt message at offset {}, truncating", kind, pos);
|
||||
any_corrupt = true;
|
||||
drop(reader);
|
||||
let file = fs::OpenOptions::new().write(true).open(&path)
|
||||
.map_err(|e| format!("open for truncate: {}", e))?;
|
||||
file.set_len(pos)
|
||||
.map_err(|e| format!("truncate {}: {}", path.display(), e))?;
|
||||
eprintln!("{}: truncated from {} to {} bytes ({} good messages)",
|
||||
kind, file_len, pos, good_messages);
|
||||
}
|
||||
break;
|
||||
}
|
||||
};
|
||||
|
||||
// Validate the message content too
|
||||
let valid = if kind == "node" {
|
||||
msg.get_root::<memory_capnp::node_log::Reader>()
|
||||
.and_then(|l| l.get_nodes().map(|_| ()))
|
||||
.is_ok()
|
||||
} else {
|
||||
msg.get_root::<memory_capnp::relation_log::Reader>()
|
||||
.and_then(|l| l.get_relations().map(|_| ()))
|
||||
.is_ok()
|
||||
};
|
||||
|
||||
if valid {
|
||||
good_messages += 1;
|
||||
last_good_pos = reader.stream_position()
|
||||
.map_err(|e| format!("tell {}: {}", path.display(), e))?;
|
||||
} else {
|
||||
eprintln!("{}: corrupt message content at offset {}, truncating to {}",
|
||||
kind, pos, last_good_pos);
|
||||
any_corrupt = true;
|
||||
drop(reader);
|
||||
let file = fs::OpenOptions::new().write(true).open(&path)
|
||||
.map_err(|e| format!("open for truncate: {}", e))?;
|
||||
file.set_len(last_good_pos)
|
||||
.map_err(|e| format!("truncate {}: {}", path.display(), e))?;
|
||||
eprintln!("{}: truncated from {} to {} bytes ({} good messages)",
|
||||
kind, file_len, last_good_pos, good_messages);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if !any_corrupt {
|
||||
eprintln!("{}: {} messages, all clean", kind, good_messages);
|
||||
}
|
||||
}
|
||||
|
||||
if any_corrupt {
|
||||
// Nuke caches so next load replays from the repaired logs
|
||||
for p in [state_path(), snapshot_path()] {
|
||||
if p.exists() {
|
||||
fs::remove_file(&p)
|
||||
.map_err(|e| format!("remove {}: {}", p.display(), e))?;
|
||||
eprintln!("removed stale cache: {}", p.display());
|
||||
}
|
||||
}
|
||||
eprintln!("repair complete — run `poc-memory status` to verify");
|
||||
} else {
|
||||
eprintln!("store is clean");
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue