consciousness/src/hippocampus/transcript.rs

341 lines
11 KiB
Rust
Raw Normal View History

// Transcript JSONL parsing utilities.
//
// Provides mmap-based backward scanning of Claude Code transcript files
// and compaction detection. Used by memory-search (hook mode) and
// parse-claude-conversation (debug tool).
use memchr::memrchr3;
use memmap2::Mmap;
use serde_json::Value;
use std::fs;
use std::path::Path;
/// Scan backwards through mmap'd bytes, yielding byte slices of complete
/// top-level JSON objects (outermost { to matching }).
///
/// Uses memrchr3 (SIMD) to jump between structurally significant bytes
/// ({, }, ") instead of scanning byte-by-byte. Tracks brace depth,
/// skipping braces inside JSON strings. Returns objects in reverse order
/// (newest first).
pub struct JsonlBackwardIter<'a> {
data: &'a [u8],
pos: usize,
}
impl<'a> JsonlBackwardIter<'a> {
pub fn new(data: &'a [u8]) -> Self {
Self { data, pos: data.len() }
}
}
impl<'a> Iterator for JsonlBackwardIter<'a> {
type Item = &'a [u8];
fn next(&mut self) -> Option<Self::Item> {
// Find the closing } of the next object, skipping } inside strings
let close = {
let mut in_string = false;
loop {
let p = memrchr3(b'{', b'}', b'"', &self.data[..self.pos])?;
self.pos = p;
let ch = self.data[p];
if in_string {
if ch == b'"' {
let mut bs = 0;
while p > bs + 1 && self.data[p - 1 - bs] == b'\\' {
bs += 1;
}
if bs % 2 == 0 { in_string = false; }
}
continue;
}
match ch {
b'}' => break p,
b'"' => in_string = true,
_ => {}
}
}
};
// Track brace depth to find matching {
let mut depth: usize = 1;
let mut in_string = false;
loop {
let p = memrchr3(b'{', b'}', b'"', &self.data[..self.pos])?;
self.pos = p;
let ch = self.data[p];
if in_string {
if ch == b'"' {
// Check for escaped quote (count preceding backslashes)
let mut bs = 0;
while p > bs + 1 && self.data[p - 1 - bs] == b'\\' {
bs += 1;
}
if bs % 2 == 0 {
in_string = false;
}
}
// { and } inside strings don't affect depth
continue;
}
match ch {
b'"' => { in_string = true; }
b'}' => { depth += 1; }
b'{' => {
depth -= 1;
if depth == 0 {
return Some(&self.data[self.pos..=close]);
}
}
_ => {}
}
}
}
}
/// Find the byte offset of the last compaction summary in mmap'd transcript data.
///
/// Scans backward for a user-type message whose content starts with
/// "This session is being continued". Returns the byte offset of the
/// JSON object's opening brace.
pub fn find_last_compaction(data: &[u8]) -> Option<usize> {
let marker = b"This session is being continued";
for obj_bytes in JsonlBackwardIter::new(data) {
// Quick byte check before parsing
if !contains_bytes(obj_bytes, marker) {
continue;
}
let obj: Value = match serde_json::from_slice(obj_bytes) {
Ok(v) => v,
Err(_) => continue,
};
if obj.get("type").and_then(|v| v.as_str()) != Some("user") {
continue;
}
if let Some(content) = obj.get("message")
.and_then(|m| m.get("content"))
.and_then(|c| c.as_str())
&& content.starts_with("This session is being continued") {
let offset = obj_bytes.as_ptr() as usize - data.as_ptr() as usize;
return Some(offset);
}
}
None
}
/// Find the byte offset of the last compaction in a transcript file.
/// Returns None if the file can't be opened or has no compaction.
pub fn find_last_compaction_in_file(path: &str) -> Option<u64> {
if path.is_empty() { return None; }
let file = fs::File::open(path).ok()?;
let meta = file.metadata().ok()?;
if meta.len() == 0 { return None; }
let mmap = unsafe { Mmap::map(&file).ok()? };
find_last_compaction(&mmap).map(|off| off as u64)
}
/// Mmap a transcript file. Returns (Mmap, File) to keep both alive.
pub fn mmap_transcript(path: &str) -> Option<(Mmap, fs::File)> {
let file = fs::File::open(path).ok()?;
let meta = file.metadata().ok()?;
if meta.len() == 0 { return None; }
let mmap = unsafe { Mmap::map(&file).ok()? };
Some((mmap, file))
}
fn contains_bytes(haystack: &[u8], needle: &[u8]) -> bool {
haystack.windows(needle.len()).any(|w| w == needle)
}
/// Reverse iterator over user/assistant messages in a transcript file.
/// Yields (role, text, timestamp) tuples newest-first. The caller decides
/// when to stop (byte budget, count, etc).
pub struct TailMessages {
_file: fs::File,
mmap: Mmap,
pos: usize,
}
impl TailMessages {
pub fn open(path: &str) -> Option<Self> {
let (mmap, file) = mmap_transcript(path)?;
let pos = mmap.len();
Some(Self { _file: file, mmap, pos })
}
}
impl Iterator for TailMessages {
type Item = (String, String, String);
fn next(&mut self) -> Option<Self::Item> {
loop {
// Find closing }, skipping } inside strings
let close = {
let mut in_string = false;
loop {
let p = memrchr3(b'{', b'}', b'"', &self.mmap[..self.pos])?;
self.pos = p;
let ch = self.mmap[p];
if in_string {
if ch == b'"' {
let mut bs = 0;
while p > bs + 1 && self.mmap[p - 1 - bs] == b'\\' {
bs += 1;
}
if bs % 2 == 0 { in_string = false; }
}
continue;
}
match ch {
b'}' => break p,
b'"' => in_string = true,
_ => {}
}
}
};
// Track brace depth to find matching {
let mut depth: usize = 1;
let mut in_string = false;
let open = loop {
let p = memrchr3(b'{', b'}', b'"', &self.mmap[..self.pos])?;
self.pos = p;
let ch = self.mmap[p];
if in_string {
if ch == b'"' {
let mut bs = 0;
while p > bs + 1 && self.mmap[p - 1 - bs] == b'\\' {
bs += 1;
}
if bs % 2 == 0 { in_string = false; }
}
continue;
}
match ch {
b'"' => { in_string = true; }
b'}' => { depth += 1; }
b'{' => {
depth -= 1;
if depth == 0 { break p; }
}
_ => {}
}
};
let obj_bytes = &self.mmap[open..=close];
// The "type" field is near the start of top-level objects.
// Only check the first 200 bytes to avoid scanning megabyte objects.
let prefix = &obj_bytes[..obj_bytes.len().min(200)];
let is_user = memchr::memmem::find(prefix, b"\"type\":\"user\"").is_some();
let is_assistant = !is_user
&& memchr::memmem::find(prefix, b"\"type\":\"assistant\"").is_some();
if !is_user && !is_assistant { continue; }
let obj: Value = match serde_json::from_slice(obj_bytes) {
Ok(v) => v,
Err(_) => continue,
};
let msg_type = if is_user { "user" } else { "assistant" };
let msg = obj.get("message").unwrap_or(&obj);
let text = match msg.get("content") {
Some(Value::String(s)) => s.clone(),
Some(Value::Array(arr)) => {
arr.iter()
.filter(|b| b.get("type").and_then(|v| v.as_str()) == Some("text"))
.filter_map(|b| b.get("text").and_then(|v| v.as_str()))
.collect::<Vec<_>>()
.join(" ")
}
_ => continue,
};
if text.is_empty() { continue; }
let timestamp = obj.get("timestamp")
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
return Some((msg_type.to_string(), text, timestamp));
}
}
}
/// Get the timestamp of the compaction message at a given byte offset.
/// Returns a human-readable datetime string, or None if unavailable.
pub fn compaction_timestamp(path: &str, offset: u64) -> Option<String> {
let (mmap, _file) = mmap_transcript(path)?;
let start = offset as usize;
if start >= mmap.len() { return None; }
// Find the end of this JSONL line
let end = mmap[start..].iter().position(|&b| b == b'\n')
.map(|p| start + p)
.unwrap_or(mmap.len());
let obj: Value = serde_json::from_slice(&mmap[start..end]).ok()?;
// Claude Code transcript entries have a "timestamp" field (ISO 8601)
if let Some(ts) = obj.get("timestamp").and_then(|v| v.as_str()) {
return Some(ts.to_string());
}
// Fallback: try "createdAt" or similar fields
for field in &["createdAt", "created_at", "time"] {
if let Some(ts) = obj.get(*field).and_then(|v| v.as_str()) {
return Some(ts.to_string());
}
}
None
}
/// Detect whether a compaction has occurred since the last check.
///
/// Compares the current compaction offset against a saved value in
/// `state_dir/compaction-{session_id}`. Returns true if a new
/// compaction was found. Updates the saved offset.
pub fn detect_new_compaction(
state_dir: &Path,
session_id: &str,
transcript_path: &str,
) -> bool {
let offset = find_last_compaction_in_file(transcript_path);
let save_path = state_dir.join(format!("compaction-{}", session_id));
let saved: Option<u64> = fs::read_to_string(&save_path)
.ok()
.and_then(|s| s.trim().parse().ok());
let is_new = match (offset, saved) {
(Some(cur), Some(prev)) => cur != prev,
(Some(_), None) => true,
_ => false,
};
// Save current offset
if let Some(off) = offset {
fs::write(&save_path, off.to_string()).ok();
}
is_new
}