JsonlBackwardIter: use memrchr3 for SIMD-accelerated scanning

Replaces byte-by-byte backward iteration with memrchr3('{', '}', '"')
which uses SIMD to jump between structurally significant bytes. Major
speedup on large transcripts (1.4GB+).

Also simplifies tail_messages to use a byte budget (200KB of extracted
message text) instead of an estimated token count (~4 chars per token).

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Kent Overstreet 2026-03-22 03:11:30 -04:00
parent d7d631d77d
commit 6c41b50e04
3 changed files with 23 additions and 23 deletions

View file

@ -16,6 +16,7 @@ clap = { version = "4", features = ["derive"] }
libc = "0.2" libc = "0.2"
faer = "0.24.0" faer = "0.24.0"
rkyv = { version = "0.7", features = ["validation", "std"] } rkyv = { version = "0.7", features = ["validation", "std"] }
memchr = "2"
memmap2 = "0.9" memmap2 = "0.9"
rayon = "1" rayon = "1"
peg = "0.8" peg = "0.8"

View file

@ -456,7 +456,7 @@ fn resolve_conversation() -> String {
let Some(path) = transcript else { return String::new() }; let Some(path) = transcript else { return String::new() };
let path_str = path.to_string_lossy(); let path_str = path.to_string_lossy();
let messages = crate::transcript::tail_messages(&path_str, 25_000); let messages = crate::transcript::tail_messages(&path_str, 200_000);
if messages.is_empty() { return String::new(); } if messages.is_empty() { return String::new(); }
let cfg = crate::config::get(); let cfg = crate::config::get();

View file

@ -4,6 +4,7 @@
// and compaction detection. Used by memory-search (hook mode) and // and compaction detection. Used by memory-search (hook mode) and
// parse-claude-conversation (debug tool). // parse-claude-conversation (debug tool).
use memchr::memrchr3;
use memmap2::Mmap; use memmap2::Mmap;
use serde_json::Value; use serde_json::Value;
use std::fs; use std::fs;
@ -12,8 +13,10 @@ use std::path::Path;
/// Scan backwards through mmap'd bytes, yielding byte slices of complete /// Scan backwards through mmap'd bytes, yielding byte slices of complete
/// top-level JSON objects (outermost { to matching }). /// top-level JSON objects (outermost { to matching }).
/// ///
/// Tracks brace depth, skipping braces inside JSON strings. Returns /// Uses memrchr3 (SIMD) to jump between structurally significant bytes
/// objects in reverse order (newest first). /// ({, }, ") instead of scanning byte-by-byte. Tracks brace depth,
/// skipping braces inside JSON strings. Returns objects in reverse order
/// (newest first).
pub struct JsonlBackwardIter<'a> { pub struct JsonlBackwardIter<'a> {
data: &'a [u8], data: &'a [u8],
pos: usize, pos: usize,
@ -29,17 +32,14 @@ impl<'a> Iterator for JsonlBackwardIter<'a> {
type Item = &'a [u8]; type Item = &'a [u8];
fn next(&mut self) -> Option<Self::Item> { fn next(&mut self) -> Option<Self::Item> {
if self.pos == 0 { // Find the closing } of the next object
return None;
}
// Find the closing } of the next object (scanning backward)
let close = loop { let close = loop {
if self.pos == 0 { return None; } let p = memrchr3(b'{', b'}', b'"', &self.data[..self.pos])?;
self.pos -= 1; self.pos = p;
if self.data[self.pos] == b'}' { if self.data[p] == b'}' {
break self.pos; break p;
} }
// Skip past any { or " that aren't our closing brace
}; };
// Track brace depth to find matching { // Track brace depth to find matching {
@ -47,22 +47,22 @@ impl<'a> Iterator for JsonlBackwardIter<'a> {
let mut in_string = false; let mut in_string = false;
loop { loop {
if self.pos == 0 { let p = memrchr3(b'{', b'}', b'"', &self.data[..self.pos])?;
return None; self.pos = p;
} let ch = self.data[p];
self.pos -= 1;
let ch = self.data[self.pos];
if in_string { if in_string {
if ch == b'"' { if ch == b'"' {
// Check for escaped quote (count preceding backslashes)
let mut bs = 0; let mut bs = 0;
while self.pos > bs && self.data[self.pos - 1 - bs] == b'\\' { while p > bs + 1 && self.data[p - 1 - bs] == b'\\' {
bs += 1; bs += 1;
} }
if bs % 2 == 0 { if bs % 2 == 0 {
in_string = false; in_string = false;
} }
} }
// { and } inside strings don't affect depth
continue; continue;
} }
@ -145,18 +145,17 @@ fn contains_bytes(haystack: &[u8], needle: &[u8]) -> bool {
/// Reverse-scan a transcript file, collecting user/assistant messages /// Reverse-scan a transcript file, collecting user/assistant messages
/// until `max_tokens` tokens (~4 chars each) are accumulated. Stops at /// until `max_tokens` tokens (~4 chars each) are accumulated. Stops at
/// the last compaction boundary. Returns messages in chronological order. /// the last compaction boundary. Returns messages in chronological order.
pub fn tail_messages(path: &str, max_tokens: usize) -> Vec<(String, String, String)> { pub fn tail_messages(path: &str, max_bytes: usize) -> Vec<(String, String, String)> {
let (mmap, _file) = match mmap_transcript(path) { let (mmap, _file) = match mmap_transcript(path) {
Some(v) => v, Some(v) => v,
None => return Vec::new(), None => return Vec::new(),
}; };
let compaction_marker = b"This session is being continued";
let mut messages: Vec<(String, String, String)> = Vec::new(); let mut messages: Vec<(String, String, String)> = Vec::new();
let mut token_count = 0; let mut total_bytes = 0;
for obj_bytes in JsonlBackwardIter::new(&mmap) { for obj_bytes in JsonlBackwardIter::new(&mmap) {
if token_count >= max_tokens { break; } if total_bytes >= max_bytes { break; }
// Quick byte check: skip objects that aren't user/assistant // Quick byte check: skip objects that aren't user/assistant
// (avoids parsing large tool_result / system objects) // (avoids parsing large tool_result / system objects)
@ -197,7 +196,7 @@ pub fn tail_messages(path: &str, max_tokens: usize) -> Vec<(String, String, Stri
.unwrap_or("") .unwrap_or("")
.to_string(); .to_string();
token_count += text.len() / 4; total_bytes += text.len();
messages.push((msg_type.to_string(), text, timestamp)); messages.push((msg_type.to_string(), text, timestamp));
} }