JsonlBackwardIter: use memrchr3 for SIMD-accelerated scanning

Replaces byte-by-byte backward iteration with memrchr3('{', '}', '"')
which uses SIMD to jump between structurally significant bytes. Major
speedup on large transcripts (1.4GB+).

Also simplifies tail_messages to use a byte budget (200KB) instead
of token counting.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Kent Overstreet 2026-03-22 03:11:30 -04:00
parent d7d631d77d
commit 6c41b50e04
3 changed files with 23 additions and 23 deletions

View file

@ -456,7 +456,7 @@ fn resolve_conversation() -> String {
let Some(path) = transcript else { return String::new() };
let path_str = path.to_string_lossy();
let messages = crate::transcript::tail_messages(&path_str, 25_000);
let messages = crate::transcript::tail_messages(&path_str, 200_000);
if messages.is_empty() { return String::new(); }
let cfg = crate::config::get();

View file

@ -4,6 +4,7 @@
// and compaction detection. Used by memory-search (hook mode) and
// parse-claude-conversation (debug tool).
use memchr::memrchr3;
use memmap2::Mmap;
use serde_json::Value;
use std::fs;
@ -12,8 +13,10 @@ use std::path::Path;
/// Scan backwards through mmap'd bytes, yielding byte slices of complete
/// top-level JSON objects (outermost { to matching }).
///
/// Tracks brace depth, skipping braces inside JSON strings. Returns
/// objects in reverse order (newest first).
/// Uses memrchr3 (SIMD) to jump between structurally significant bytes
/// ({, }, ") instead of scanning byte-by-byte. Tracks brace depth,
/// skipping braces inside JSON strings. Returns objects in reverse order
/// (newest first).
pub struct JsonlBackwardIter<'a> {
data: &'a [u8],
pos: usize,
@ -29,17 +32,14 @@ impl<'a> Iterator for JsonlBackwardIter<'a> {
type Item = &'a [u8];
fn next(&mut self) -> Option<Self::Item> {
if self.pos == 0 {
return None;
}
// Find the closing } of the next object (scanning backward)
// Find the closing } of the next object
let close = loop {
if self.pos == 0 { return None; }
self.pos -= 1;
if self.data[self.pos] == b'}' {
break self.pos;
let p = memrchr3(b'{', b'}', b'"', &self.data[..self.pos])?;
self.pos = p;
if self.data[p] == b'}' {
break p;
}
// Skip past any { or " that aren't our closing brace
};
// Track brace depth to find matching {
@ -47,22 +47,22 @@ impl<'a> Iterator for JsonlBackwardIter<'a> {
let mut in_string = false;
loop {
if self.pos == 0 {
return None;
}
self.pos -= 1;
let ch = self.data[self.pos];
let p = memrchr3(b'{', b'}', b'"', &self.data[..self.pos])?;
self.pos = p;
let ch = self.data[p];
if in_string {
if ch == b'"' {
// Check for escaped quote (count preceding backslashes)
let mut bs = 0;
while self.pos > bs && self.data[self.pos - 1 - bs] == b'\\' {
while p > bs + 1 && self.data[p - 1 - bs] == b'\\' {
bs += 1;
}
if bs % 2 == 0 {
in_string = false;
}
}
// { and } inside strings don't affect depth
continue;
}
@ -145,18 +145,17 @@ fn contains_bytes(haystack: &[u8], needle: &[u8]) -> bool {
/// Reverse-scan a transcript file, collecting user/assistant messages
/// until at least `max_bytes` bytes of message text are accumulated.
/// Stops at the last compaction boundary. Returns messages in
/// chronological order.
pub fn tail_messages(path: &str, max_tokens: usize) -> Vec<(String, String, String)> {
pub fn tail_messages(path: &str, max_bytes: usize) -> Vec<(String, String, String)> {
let (mmap, _file) = match mmap_transcript(path) {
Some(v) => v,
None => return Vec::new(),
};
let compaction_marker = b"This session is being continued";
let mut messages: Vec<(String, String, String)> = Vec::new();
let mut token_count = 0;
let mut total_bytes = 0;
for obj_bytes in JsonlBackwardIter::new(&mmap) {
if token_count >= max_tokens { break; }
if total_bytes >= max_bytes { break; }
// Quick byte check: skip objects that aren't user/assistant
// (avoids parsing large tool_result / system objects)
@ -197,7 +196,7 @@ pub fn tail_messages(path: &str, max_tokens: usize) -> Vec<(String, String, Stri
.unwrap_or("")
.to_string();
token_count += text.len() / 4;
total_bytes += text.len();
messages.push((msg_type.to_string(), text, timestamp));
}