JsonlBackwardIter: use memrchr3 for SIMD-accelerated scanning
Replaces byte-by-byte backward iteration with memrchr3('{', '}', '"'),
which uses SIMD to jump between structurally significant bytes. Major
speedup on large transcripts (1.4GB+).
Also simplifies tail_messages to use a byte budget (200KB) instead
of token counting.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d7d631d77d
commit
6c41b50e04
3 changed files with 23 additions and 23 deletions
|
|
@ -16,6 +16,7 @@ clap = { version = "4", features = ["derive"] }
|
||||||
libc = "0.2"
|
libc = "0.2"
|
||||||
faer = "0.24.0"
|
faer = "0.24.0"
|
||||||
rkyv = { version = "0.7", features = ["validation", "std"] }
|
rkyv = { version = "0.7", features = ["validation", "std"] }
|
||||||
|
memchr = "2"
|
||||||
memmap2 = "0.9"
|
memmap2 = "0.9"
|
||||||
rayon = "1"
|
rayon = "1"
|
||||||
peg = "0.8"
|
peg = "0.8"
|
||||||
|
|
|
||||||
|
|
@ -456,7 +456,7 @@ fn resolve_conversation() -> String {
|
||||||
let Some(path) = transcript else { return String::new() };
|
let Some(path) = transcript else { return String::new() };
|
||||||
let path_str = path.to_string_lossy();
|
let path_str = path.to_string_lossy();
|
||||||
|
|
||||||
let messages = crate::transcript::tail_messages(&path_str, 25_000);
|
let messages = crate::transcript::tail_messages(&path_str, 200_000);
|
||||||
if messages.is_empty() { return String::new(); }
|
if messages.is_empty() { return String::new(); }
|
||||||
|
|
||||||
let cfg = crate::config::get();
|
let cfg = crate::config::get();
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,7 @@
|
||||||
// and compaction detection. Used by memory-search (hook mode) and
|
// and compaction detection. Used by memory-search (hook mode) and
|
||||||
// parse-claude-conversation (debug tool).
|
// parse-claude-conversation (debug tool).
|
||||||
|
|
||||||
|
use memchr::memrchr3;
|
||||||
use memmap2::Mmap;
|
use memmap2::Mmap;
|
||||||
use serde_json::Value;
|
use serde_json::Value;
|
||||||
use std::fs;
|
use std::fs;
|
||||||
|
|
@ -12,8 +13,10 @@ use std::path::Path;
|
||||||
/// Scan backwards through mmap'd bytes, yielding byte slices of complete
|
/// Scan backwards through mmap'd bytes, yielding byte slices of complete
|
||||||
/// top-level JSON objects (outermost { to matching }).
|
/// top-level JSON objects (outermost { to matching }).
|
||||||
///
|
///
|
||||||
/// Tracks brace depth, skipping braces inside JSON strings. Returns
|
/// Uses memrchr3 (SIMD) to jump between structurally significant bytes
|
||||||
/// objects in reverse order (newest first).
|
/// ({, }, ") instead of scanning byte-by-byte. Tracks brace depth,
|
||||||
|
/// skipping braces inside JSON strings. Returns objects in reverse order
|
||||||
|
/// (newest first).
|
||||||
pub struct JsonlBackwardIter<'a> {
|
pub struct JsonlBackwardIter<'a> {
|
||||||
data: &'a [u8],
|
data: &'a [u8],
|
||||||
pos: usize,
|
pos: usize,
|
||||||
|
|
@ -29,17 +32,14 @@ impl<'a> Iterator for JsonlBackwardIter<'a> {
|
||||||
type Item = &'a [u8];
|
type Item = &'a [u8];
|
||||||
|
|
||||||
fn next(&mut self) -> Option<Self::Item> {
|
fn next(&mut self) -> Option<Self::Item> {
|
||||||
if self.pos == 0 {
|
// Find the closing } of the next object
|
||||||
return None;
|
|
||||||
}
|
|
||||||
|
|
||||||
// Find the closing } of the next object (scanning backward)
|
|
||||||
let close = loop {
|
let close = loop {
|
||||||
if self.pos == 0 { return None; }
|
let p = memrchr3(b'{', b'}', b'"', &self.data[..self.pos])?;
|
||||||
self.pos -= 1;
|
self.pos = p;
|
||||||
if self.data[self.pos] == b'}' {
|
if self.data[p] == b'}' {
|
||||||
break self.pos;
|
break p;
|
||||||
}
|
}
|
||||||
|
// Skip past any { or " that aren't our closing brace
|
||||||
};
|
};
|
||||||
|
|
||||||
// Track brace depth to find matching {
|
// Track brace depth to find matching {
|
||||||
|
|
@ -47,22 +47,22 @@ impl<'a> Iterator for JsonlBackwardIter<'a> {
|
||||||
let mut in_string = false;
|
let mut in_string = false;
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
if self.pos == 0 {
|
let p = memrchr3(b'{', b'}', b'"', &self.data[..self.pos])?;
|
||||||
return None;
|
self.pos = p;
|
||||||
}
|
let ch = self.data[p];
|
||||||
self.pos -= 1;
|
|
||||||
let ch = self.data[self.pos];
|
|
||||||
|
|
||||||
if in_string {
|
if in_string {
|
||||||
if ch == b'"' {
|
if ch == b'"' {
|
||||||
|
// Check for escaped quote (count preceding backslashes)
|
||||||
let mut bs = 0;
|
let mut bs = 0;
|
||||||
while self.pos > bs && self.data[self.pos - 1 - bs] == b'\\' {
|
while p > bs + 1 && self.data[p - 1 - bs] == b'\\' {
|
||||||
bs += 1;
|
bs += 1;
|
||||||
}
|
}
|
||||||
if bs % 2 == 0 {
|
if bs % 2 == 0 {
|
||||||
in_string = false;
|
in_string = false;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
// { and } inside strings don't affect depth
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -145,18 +145,17 @@ fn contains_bytes(haystack: &[u8], needle: &[u8]) -> bool {
|
||||||
/// Reverse-scan a transcript file, collecting user/assistant messages
|
/// Reverse-scan a transcript file, collecting user/assistant messages
|
||||||
/// until `max_tokens` tokens (~4 chars each) are accumulated. Stops at
|
/// until `max_bytes` bytes of message text are accumulated. Stops at
|
||||||
/// the last compaction boundary. Returns messages in chronological order.
|
/// the last compaction boundary. Returns messages in chronological order.
|
||||||
pub fn tail_messages(path: &str, max_tokens: usize) -> Vec<(String, String, String)> {
|
pub fn tail_messages(path: &str, max_bytes: usize) -> Vec<(String, String, String)> {
|
||||||
let (mmap, _file) = match mmap_transcript(path) {
|
let (mmap, _file) = match mmap_transcript(path) {
|
||||||
Some(v) => v,
|
Some(v) => v,
|
||||||
None => return Vec::new(),
|
None => return Vec::new(),
|
||||||
};
|
};
|
||||||
|
|
||||||
let compaction_marker = b"This session is being continued";
|
|
||||||
let mut messages: Vec<(String, String, String)> = Vec::new();
|
let mut messages: Vec<(String, String, String)> = Vec::new();
|
||||||
let mut token_count = 0;
|
let mut total_bytes = 0;
|
||||||
|
|
||||||
for obj_bytes in JsonlBackwardIter::new(&mmap) {
|
for obj_bytes in JsonlBackwardIter::new(&mmap) {
|
||||||
if token_count >= max_tokens { break; }
|
if total_bytes >= max_bytes { break; }
|
||||||
|
|
||||||
// Quick byte check: skip objects that aren't user/assistant
|
// Quick byte check: skip objects that aren't user/assistant
|
||||||
// (avoids parsing large tool_result / system objects)
|
// (avoids parsing large tool_result / system objects)
|
||||||
|
|
@ -197,7 +196,7 @@ pub fn tail_messages(path: &str, max_tokens: usize) -> Vec<(String, String, Stri
|
||||||
.unwrap_or("")
|
.unwrap_or("")
|
||||||
.to_string();
|
.to_string();
|
||||||
|
|
||||||
token_count += text.len() / 4;
|
total_bytes += text.len();
|
||||||
messages.push((msg_type.to_string(), text, timestamp));
|
messages.push((msg_type.to_string(), text, timestamp));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue