JsonlBackwardIter: use memrchr3 for SIMD-accelerated scanning
Replaces byte-by-byte backward iteration with memrchr3('{', '}', '"'),
which uses SIMD to jump between structurally significant bytes. This is a
major speedup on large transcripts (1.4GB+).
Also simplifies tail_messages to use a byte budget (200KB) instead
of token counting.
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d7d631d77d
commit
6c41b50e04
3 changed files with 23 additions and 23 deletions
|
|
@ -456,7 +456,7 @@ fn resolve_conversation() -> String {
|
|||
let Some(path) = transcript else { return String::new() };
|
||||
let path_str = path.to_string_lossy();
|
||||
|
||||
let messages = crate::transcript::tail_messages(&path_str, 25_000);
|
||||
let messages = crate::transcript::tail_messages(&path_str, 200_000);
|
||||
if messages.is_empty() { return String::new(); }
|
||||
|
||||
let cfg = crate::config::get();
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@
|
|||
// and compaction detection. Used by memory-search (hook mode) and
|
||||
// parse-claude-conversation (debug tool).
|
||||
|
||||
use memchr::memrchr3;
|
||||
use memmap2::Mmap;
|
||||
use serde_json::Value;
|
||||
use std::fs;
|
||||
|
|
@ -12,8 +13,10 @@ use std::path::Path;
|
|||
/// Scan backwards through mmap'd bytes, yielding byte slices of complete
|
||||
/// top-level JSON objects (outermost { to matching }).
|
||||
///
|
||||
/// Tracks brace depth, skipping braces inside JSON strings. Returns
|
||||
/// objects in reverse order (newest first).
|
||||
/// Uses memrchr3 (SIMD) to jump between structurally significant bytes
|
||||
/// ({, }, ") instead of scanning byte-by-byte. Tracks brace depth,
|
||||
/// skipping braces inside JSON strings. Returns objects in reverse order
|
||||
/// (newest first).
|
||||
pub struct JsonlBackwardIter<'a> {
|
||||
data: &'a [u8],
|
||||
pos: usize,
|
||||
|
|
@ -29,17 +32,14 @@ impl<'a> Iterator for JsonlBackwardIter<'a> {
|
|||
type Item = &'a [u8];
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
if self.pos == 0 {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Find the closing } of the next object (scanning backward)
|
||||
// Find the closing } of the next object
|
||||
let close = loop {
|
||||
if self.pos == 0 { return None; }
|
||||
self.pos -= 1;
|
||||
if self.data[self.pos] == b'}' {
|
||||
break self.pos;
|
||||
let p = memrchr3(b'{', b'}', b'"', &self.data[..self.pos])?;
|
||||
self.pos = p;
|
||||
if self.data[p] == b'}' {
|
||||
break p;
|
||||
}
|
||||
// Skip past any { or " that aren't our closing brace
|
||||
};
|
||||
|
||||
// Track brace depth to find matching {
|
||||
|
|
@ -47,22 +47,22 @@ impl<'a> Iterator for JsonlBackwardIter<'a> {
|
|||
let mut in_string = false;
|
||||
|
||||
loop {
|
||||
if self.pos == 0 {
|
||||
return None;
|
||||
}
|
||||
self.pos -= 1;
|
||||
let ch = self.data[self.pos];
|
||||
let p = memrchr3(b'{', b'}', b'"', &self.data[..self.pos])?;
|
||||
self.pos = p;
|
||||
let ch = self.data[p];
|
||||
|
||||
if in_string {
|
||||
if ch == b'"' {
|
||||
// Check for escaped quote (count preceding backslashes)
|
||||
let mut bs = 0;
|
||||
while self.pos > bs && self.data[self.pos - 1 - bs] == b'\\' {
|
||||
while p > bs + 1 && self.data[p - 1 - bs] == b'\\' {
|
||||
bs += 1;
|
||||
}
|
||||
if bs % 2 == 0 {
|
||||
in_string = false;
|
||||
}
|
||||
}
|
||||
// { and } inside strings don't affect depth
|
||||
continue;
|
||||
}
|
||||
|
||||
|
|
@ -145,18 +145,17 @@ fn contains_bytes(haystack: &[u8], needle: &[u8]) -> bool {
|
|||
/// Reverse-scan a transcript file, collecting user/assistant messages
|
||||
/// until `max_bytes` bytes of message text are accumulated. Stops at
|
||||
/// the last compaction boundary. Returns messages in chronological order.
|
||||
pub fn tail_messages(path: &str, max_tokens: usize) -> Vec<(String, String, String)> {
|
||||
pub fn tail_messages(path: &str, max_bytes: usize) -> Vec<(String, String, String)> {
|
||||
let (mmap, _file) = match mmap_transcript(path) {
|
||||
Some(v) => v,
|
||||
None => return Vec::new(),
|
||||
};
|
||||
|
||||
let compaction_marker = b"This session is being continued";
|
||||
let mut messages: Vec<(String, String, String)> = Vec::new();
|
||||
let mut token_count = 0;
|
||||
let mut total_bytes = 0;
|
||||
|
||||
for obj_bytes in JsonlBackwardIter::new(&mmap) {
|
||||
if token_count >= max_tokens { break; }
|
||||
if total_bytes >= max_bytes { break; }
|
||||
|
||||
// Quick byte check: skip objects that aren't user/assistant
|
||||
// (avoids parsing large tool_result / system objects)
|
||||
|
|
@ -197,7 +196,7 @@ pub fn tail_messages(path: &str, max_tokens: usize) -> Vec<(String, String, Stri
|
|||
.unwrap_or("")
|
||||
.to_string();
|
||||
|
||||
token_count += text.len() / 4;
|
||||
total_bytes += text.len();
|
||||
messages.push((msg_type.to_string(), text, timestamp));
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue