consciousness/src/conversation/jsonl.rs

110 lines
2.8 KiB
Rust
Raw Normal View History

2026-06-15 11:24:18 -05:00
use memchr::memrchr3;
/// Backward iterator over the complete top-level JSON objects in a byte
/// buffer (typically an mmap'd JSONL file).
///
/// Each item is `(offset, bytes)`: `bytes` spans one object from its outermost
/// `{` to the matching `}`, and `offset` is the index of that `{` within the
/// buffer. Objects are yielded in reverse order (newest first).
///
/// Uses memrchr3 (SIMD) to jump between structurally significant bytes
/// (`{`, `}`, `"`) instead of scanning byte-by-byte. Tracks brace depth,
/// skipping braces inside JSON strings.
#[derive(Debug, Clone)]
#[must_use = "iterators are lazy and do nothing unless consumed"]
pub struct JsonlBackwardIter<'a> {
    /// The whole buffer being scanned.
    data: &'a [u8],
    /// Exclusive upper bound of the region still to be scanned. Starts at
    /// `data.len()` and only ever moves toward zero.
    pos: usize,
}
impl<'a> JsonlBackwardIter<'a> {
pub fn new(data: &'a [u8]) -> Self {
Self { data, pos: data.len() }
}
}
impl<'a> Iterator for JsonlBackwardIter<'a> {
    /// `(offset, bytes)`: the index of the object's opening `{` in the
    /// buffer, and the slice running from that `{` through its matching `}`.
    type Item = (usize, &'a [u8]);

    /// Yields the next complete object toward the start of the buffer, or
    /// `None` when no further object can be found.
    fn next(&mut self) -> Option<Self::Item> {
        // Copy the `&'a [u8]` out so the returned slice borrows from the
        // buffer rather than from `self`.
        let buffer: &'a [u8] = self.data;
        let cursor = &mut self.pos;
        next_json_object(buffer, cursor)
    }
}
/// Returns `true` when the byte at index `p` is preceded by an even number
/// (including zero) of consecutive backslashes, i.e. it is not escaped.
///
/// Only the bytes before `p` are inspected; the byte at `p` itself is not
/// checked to be a quote.
///
/// # Panics
///
/// Panics if `p > data.len()`.
fn is_unescaped_quote(data: &[u8], p: usize) -> bool {
    let preceding_backslashes = data[..p]
        .iter()
        .rev()
        .take_while(|&&byte| byte == b'\\')
        .count();
    // Backslashes pair up into escaped backslashes; an odd one left over
    // escapes the byte at `p`.
    preceding_backslashes % 2 == 0
}
/// Moves `*pos` backwards to the nearest `{` or `}` that lies outside any
/// JSON string and returns that byte.
///
/// Returns `None` once no `{`, `}` or `"` remains before `*pos`; `*pos` is
/// then left at the last structural byte that was examined.
fn prev_brace_outside_strings(data: &[u8], pos: &mut usize) -> Option<u8> {
    let mut inside_string = false;
    loop {
        let at = memrchr3(b'{', b'}', b'"', &data[..*pos])?;
        *pos = at;
        match (data[at], inside_string) {
            // Walking backwards, a quote met outside a string is its closing quote.
            (b'"', false) => inside_string = true,
            // Inside a string only an unescaped quote (the opening one) ends it;
            // an escaped quote is string content.
            (b'"', true) => inside_string = !is_unescaped_quote(data, at),
            // Braces inside strings are not structural.
            (_, true) => {}
            (brace, false) => return Some(brace),
        }
    }
}

/// Finds the last complete top-level JSON object located before `*pos`.
///
/// Scans backwards for a `}` outside any string, then keeps going while
/// counting brace depth until the `{` that matches it is reached.
///
/// On success returns `(start, bytes)` where `start` is the index of the
/// opening `{` and `bytes` is `data[start..=close]`; `*pos` is set to `start`
/// so the next call continues with the preceding object. Returns `None` when
/// the scan runs out of structural bytes before a complete object is found.
fn next_json_object<'a>(data: &'a [u8], pos: &mut usize) -> Option<(usize, &'a [u8])> {
    // Skip whatever follows the object (including stray `{`) until its
    // closing brace is reached.
    while prev_brace_outside_strings(data, pos)? != b'}' {}
    let close = *pos;

    // Number of `}` seen so far that still await their matching `{`.
    let mut unmatched_closers: usize = 1;
    loop {
        if prev_brace_outside_strings(data, pos)? == b'}' {
            unmatched_closers += 1;
            continue;
        }
        // The helper only returns braces, so this one is a `{`.
        unmatched_closers -= 1;
        if unmatched_closers == 0 {
            let open = *pos;
            return Some((open, &data[open..=close]));
        }
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Two objects followed by a non-JSON line: the first has a `}` inside a
    /// string, the second has a nested object whose string holds an escaped
    /// quote and a `{`. Both must come back whole, newest first.
    #[test]
    fn handles_nested_json_and_quoted_braces() {
        let data = br#"{"n":1,"s":"literal } brace"}
{"n":2,"nested":{"s":"escaped quote: \" and { brace"}}
trailing garbage
"#;

        let mut objs = Vec::new();
        for (_offset, raw) in JsonlBackwardIter::new(data) {
            let text = std::str::from_utf8(raw).unwrap();
            objs.push(text.to_string());
        }

        assert_eq!(objs.len(), 2);
        let (newest, oldest) = (&objs[0], &objs[1]);
        assert!(newest.contains(r#""n":2"#));
        assert!(oldest.contains(r#""n":1"#));
    }
}