forked from kent/consciousness
110 lines
2.8 KiB
Rust
110 lines
2.8 KiB
Rust
|
|
use memchr::memrchr3;
|
||
|
|
|
||
|
|
/// Reverse iterator over the complete top-level JSON objects in a byte buffer.
///
/// Scans backwards through mmap'd bytes, yielding byte slices of complete
/// top-level JSON objects (outermost `{` to matching `}`). Objects are
/// returned in reverse order (newest first).
///
/// Uses `memrchr3` (SIMD) to jump between structurally significant bytes
/// (`{`, `}`, `"`) instead of scanning byte-by-byte. Tracks brace depth,
/// skipping braces inside JSON strings.
///
/// Each yielded item is `(offset, bytes)`: `offset` is the index in the
/// buffer of the object's opening `{`, and `bytes` is the slice from that `{`
/// through its matching `}` inclusive.
///
/// The type is `Clone` (cloning copies only the slice reference and the
/// cursor, so a scan can be forked cheaply) and `Debug`.
#[derive(Clone, Debug)]
pub struct JsonlBackwardIter<'a> {
    /// The whole buffer being scanned; yielded slices borrow from it.
    data: &'a [u8],
    /// Exclusive upper bound of the region still to be scanned. Starts at
    /// `data.len()` and only ever moves towards 0.
    pos: usize,
}
|
||
|
|
|
||
|
|
impl<'a> JsonlBackwardIter<'a> {
|
||
|
|
pub fn new(data: &'a [u8]) -> Self {
|
||
|
|
Self { data, pos: data.len() }
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
impl<'a> Iterator for JsonlBackwardIter<'a> {
|
||
|
|
type Item = (usize, &'a [u8]);
|
||
|
|
|
||
|
|
fn next(&mut self) -> Option<Self::Item> {
|
||
|
|
next_json_object(self.data, &mut self.pos)
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
/// Reports whether the byte at index `p` is free of backslash escaping.
///
/// Counts the run of consecutive `\` bytes immediately before `p`. An even
/// count (including zero) means the backslashes pair up with each other and
/// leave the byte at `p` unescaped; an odd count means the last backslash
/// escapes it.
///
/// Only the bytes before `p` are examined; the byte at `p` itself is not read.
fn is_unescaped_quote(data: &[u8], p: usize) -> bool {
    let preceding_backslashes = data[..p]
        .iter()
        .rev()
        .take_while(|&&byte| byte == b'\\')
        .count();
    preceding_backslashes % 2 == 0
}
|
||
|
|
|
||
|
|
fn next_json_object<'a>(data: &'a [u8], pos: &mut usize) -> Option<(usize, &'a [u8])> {
|
||
|
|
// Find the closing } of the next object, skipping } inside strings.
|
||
|
|
let close = {
|
||
|
|
let mut in_string = false;
|
||
|
|
loop {
|
||
|
|
let p = memrchr3(b'{', b'}', b'"', &data[..*pos])?;
|
||
|
|
*pos = p;
|
||
|
|
let ch = data[p];
|
||
|
|
|
||
|
|
if in_string {
|
||
|
|
if ch == b'"' && is_unescaped_quote(data, p) {
|
||
|
|
in_string = false;
|
||
|
|
}
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
match ch {
|
||
|
|
b'}' => break p,
|
||
|
|
b'"' => in_string = true,
|
||
|
|
_ => {}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
};
|
||
|
|
|
||
|
|
// Track brace depth to find matching {.
|
||
|
|
let mut depth: usize = 1;
|
||
|
|
let mut in_string = false;
|
||
|
|
|
||
|
|
loop {
|
||
|
|
let p = memrchr3(b'{', b'}', b'"', &data[..*pos])?;
|
||
|
|
*pos = p;
|
||
|
|
let ch = data[p];
|
||
|
|
|
||
|
|
if in_string {
|
||
|
|
if ch == b'"' && is_unescaped_quote(data, p) {
|
||
|
|
in_string = false;
|
||
|
|
}
|
||
|
|
continue;
|
||
|
|
}
|
||
|
|
|
||
|
|
match ch {
|
||
|
|
b'"' => { in_string = true; }
|
||
|
|
b'}' => { depth += 1; }
|
||
|
|
b'{' => {
|
||
|
|
depth -= 1;
|
||
|
|
if depth == 0 {
|
||
|
|
return Some((*pos, &data[*pos..=close]));
|
||
|
|
}
|
||
|
|
}
|
||
|
|
_ => {}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
}
|
||
|
|
|
||
|
|
#[cfg(test)]
mod tests {
    use super::*;

    /// Braces and escaped quotes inside string values must not confuse the
    /// matcher, nested objects must come back whole, and trailing non-JSON
    /// text after the last object must be skipped.
    #[test]
    fn handles_nested_json_and_quoted_braces() {
        let data = br#"{"n":1,"s":"literal } brace"}
{"n":2,"nested":{"s":"escaped quote: \" and { brace"}}
trailing garbage
"#;
        let first: &[u8] = br#"{"n":1,"s":"literal } brace"}"#;
        let second: &[u8] = br#"{"n":2,"nested":{"s":"escaped quote: \" and { brace"}}"#;

        let objs: Vec<(usize, &[u8])> = JsonlBackwardIter::new(data).collect();

        // Newest (last in the buffer) first, with exact object boundaries.
        let bodies: Vec<&[u8]> = objs.iter().map(|&(_, bytes)| bytes).collect();
        assert_eq!(bodies, vec![second, first]);

        // Every offset is the index at which its slice starts in the buffer.
        assert_eq!(objs[1].0, 0);
        for &(offset, bytes) in &objs {
            assert_eq!(&data[offset..offset + bytes.len()], bytes);
        }
    }

    /// Inputs that contain no complete object produce an empty iteration.
    #[test]
    fn yields_nothing_without_a_complete_object() {
        assert_eq!(JsonlBackwardIter::new(b"").next(), None);
        assert_eq!(JsonlBackwardIter::new(b"no json here\n").count(), 0);
        // Braces that occur only inside a string are not structural.
        assert_eq!(JsonlBackwardIter::new(b"\"}{\"").count(), 0);
        // A closing brace with no opener before it is not an object.
        assert_eq!(JsonlBackwardIter::new(b"}\n").count(), 0);
    }

    /// An object whose opening brace is missing at the start of the buffer is
    /// dropped, and the iterator keeps returning `None` afterwards.
    #[test]
    fn drops_object_truncated_at_start_of_buffer() {
        let data = b"\"b\"}\n{\"a\":1}\n";
        let mut iter = JsonlBackwardIter::new(data);

        assert_eq!(iter.next(), Some((5, &b"{\"a\":1}"[..])));
        assert_eq!(iter.next(), None);
        assert_eq!(iter.next(), None);
    }
}
|