forked from kent/consciousness
Split conversation transcript parsing
This commit is contained in:
parent
f6a6e3066c
commit
78b4bbd5bb
9 changed files with 614 additions and 348 deletions
110
src/conversation/jsonl.rs
Normal file
110
src/conversation/jsonl.rs
Normal file
|
|
@ -0,0 +1,110 @@
|
|||
use memchr::memrchr3;
|
||||
|
||||
/// Reverse iterator over the top-level JSON objects found in a byte buffer
/// (for example an mmap'd JSONL transcript).
///
/// Each item is the byte slice from an outermost `{` through its matching
/// `}`, paired with the offset of that `{` in the buffer. Objects come out
/// newest-first, i.e. in the opposite order to how they appear in the data.
///
/// Rather than inspecting every byte, the scan hops between the only bytes
/// that matter structurally (`{`, `}` and `"`) using the SIMD-accelerated
/// `memrchr3`. Brace depth is tracked along the way, and braces that occur
/// inside JSON strings are ignored.
pub struct JsonlBackwardIter<'a> {
    /// The full buffer being scanned.
    data: &'a [u8],
    /// Exclusive upper bound of the region that has not been scanned yet;
    /// everything at or after this offset has already been consumed.
    pos: usize,
}
|
||||
|
||||
impl<'a> JsonlBackwardIter<'a> {
|
||||
pub fn new(data: &'a [u8]) -> Self {
|
||||
Self { data, pos: data.len() }
|
||||
}
|
||||
}
|
||||
|
||||
impl<'a> Iterator for JsonlBackwardIter<'a> {
|
||||
type Item = (usize, &'a [u8]);
|
||||
|
||||
fn next(&mut self) -> Option<Self::Item> {
|
||||
next_json_object(self.data, &mut self.pos)
|
||||
}
|
||||
}
|
||||
|
||||
/// Reports whether the byte at index `p` is *not* escaped by a preceding
/// backslash.
///
/// Counts the unbroken run of `\` bytes immediately before `p`: an even
/// count (including zero) means the backslashes pair up with each other and
/// the byte at `p` stands on its own; an odd count means the last backslash
/// escapes it.
///
/// Only the bytes before `p` are inspected; the caller is responsible for
/// `data[p]` actually being a quote. Panics if `p > data.len()`.
fn is_unescaped_quote(data: &[u8], p: usize) -> bool {
    let preceding_backslashes = data[..p]
        .iter()
        .rev()
        .take_while(|&&byte| byte == b'\\')
        .count();
    preceding_backslashes % 2 == 0
}
|
||||
|
||||
fn next_json_object<'a>(data: &'a [u8], pos: &mut usize) -> Option<(usize, &'a [u8])> {
|
||||
// Find the closing } of the next object, skipping } inside strings.
|
||||
let close = {
|
||||
let mut in_string = false;
|
||||
loop {
|
||||
let p = memrchr3(b'{', b'}', b'"', &data[..*pos])?;
|
||||
*pos = p;
|
||||
let ch = data[p];
|
||||
|
||||
if in_string {
|
||||
if ch == b'"' && is_unescaped_quote(data, p) {
|
||||
in_string = false;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
match ch {
|
||||
b'}' => break p,
|
||||
b'"' => in_string = true,
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// Track brace depth to find matching {.
|
||||
let mut depth: usize = 1;
|
||||
let mut in_string = false;
|
||||
|
||||
loop {
|
||||
let p = memrchr3(b'{', b'}', b'"', &data[..*pos])?;
|
||||
*pos = p;
|
||||
let ch = data[p];
|
||||
|
||||
if in_string {
|
||||
if ch == b'"' && is_unescaped_quote(data, p) {
|
||||
in_string = false;
|
||||
}
|
||||
continue;
|
||||
}
|
||||
|
||||
match ch {
|
||||
b'"' => { in_string = true; }
|
||||
b'}' => { depth += 1; }
|
||||
b'{' => {
|
||||
depth -= 1;
|
||||
if depth == 0 {
|
||||
return Some((*pos, &data[*pos..=close]));
|
||||
}
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    /// Braces inside string values (including one that follows an escaped
    /// quote) must not affect object boundaries, nested objects must be kept
    /// whole, and trailing non-JSON text must be ignored.
    #[test]
    fn handles_nested_json_and_quoted_braces() {
        let data = br#"{"n":1,"s":"literal } brace"}
{"n":2,"nested":{"s":"escaped quote: \" and { brace"}}
trailing garbage
"#;

        let mut found = Vec::new();
        for (_, bytes) in JsonlBackwardIter::new(data) {
            let text = std::str::from_utf8(bytes).unwrap();
            found.push(text.to_string());
        }

        assert_eq!(found.len(), 2);
        // Iteration is newest-first: the second line comes out before the first.
        assert!(found[0].contains(r#""n":2"#));
        assert!(found[1].contains(r#""n":1"#));
    }
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue