transcript: extract JSONL backward scanner and compaction detection into library
Move JsonlBackwardIter and find_last_compaction() from parse-claude-conversation into a shared transcript module.

Both memory-search and parse-claude-conversation now use the same robust compaction detection: an mmap-based backward scan, JSON parsing to verify a user-type message, and a content prefix check.

This replaces memory-search's old detect_compaction(), which did a forward scan with raw string matching on "continued from a previous conversation" — that could false-positive on the string appearing in assistant output or tool results.

Add parse-claude-conversation as a new binary for debugging what's in the context window post-compaction.

Co-Authored-By: ProofOfConcept <poc@bcachefs.org>
This commit is contained in:
parent
0e17ab00b0
commit
c2f245740c
4 changed files with 548 additions and 9 deletions
|
|
@ -31,6 +31,10 @@ struct Args {
|
|||
#[arg(long)]
|
||||
seen: bool,
|
||||
|
||||
/// Show full seen set (list all keys)
|
||||
#[arg(long)]
|
||||
seen_full: bool,
|
||||
|
||||
/// Max results to return
|
||||
#[arg(long, default_value = "5")]
|
||||
max_results: usize,
|
||||
|
|
@ -50,7 +54,7 @@ fn main() {
|
|||
|
||||
let args = Args::parse();
|
||||
|
||||
if args.seen {
|
||||
if args.seen || args.seen_full {
|
||||
show_seen();
|
||||
return;
|
||||
}
|
||||
|
|
@ -87,8 +91,11 @@ fn main() {
|
|||
let state_dir = PathBuf::from("/tmp/claude-memory-search");
|
||||
fs::create_dir_all(&state_dir).ok();
|
||||
|
||||
// Detect post-compaction reload
|
||||
let is_compaction = prompt.contains("continued from a previous conversation");
|
||||
// Detect post-compaction reload via mmap backward scan
|
||||
let transcript_path = json["transcript_path"].as_str().unwrap_or("");
|
||||
let is_compaction = poc_memory::transcript::detect_new_compaction(
|
||||
&state_dir, session_id, transcript_path,
|
||||
);
|
||||
|
||||
// First prompt or post-compaction: load full context
|
||||
let cookie_path = state_dir.join(format!("cookie-{}", session_id));
|
||||
|
|
@ -155,7 +162,6 @@ fn main() {
|
|||
};
|
||||
|
||||
// Search for node keys in last ~150k tokens of transcript
|
||||
let transcript_path = json["transcript_path"].as_str().unwrap_or("");
|
||||
if debug { println!("[memory-search] transcript: {}", transcript_path); }
|
||||
let terms = extract_weighted_terms(transcript_path, 150_000, &store);
|
||||
|
||||
|
|
@ -363,7 +369,7 @@ fn extract_key_from_line(line: &str) -> Option<String> {
|
|||
let rest = &line[after_bracket + 2..];
|
||||
let key_end = rest.find(" (c").unwrap_or(rest.len());
|
||||
let key = rest[..key_end].trim();
|
||||
if key.is_empty() || !key.contains('.') {
|
||||
if key.is_empty() {
|
||||
None
|
||||
} else {
|
||||
Some(key.to_string())
|
||||
|
|
@ -374,13 +380,19 @@ fn generate_cookie() -> String {
|
|||
uuid::Uuid::new_v4().as_simple().to_string()[..12].to_string()
|
||||
}
|
||||
|
||||
/// Parse a seen-file line ("TIMESTAMP\tKEY" or legacy "KEY") and return the key: the text after the first tab, or the whole line when it contains no tab.
|
||||
fn parse_seen_line(line: &str) -> &str {
|
||||
line.split_once('\t').map(|(_, key)| key).unwrap_or(line)
|
||||
}
|
||||
|
||||
fn load_seen(dir: &Path, session_id: &str) -> HashSet<String> {
|
||||
let path = dir.join(format!("seen-{}", session_id));
|
||||
if path.exists() {
|
||||
fs::read_to_string(path)
|
||||
.unwrap_or_default()
|
||||
.lines()
|
||||
.map(|s| s.to_string())
|
||||
.filter(|s| !s.is_empty())
|
||||
.map(|s| parse_seen_line(s).to_string())
|
||||
.collect()
|
||||
} else {
|
||||
HashSet::new()
|
||||
|
|
@ -390,7 +402,8 @@ fn load_seen(dir: &Path, session_id: &str) -> HashSet<String> {
|
|||
fn mark_seen(dir: &Path, session_id: &str, key: &str) {
|
||||
let path = dir.join(format!("seen-{}", session_id));
|
||||
if let Ok(mut f) = fs::OpenOptions::new().create(true).append(true).open(path) {
|
||||
writeln!(f, "{}", key).ok();
|
||||
let ts = chrono::Local::now().format("%Y-%m-%dT%H:%M:%S");
|
||||
writeln!(f, "{}\t{}", ts, key).ok();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -454,8 +467,29 @@ fn show_seen() {
|
|||
}
|
||||
}
|
||||
|
||||
let seen = load_seen(&state_dir, session_id);
|
||||
println!("\nSeen set ({} total, {} pre-seeded):", seen.len(), seen.len() - returned.len());
|
||||
// Read seen file in insertion order (append-only file)
|
||||
let seen_path = state_dir.join(format!("seen-{}", session_id));
|
||||
let seen_lines: Vec<String> = fs::read_to_string(&seen_path)
|
||||
.unwrap_or_default()
|
||||
.lines()
|
||||
.filter(|s| !s.is_empty())
|
||||
.map(|s| s.to_string())
|
||||
.collect();
|
||||
let returned_set: HashSet<_> = returned.iter().cloned().collect();
|
||||
println!("\nSeen set ({} total, {} pre-seeded):", seen_lines.len(), seen_lines.len() - returned.len());
|
||||
|
||||
if Args::parse().seen_full {
|
||||
for line in &seen_lines {
|
||||
let key = parse_seen_line(line);
|
||||
let marker = if returned_set.contains(key) { "→ " } else { " " };
|
||||
// Show timestamp if present, otherwise just key
|
||||
if let Some((ts, k)) = line.split_once('\t') {
|
||||
println!(" {} {}{}", ts, marker, k);
|
||||
} else {
|
||||
println!(" (no ts) {}{}", marker, line);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn cleanup_stale_files(dir: &Path, max_age: Duration) {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue