experience-mine: split oversized sessions at compaction boundaries

Claude Code doesn't create new session files on context compaction —
a single UUID can accumulate 170+ conversations, producing 400MB+
JSONL files that generate 1.3M token prompts.

Split at compaction markers ("This session is being continued..."):
- extract_conversation made pub, split_on_compaction splits messages
- experience_mine takes optional segment index
- daemon watcher parses files, spawns per-segment jobs (.0, .1, .2)
- seg_cache memoizes segment counts across ticks
- per-segment dedup keys; whole-file key when all segments complete
- 150K token guard skips any remaining oversized segments
- char-boundary-safe truncation in enrich.rs and fact_mine.rs

Backwards compatible: unsegmented calls still write content-hash
dedup keys, old whole-file mined keys still recognized.
This commit is contained in:
ProofOfConcept 2026-03-07 12:01:38 -05:00
parent 22a9fdabdb
commit 45335de220
4 changed files with 155 additions and 39 deletions

View file

@@ -85,7 +85,7 @@ pub fn is_transcript_mined_with_keys(mined: &HashSet<String>, path: &str) -> boo
/// Extract user/assistant messages with line numbers from a JSONL transcript.
/// (line_number, role, text, timestamp)
fn extract_conversation(jsonl_path: &str) -> Result<Vec<(usize, String, String, String)>, String> {
pub fn extract_conversation(jsonl_path: &str) -> Result<Vec<(usize, String, String, String)>, String> {
let content = fs::read_to_string(jsonl_path)
.map_err(|e| format!("read {}: {}", jsonl_path, e))?;
@@ -135,6 +135,33 @@ fn extract_conversation(jsonl_path: &str) -> Result<Vec<(usize, String, String,
Ok(messages)
}
pub const COMPACTION_MARKER: &str = "This session is being continued from a previous conversation that ran out of context";
/// Split extracted messages into segments at compaction boundaries.
/// Each segment represents one continuous conversation before context was compacted.
///
/// A boundary is a user message whose text begins with [`COMPACTION_MARKER`];
/// that message starts (and belongs to) the new segment. Empty input yields no segments.
pub fn split_on_compaction(messages: Vec<(usize, String, String, String)>) -> Vec<Vec<(usize, String, String, String)>> {
    let mut segments: Vec<Vec<(usize, String, String, String)>> = Vec::new();
    let mut current: Vec<(usize, String, String, String)> = Vec::new();
    for msg in messages {
        // Tuple layout is (line_number, role, text, timestamp).
        let at_boundary = msg.1 == "user" && msg.2.starts_with(COMPACTION_MARKER);
        if at_boundary && !current.is_empty() {
            // Close out the previous segment; take() leaves `current` empty
            // so the continuation message below starts the new segment.
            segments.push(std::mem::take(&mut current));
        }
        // The continuation message itself is part of the new segment.
        current.push(msg);
    }
    // Flush the trailing segment, if any messages remain.
    if !current.is_empty() {
        segments.push(current);
    }
    segments
}
/// Format conversation messages for the prompt (truncating long messages).
fn format_conversation(messages: &[(usize, String, String, String)]) -> String {
messages.iter()
@@ -259,9 +286,11 @@ pub fn journal_enrich(
}
/// Mine a conversation transcript for experiential moments not yet journaled.
/// If `segment` is Some, only process that compaction segment of the file.
pub fn experience_mine(
store: &mut Store,
jsonl_path: &str,
segment: Option<usize>,
) -> Result<usize, String> {
println!("Experience mining: {}", jsonl_path);
@@ -287,7 +316,18 @@ pub fn experience_mine(
return Ok(0);
}
let messages = extract_conversation(jsonl_path)?;
let all_messages = extract_conversation(jsonl_path)?;
// If segment is specified, extract just that segment; otherwise process all messages
let messages = match segment {
Some(idx) => {
let segments = split_on_compaction(all_messages);
segments.into_iter().nth(idx)
.ok_or_else(|| format!("segment {} out of range", idx))?
}
None => all_messages,
};
let conversation = format_conversation(&messages);
println!(" {} messages, {} chars", messages.len(), conversation.len());
@@ -327,7 +367,13 @@ pub fn experience_mine(
("{{KEYS}}", &keys_text),
("{{CONVERSATION}}", &conversation),
])?;
println!(" Prompt: {} chars (~{} tokens)", prompt.len(), prompt.len() / 4);
let est_tokens = prompt.len() / 4;
println!(" Prompt: {} chars (~{} tokens)", prompt.len(), est_tokens);
if est_tokens > 150_000 {
println!(" Skipping: prompt too large ({} tokens > 150k limit)", est_tokens);
return Ok(0);
}
println!(" Calling Sonnet...");
let response = call_sonnet("experience-mine", &prompt)?;
@@ -389,24 +435,34 @@ pub fn experience_mine(
let _ = store.upsert_node(node);
count += 1;
let preview = if content.len() > 80 { &content[..77] } else { content };
let preview = if content.len() > 80 {
let end = content.floor_char_boundary(77);
&content[..end]
} else {
content
};
println!(" + [{}] {}...", ts, preview);
}
// Record this transcript as mined (even if count == 0, to prevent re-runs)
// Two keys: content hash (exact dedup) and filename (fast daemon reconcile)
// Record this transcript/segment as mined (even if count == 0, to prevent re-runs)
let fname_key = match segment {
Some(idx) => format!("{}.{}", transcript_filename_key(jsonl_path), idx),
None => transcript_filename_key(jsonl_path),
};
let dedup_content = format!("Mined {} ({} entries)", jsonl_path, count);
let mut dedup_node = new_node(&dedup_key, &dedup_content);
dedup_node.category = store::Category::Task;
dedup_node.provenance = store::Provenance::AgentExperienceMine;
let _ = store.upsert_node(dedup_node);
let fname_key = transcript_filename_key(jsonl_path);
let mut fname_node = new_node(&fname_key, &dedup_content);
fname_node.category = store::Category::Task;
fname_node.provenance = store::Provenance::AgentExperienceMine;
let _ = store.upsert_node(fname_node);
// For unsegmented calls, also write the content-hash key for backwards compat
if segment.is_none() {
let mut dedup_node = new_node(&dedup_key, &dedup_content);
dedup_node.category = store::Category::Task;
dedup_node.provenance = store::Provenance::AgentExperienceMine;
let _ = store.upsert_node(dedup_node);
}
if count > 0 {
println!(" Saved {} new journal entries.", count);
}