experience-mine: split oversized sessions at compaction boundaries

Claude Code doesn't create new session files on context compaction —
a single UUID can accumulate 170+ conversations, producing 400MB+
JSONL files that generate 1.3M-token prompts.

Split at compaction markers ("This session is being continued..."):
- extract_conversation made pub, split_on_compaction splits messages
- experience_mine takes optional segment index
- daemon watcher parses files, spawns per-segment jobs (.0, .1, .2, …)
- seg_cache memoizes segment counts across ticks
- per-segment dedup keys; whole-file key when all segments complete
- 150K token guard skips any remaining oversized segments
- char-boundary-safe truncation in enrich.rs and fact_mine.rs

Backwards compatible: unsegmented calls still write content-hash
dedup keys, and old whole-file mined keys are still recognized.
This commit is contained in:
ProofOfConcept 2026-03-07 12:01:38 -05:00
parent 22a9fdabdb
commit 45335de220
4 changed files with 155 additions and 39 deletions

View file

@ -13,7 +13,7 @@
// Phase 2 will inline job logic; Phase 3 integrates into poc-agent.
use jobkit::{Choir, ExecutionContext, ResourcePool, TaskError, TaskInfo, TaskStatus};
use std::collections::HashSet;
use std::collections::{HashMap, HashSet};
use std::fs;
use std::io::Write;
use std::path::{Path, PathBuf};
@ -97,13 +97,13 @@ fn run_job(ctx: &ExecutionContext, name: &str, f: impl FnOnce() -> Result<(), St
}
}
fn job_experience_mine(ctx: &ExecutionContext, path: &str) -> Result<(), TaskError> {
fn job_experience_mine(ctx: &ExecutionContext, path: &str, segment: Option<usize>) -> Result<(), TaskError> {
let path = path.to_string();
run_job(ctx, &format!("experience-mine {}", path), || {
ctx.log_line("loading store");
let mut store = crate::store::Store::load()?;
ctx.log_line("mining");
let count = crate::enrich::experience_mine(&mut store, &path)?;
let count = crate::enrich::experience_mine(&mut store, &path, segment)?;
ctx.log_line(&format!("{} entries mined", count));
Ok(())
})
@ -323,6 +323,8 @@ pub fn run_daemon() -> Result<(), String> {
let last_daily_sw = Arc::clone(&last_daily);
choir.spawn("session-watcher").init(move |ctx| {
ctx.set_progress("idle");
// Cache segment counts so we don't re-parse large files every tick
let mut seg_cache: HashMap<String, usize> = HashMap::new();
loop {
if ctx.is_cancelled() {
return Err(TaskError::Fatal("cancelled".into()));
@ -366,10 +368,13 @@ pub fn run_daemon() -> Result<(), String> {
let mut already_mined = 0;
let mut still_open = 0;
let total_stale = stale.len();
// Multi-segment files where all segments are done — write whole-file key
let mut mark_transcript_done: Vec<(String, String, usize)> = Vec::new();
// Classify each session
let mut needs_extract: Vec<(PathBuf, String, String)> = Vec::new();
let mut needs_fact: Vec<(PathBuf, String, String)> = Vec::new();
// Classify each session — now segment-aware
// Each entry: (filename, path_str, segment_index)
let mut needs_extract: Vec<(String, String, Option<usize>)> = Vec::new();
let mut needs_fact: Vec<(String, String)> = Vec::new();
for session in stale {
let filename = session.file_name()
@ -377,44 +382,84 @@ pub fn run_daemon() -> Result<(), String> {
.unwrap_or_else(|| "unknown".into());
let path_str = session.to_string_lossy().to_string();
// Skip if already in-flight
let extract_name = format!("extract:{}", filename);
let fact_name = format!("fact-mine:{}", filename);
if active.contains(&extract_name) || active.contains(&fact_name) {
continue;
}
// Check for old-style whole-file mined key
let experience_done = crate::enrich::is_transcript_mined_with_keys(&mined, &path_str);
let fact_key = format!("_facts-{}", filename.trim_end_matches(".jsonl"));
let fact_done = fact_keys.contains(&fact_key);
if !experience_done {
if is_file_open(&session) {
still_open += 1;
} else {
needs_extract.push((session, filename, path_str));
continue;
}
// Get segment count, using cache to avoid re-parsing large files
let seg_count = if let Some(&cached) = seg_cache.get(&path_str) {
cached
} else {
let messages = match crate::enrich::extract_conversation(&path_str) {
Ok(m) => m,
Err(_) => continue,
};
let count = crate::enrich::split_on_compaction(messages).len();
seg_cache.insert(path_str.clone(), count);
count
};
if seg_count <= 1 {
let task_name = format!("extract:{}", filename);
if !active.contains(&task_name) {
needs_extract.push((filename, path_str, None));
}
} else {
// Multi-segment — find unmined segments
let fname_key = crate::enrich::transcript_filename_key(&path_str);
let mut unmined = 0;
for i in 0..seg_count {
let seg_key = format!("{}.{}", fname_key, i);
if mined.contains(&seg_key) { continue; }
unmined += 1;
let task_name = format!("extract:{}.{}", filename, i);
if active.contains(&task_name) { continue; }
needs_extract.push((
format!("{}.{}", filename, i),
path_str.clone(),
Some(i),
));
}
if unmined == 0 {
// All segments done — write whole-file key so we skip next tick
mark_transcript_done.push((fname_key, path_str.clone(), seg_count));
already_mined += 1;
}
}
} else if !fact_done {
needs_fact.push((session, filename, path_str));
} else {
already_mined += 1;
let fact_key = format!("_facts-{}", filename.trim_end_matches(".jsonl"));
let fact_done = fact_keys.contains(&fact_key);
if !fact_done {
let task_name = format!("fact-mine:{}", filename);
if !active.contains(&task_name) {
needs_fact.push((filename, path_str));
}
} else {
already_mined += 1;
}
}
}
// Spawn experience-mine jobs (priority)
for (_, filename, path_str) in &needs_extract {
for (task_label, path_str, segment) in &needs_extract {
if extract_queued >= MAX_NEW_PER_TICK {
extract_remaining += 1;
continue;
}
let task_name = format!("extract:{}", filename);
log_event("extract", "queued", path_str);
let task_name = format!("extract:{}", task_label);
log_event("extract", "queued", &task_name);
let path = path_str.clone();
let seg = *segment;
choir_sw.spawn(task_name)
.resource(&llm_sw)
.retries(2)
.init(move |ctx| {
job_experience_mine(ctx, &path)
job_experience_mine(ctx, &path, seg)
});
extract_queued += 1;
}
@ -423,7 +468,7 @@ pub fn run_daemon() -> Result<(), String> {
let mut fact_queued = 0;
if needs_extract.len() == extract_queued {
let fact_budget = MAX_NEW_PER_TICK.saturating_sub(extract_queued);
for (_, filename, path_str) in &needs_fact {
for (filename, path_str) in &needs_fact {
if fact_queued >= fact_budget {
fact_remaining += 1;
continue;
@ -443,6 +488,21 @@ pub fn run_daemon() -> Result<(), String> {
fact_remaining = needs_fact.len();
}
// Write whole-file keys for fully-mined multi-segment files
if !mark_transcript_done.is_empty() {
if let Ok(mut store) = crate::store::Store::load() {
for (fname_key, path_str, seg_count) in &mark_transcript_done {
let content = format!("All {} segments mined for {}", seg_count, path_str);
let mut node = crate::store::new_node(fname_key, &content);
node.category = crate::store::Category::Task;
node.provenance = crate::store::Provenance::AgentExperienceMine;
let _ = store.upsert_node(node);
seg_cache.remove(path_str);
}
let _ = store.save();
}
}
let extract_pending = extract_queued + extract_remaining;
let fact_pending = fact_queued + fact_remaining;
if extract_pending > 0 || fact_pending > 0 || still_open > 0 {