observation: chunk large transcripts, remove format_segment limit

Large conversation segments are now split into 50KB chunks with 10KB
overlap, instead of being truncated to 8000 chars (the truncation was
broken anyway — the loop only broke out after the limit had already
been exceeded, not before). Each chunk gets its own candidate ID for
independent mining and dedup.

format_segment simplified: no size limit, and timestamps are added to
the output so the observation agent can cross-reference with journal
entries.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Kent Overstreet 2026-03-16 20:52:20 -04:00
parent 03310dafa4
commit 15d4bfa01f

View file

@@ -773,10 +773,41 @@ pub fn select_conversation_fragments(n: usize) -> Vec<(String, String)> {
if has_rate_limit && assistant_msgs < 3 { if has_rate_limit && assistant_msgs < 3 {
continue; continue;
} }
let text = format_segment(&segment, 8000); let text = format_segment(&segment);
if text.len() > 500 { if text.len() < 500 {
continue;
}
const CHUNK_SIZE: usize = 50_000;
const OVERLAP: usize = 10_000;
if text.len() <= CHUNK_SIZE {
let id = format!("{}.{}", session_id, seg_idx); let id = format!("{}.{}", session_id, seg_idx);
candidates.push((id, text)); candidates.push((id, text));
} else {
// Split on line boundaries with overlap
let lines: Vec<&str> = text.lines().collect();
let mut start_line = 0;
let mut chunk_idx = 0;
while start_line < lines.len() {
let mut end_line = start_line;
let mut size = 0;
while end_line < lines.len() && size < CHUNK_SIZE {
size += lines[end_line].len() + 1;
end_line += 1;
}
let chunk: String = lines[start_line..end_line].join("\n");
let id = format!("{}.{}.{}", session_id, seg_idx, chunk_idx);
candidates.push((id, chunk));
if end_line >= lines.len() { break; }
// Back up by overlap amount for next chunk
let mut overlap_size = 0;
let mut overlap_start = end_line;
while overlap_start > start_line && overlap_size < OVERLAP {
overlap_start -= 1;
overlap_size += lines[overlap_start].len() + 1;
}
start_line = overlap_start;
chunk_idx += 1;
}
} }
} }
@@ -803,19 +834,20 @@ pub fn mark_observation_done(fragment_ids: &[String]) {
} }
/// Format a segment's messages into readable text for the observation agent. /// Format a segment's messages into readable text for the observation agent.
fn format_segment(messages: &[(usize, String, String, String)], max_chars: usize) -> String { fn format_segment(messages: &[(usize, String, String, String)]) -> String {
let cfg = crate::config::get(); let cfg = crate::config::get();
let mut fragments = Vec::new(); let mut fragments = Vec::new();
let mut total = 0;
for (_, role, text, _) in messages { for (_, role, text, ts) in messages {
let min_len = if role == "user" { 5 } else { 10 }; let min_len = if role == "user" { 5 } else { 10 };
if text.len() <= min_len { continue; } if text.len() <= min_len { continue; }
let name = if role == "user" { &cfg.user_name } else { &cfg.assistant_name }; let name = if role == "user" { &cfg.user_name } else { &cfg.assistant_name };
fragments.push(format!("**{}:** {}", name, text)); if ts.is_empty() {
total += text.len(); fragments.push(format!("**{}:** {}", name, text));
if total > max_chars { break; } } else {
fragments.push(format!("**{}** {}: {}", name, &ts[..ts.len().min(19)], text));
}
} }
fragments.join("\n\n") fragments.join("\n\n")
} }