From 15d4bfa01f63199be39fb86934c49ff28749259c Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Mon, 16 Mar 2026 20:52:20 -0400
Subject: [PATCH] observation: chunk large transcripts, remove format_segment limit
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Large conversation segments are now split into 50KB chunks with 10KB
overlap, instead of being truncated to 8000 chars (which was broken
anyway — broke after exceeding, not before). Each chunk gets its own
candidate ID for independent mining and dedup.

format_segment simplified: no size limit, added timestamps to output
so observation agent can cross-reference with journal entries.

Co-Authored-By: Claude Opus 4.6 (1M context)
---
Review notes (below the "---" separator, so not part of the commit message):

- select_conversation_fragments: the overlap back-up could walk all the
  way back to start_line when a chunk is dominated by one very long line
  (more than roughly CHUNK_SIZE - OVERLAP bytes). start_line then never
  advanced and the loop pushed the same chunk forever. The next start is
  now clamped to at least start_line + 1.
- format_segment: &ts[..ts.len().min(19)] panics if byte 19 is not a
  UTF-8 char boundary. ts.get(..19).unwrap_or(ts) falls back to the whole
  timestamp instead of panicking.

 poc-memory/src/agents/knowledge.rs | 48 +++++++++++++++++++++++++-----
 1 file changed, 40 insertions(+), 8 deletions(-)

diff --git a/poc-memory/src/agents/knowledge.rs b/poc-memory/src/agents/knowledge.rs
index 1c3d13f..7c71121 100644
--- a/poc-memory/src/agents/knowledge.rs
+++ b/poc-memory/src/agents/knowledge.rs
@@ -773,10 +773,41 @@ pub fn select_conversation_fragments(n: usize) -> Vec<(String, String)> {
             if has_rate_limit && assistant_msgs < 3 {
                 continue;
             }
-            let text = format_segment(&segment, 8000);
-            if text.len() > 500 {
+            let text = format_segment(&segment);
+            if text.len() < 500 {
+                continue;
+            }
+            const CHUNK_SIZE: usize = 50_000;
+            const OVERLAP: usize = 10_000;
+            if text.len() <= CHUNK_SIZE {
                 let id = format!("{}.{}", session_id, seg_idx);
                 candidates.push((id, text));
+            } else {
+                // Split on line boundaries with overlap
+                let lines: Vec<&str> = text.lines().collect();
+                let mut start_line = 0;
+                let mut chunk_idx = 0;
+                while start_line < lines.len() {
+                    let mut end_line = start_line;
+                    let mut size = 0;
+                    while end_line < lines.len() && size < CHUNK_SIZE {
+                        size += lines[end_line].len() + 1;
+                        end_line += 1;
+                    }
+                    let chunk: String = lines[start_line..end_line].join("\n");
+                    let id = format!("{}.{}.{}", session_id, seg_idx, chunk_idx);
+                    candidates.push((id, chunk));
+                    if end_line >= lines.len() { break; }
+                    // Back up by overlap amount for next chunk, but always advance
+                    let mut overlap_size = 0;
+                    let mut overlap_start = end_line;
+                    while overlap_start > start_line && overlap_size < OVERLAP {
+                        overlap_start -= 1;
+                        overlap_size += lines[overlap_start].len() + 1;
+                    }
+                    start_line = overlap_start.max(start_line + 1);
+                    chunk_idx += 1;
+                }
             }
         }
 
@@ -803,19 +834,20 @@ pub fn mark_observation_done(fragment_ids: &[String]) {
 }
 
 /// Format a segment's messages into readable text for the observation agent.
-fn format_segment(messages: &[(usize, String, String, String)], max_chars: usize) -> String {
+fn format_segment(messages: &[(usize, String, String, String)]) -> String {
     let cfg = crate::config::get();
     let mut fragments = Vec::new();
-    let mut total = 0;
-    for (_, role, text, _) in messages {
+    for (_, role, text, ts) in messages {
         let min_len = if role == "user" { 5 } else { 10 };
         if text.len() <= min_len {
             continue;
         }
         let name = if role == "user" { &cfg.user_name } else { &cfg.assistant_name };
-        fragments.push(format!("**{}:** {}", name, text));
-        total += text.len();
-        if total > max_chars { break; }
+        if ts.is_empty() {
+            fragments.push(format!("**{}:** {}", name, text));
+        } else {
+            fragments.push(format!("**{}** {}: {}", name, ts.get(..19).unwrap_or(ts), text));
+        }
     }
     fragments.join("\n\n")
 }