observation: chunk large transcripts, remove format_segment limit
Large conversation segments are now split into 50KB chunks with 10KB of overlap, instead of being truncated to 8000 chars (the truncation was broken anyway: it stopped only after the limit had already been exceeded, not before). Each chunk gets its own candidate ID for independent mining and dedup. format_segment is simplified: it no longer takes a size limit, and it now adds timestamps to its output so the observation agent can cross-reference with journal entries. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
03310dafa4
commit
15d4bfa01f
1 changed file with 40 additions and 8 deletions
|
|
@ -773,10 +773,41 @@ pub fn select_conversation_fragments(n: usize) -> Vec<(String, String)> {
|
||||||
if has_rate_limit && assistant_msgs < 3 {
|
if has_rate_limit && assistant_msgs < 3 {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
let text = format_segment(&segment, 8000);
|
let text = format_segment(&segment);
|
||||||
if text.len() > 500 {
|
if text.len() < 500 {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
const CHUNK_SIZE: usize = 50_000;
|
||||||
|
const OVERLAP: usize = 10_000;
|
||||||
|
if text.len() <= CHUNK_SIZE {
|
||||||
let id = format!("{}.{}", session_id, seg_idx);
|
let id = format!("{}.{}", session_id, seg_idx);
|
||||||
candidates.push((id, text));
|
candidates.push((id, text));
|
||||||
|
} else {
|
||||||
|
// Split on line boundaries with overlap
|
||||||
|
let lines: Vec<&str> = text.lines().collect();
|
||||||
|
let mut start_line = 0;
|
||||||
|
let mut chunk_idx = 0;
|
||||||
|
while start_line < lines.len() {
|
||||||
|
let mut end_line = start_line;
|
||||||
|
let mut size = 0;
|
||||||
|
while end_line < lines.len() && size < CHUNK_SIZE {
|
||||||
|
size += lines[end_line].len() + 1;
|
||||||
|
end_line += 1;
|
||||||
|
}
|
||||||
|
let chunk: String = lines[start_line..end_line].join("\n");
|
||||||
|
let id = format!("{}.{}.{}", session_id, seg_idx, chunk_idx);
|
||||||
|
candidates.push((id, chunk));
|
||||||
|
if end_line >= lines.len() { break; }
|
||||||
|
// Back up by overlap amount for next chunk
|
||||||
|
let mut overlap_size = 0;
|
||||||
|
let mut overlap_start = end_line;
|
||||||
|
while overlap_start > start_line && overlap_size < OVERLAP {
|
||||||
|
overlap_start -= 1;
|
||||||
|
overlap_size += lines[overlap_start].len() + 1;
|
||||||
|
}
|
||||||
|
start_line = overlap_start;
|
||||||
|
chunk_idx += 1;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -803,19 +834,20 @@ pub fn mark_observation_done(fragment_ids: &[String]) {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Format a segment's messages into readable text for the observation agent.
|
/// Format a segment's messages into readable text for the observation agent.
|
||||||
fn format_segment(messages: &[(usize, String, String, String)], max_chars: usize) -> String {
|
fn format_segment(messages: &[(usize, String, String, String)]) -> String {
|
||||||
let cfg = crate::config::get();
|
let cfg = crate::config::get();
|
||||||
let mut fragments = Vec::new();
|
let mut fragments = Vec::new();
|
||||||
let mut total = 0;
|
|
||||||
|
|
||||||
for (_, role, text, _) in messages {
|
for (_, role, text, ts) in messages {
|
||||||
let min_len = if role == "user" { 5 } else { 10 };
|
let min_len = if role == "user" { 5 } else { 10 };
|
||||||
if text.len() <= min_len { continue; }
|
if text.len() <= min_len { continue; }
|
||||||
|
|
||||||
let name = if role == "user" { &cfg.user_name } else { &cfg.assistant_name };
|
let name = if role == "user" { &cfg.user_name } else { &cfg.assistant_name };
|
||||||
fragments.push(format!("**{}:** {}", name, text));
|
if ts.is_empty() {
|
||||||
total += text.len();
|
fragments.push(format!("**{}:** {}", name, text));
|
||||||
if total > max_chars { break; }
|
} else {
|
||||||
|
fragments.push(format!("**{}** {}: {}", name, &ts[..ts.len().min(19)], text));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
fragments.join("\n\n")
|
fragments.join("\n\n")
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue