From db48d579171eab0e6a4296886e606d010c94acca Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 21 Mar 2026 15:55:30 -0400 Subject: [PATCH] refactor: extract context window building into context.rs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move context window construction (build_context_window, plan_context, render_journal_text, assemble_context), token counting, error classification, and related helpers from agent.rs into context.rs. All extracted functions are pure — they take inputs and return values with no mutable state access. State mutation stays in agent.rs (compact, restore_from_log, load_startup_journal). agent.rs: 1504 → 987 lines (-517) context.rs: 365 lines (new) Net: -152 lines (duplicate comments removed) Co-Authored-By: Claude Opus 4.6 (1M context) --- poc-agent/src/agent.rs | 436 +-------------------------------------- poc-agent/src/context.rs | 253 +++++++++++++++++------ poc-agent/src/main.rs | 5 +- 3 files changed, 206 insertions(+), 488 deletions(-) diff --git a/poc-agent/src/agent.rs b/poc-agent/src/agent.rs index 9040491..f5a7ec0 100644 --- a/poc-agent/src/agent.rs +++ b/poc-agent/src/agent.rs @@ -14,7 +14,6 @@ // in, response out, tool calls dispatched. use anyhow::Result; -use chrono::{DateTime, Utc}; use tiktoken_rs::CoreBPE; use std::io::Write; @@ -54,10 +53,6 @@ struct DispatchState { dmn_pause: bool, } -/// Mutable context state — the structured regions of the context window. -/// -/// Each field is a different dimension of awareness. The struct renders - pub struct Agent { client: ApiClient, messages: Vec, @@ -196,7 +191,7 @@ impl Agent { let mut in_conversation = false; for msg in &self.messages { - let tokens = msg_token_count(&self.tokenizer, msg); + let tokens = crate::context::msg_token_count(&self.tokenizer, msg); if in_conversation { conv_tokens += tokens; @@ -231,7 +226,7 @@ impl Agent { memory_tokens: mem_tokens, journal_tokens: jnl_tokens, conversation_tokens: conv_tokens, - window_tokens: model_context_window(&self.client.model), + window_tokens: crate::context::model_context_window(&self.client.model), }; } @@ -279,7 +274,7 @@ impl Agent { // Context overflow → compact and retry (max 2 attempts) // Stream error → retry with backoff (max 2 attempts) let (msg, usage) = match api_result { - Err(e) if is_context_overflow(&e) && overflow_retries < 2 => { + Err(e) if crate::context::is_context_overflow(&e) && overflow_retries < 2 => { overflow_retries += 1; let _ = ui_tx.send(UiMessage::Info(format!( "[context overflow — compacting and retrying ({}/2)]", @@ -288,7 +283,7 @@ impl Agent { self.emergency_compact(); continue; } - Err(e) if is_stream_error(&e) && empty_retries < 2 => { + Err(e) if crate::context::is_stream_error(&e) && empty_retries < 2 => { empty_retries += 1; let _ = ui_tx.send(UiMessage::Info(format!( "[stream error: {} — retrying ({}/2)]", @@ -651,7 +646,7 @@ impl Agent { let count = |s: &str| self.tokenizer.encode_with_special_tokens(s).len(); let context_message = self.context.render_context_message(); - let plan = plan_context( + let plan = crate::context::plan_context( &self.context.system_prompt, &context_message, &[], // no conversation yet @@ -660,7 +655,7 @@ impl Agent { &count, ); - self.context.journal = render_journal_text(&entries, &plan); + self.context.journal = crate::context::render_journal_text(&entries, &plan); } /// Re-render the context message in self.messages from live ContextState. @@ -820,7 +815,7 @@ impl Agent { .unwrap_or(self.messages.len()); let conversation: Vec = self.messages[conv_start..].to_vec(); - let (messages, journal) = build_context_window( + let (messages, journal) = crate::context::build_context_window( &self.context, &conversation, &self.client.model, @@ -880,7 +875,7 @@ impl Agent { .collect(); dbglog!("[restore] {} messages after filtering system", conversation.len()); - let (messages, journal) = build_context_window( + let (messages, journal) = crate::context::build_context_window( &self.context, &conversation, &self.client.model, @@ -923,420 +918,9 @@ impl Agent { } } -/// Look up a model's context window size in tokens. -pub fn model_context_window(model: &str) -> usize { - let m = model.to_lowercase(); - if m.contains("opus") || m.contains("sonnet") { - 200_000 - } else if m.contains("qwen") { - 131_072 - } else { - 128_000 - } -} +// Context window building, token counting, and error classification +// live in context.rs -/// Context budget in tokens: 60% of the model's context window. -/// Leaves headroom for conversation to grow before compaction triggers. -/// -/// Future direction: make this dynamic based on what the agent is -/// doing — deep coding work might allocate more to conversation, -/// consolidation might allocate more to journal/memory, idle might -/// shrink everything to save cost. -fn context_budget_tokens(model: &str) -> usize { - model_context_window(model) * 60 / 100 -} - -/// Allocation plan for the context window. Separates the budget math -/// (which entries and messages to include) from the message assembly -/// (building the actual Vec). This makes the core algorithm -/// testable and inspectable — log the plan on compaction to see exactly -/// what allocation decisions were made. -struct ContextPlan { - /// Index into all_entries: header-only entries start here - header_start: usize, - /// Index into all_entries: full entries start here (headers end here) - full_start: usize, - /// Total journal entries (header-only + full go up to this) - entry_count: usize, - /// Index into recent conversation: skip messages before this - conv_trim: usize, - /// Total recent conversation messages - _conv_count: usize, - /// Tokens used by full journal entries - _full_tokens: usize, - /// Tokens used by header-only journal entries - _header_tokens: usize, - /// Tokens used by conversation (after trimming) - _conv_tokens: usize, - /// Total budget available (after identity, memory, reserve) - _available: usize, -} - -/// Build a context window from conversation messages + journal entries. -/// This is the core algorithm shared by compact() and restore_from_log(). -/// -/// Allocation strategy: identity and memory are fixed costs. The -/// remaining budget (minus 25% reserve for model output) is split -/// between journal and conversation. Conversation gets priority — -/// it's what's happening now. Journal fills the rest, newest first. -/// -/// When the budget is tight, journal entries are dropped first -/// (oldest entries go first). If conversation alone exceeds the -/// budget, oldest messages are trimmed to fit. -/// Returns (messages, journal_text) — caller stores journal_text in ContextState. -fn build_context_window( - context: &ContextState, - conversation: &[Message], - model: &str, - tokenizer: &CoreBPE, -) -> (Vec, String) { - let journal_path = journal::default_journal_path(); - let all_entries = journal::parse_journal(&journal_path); - dbglog!("[ctx] {} journal entries from {}", all_entries.len(), journal_path.display()); - let count = |s: &str| tokenizer.encode_with_special_tokens(s).len(); - - let system_prompt = context.system_prompt.clone(); - let context_message = context.render_context_message(); - - // Cap memory to 50% of the context budget so conversation always - // gets space. Truncate at the last complete section boundary. - let max_tokens = context_budget_tokens(model); - let memory_cap = max_tokens / 2; - let memory_tokens = count(&context_message); - let context_message = if memory_tokens > memory_cap { - dbglog!("[ctx] memory too large: {} tokens > {} cap, truncating", memory_tokens, memory_cap); - truncate_at_section(&context_message, memory_cap, &count) - } else { - context_message - }; - - let recent_start = find_journal_cutoff(conversation, all_entries.last()); - dbglog!("[ctx] journal cutoff: {} of {} conversation messages are 'recent'", - conversation.len() - recent_start, conversation.len()); - let recent = &conversation[recent_start..]; - - let plan = plan_context( - &system_prompt, - &context_message, - recent, - &all_entries, - model, - &count, - ); - - // Render journal text from the plan - let journal_text = render_journal_text(&all_entries, &plan); - dbglog!("[ctx] plan: header_start={} full_start={} entry_count={} conv_trim={} journal_text={} chars", - plan.header_start, plan.full_start, plan.entry_count, plan.conv_trim, journal_text.len()); - - let messages = assemble_context( - system_prompt, context_message, &journal_text, - recent, &plan, - ); - (messages, journal_text) -} - -/// Compute the allocation plan: how much budget goes to journal vs -/// conversation, which entries and messages to include. -fn plan_context( - system_prompt: &str, - context_message: &str, - recent: &[Message], - entries: &[journal::JournalEntry], - model: &str, - count: &dyn Fn(&str) -> usize, -) -> ContextPlan { - let max_tokens = context_budget_tokens(model); - - // Fixed costs — always included - let identity_cost = count(system_prompt); - let memory_cost = count(context_message); - let reserve = max_tokens / 4; - let available = max_tokens - .saturating_sub(identity_cost) - .saturating_sub(memory_cost) - .saturating_sub(reserve); - - // Measure conversation - let conv_costs: Vec = recent.iter().map(|m| msg_token_count_fn(m, count)).collect(); - let total_conv: usize = conv_costs.iter().sum(); - - // Journal always gets at least 15% of available budget so it doesn't - // get squeezed out by large conversations. - let journal_min = available * 15 / 100; - let journal_budget = available.saturating_sub(total_conv).max(journal_min); - - // Fill journal entries newest-first within budget. - // Tiered: recent entries get full content, older entries get just - // a header line (timestamp + first line) for timeline awareness. - let full_budget = journal_budget * 70 / 100; - let header_budget = journal_budget.saturating_sub(full_budget); - - // Phase 1: Full entries (newest first) - let mut full_used = 0; - let mut n_full = 0; - for entry in entries.iter().rev() { - let cost = count(&entry.content) + 10; - if full_used + cost > full_budget { - break; - } - full_used += cost; - n_full += 1; - } - let full_start = entries.len().saturating_sub(n_full); - - // Phase 2: Header-only entries (continuing backward from where full stopped) - let mut header_used = 0; - let mut n_headers = 0; - for entry in entries[..full_start].iter().rev() { - let first_line = entry - .content - .lines() - .find(|l| !l.trim().is_empty()) - .unwrap_or("(empty)"); - let cost = count(first_line) + 10; - if header_used + cost > header_budget { - break; - } - header_used += cost; - n_headers += 1; - } - let header_start = full_start.saturating_sub(n_headers); - - // If conversation exceeds available budget, trim oldest messages - let journal_used = full_used + header_used; - let mut conv_trim = 0; - let mut trimmed_conv = total_conv; - while trimmed_conv + journal_used > available && conv_trim < recent.len() { - trimmed_conv -= conv_costs[conv_trim]; - conv_trim += 1; - } - // Walk forward to user message boundary - while conv_trim < recent.len() && recent[conv_trim].role != Role::User { - conv_trim += 1; - } - - dbglog!("[plan] model={} max_tokens={} available={} (identity={} memory={} reserve={})", - model, max_tokens, available, identity_cost, memory_cost, reserve); - dbglog!("[plan] conv: {} msgs, {} tokens total, trimming {} msgs → {} tokens", - recent.len(), total_conv, conv_trim, trimmed_conv); - dbglog!("[plan] journal: {} full entries ({}t) + {} headers ({}t)", - n_full, full_used, n_headers, header_used); - - ContextPlan { - header_start, - full_start, - entry_count: entries.len(), - conv_trim, - _conv_count: recent.len(), - _full_tokens: full_used, - _header_tokens: header_used, - _conv_tokens: trimmed_conv, - _available: available, - } -} - -/// Render journal entries into text from a context plan. -fn render_journal_text( - entries: &[journal::JournalEntry], - plan: &ContextPlan, -) -> String { - let has_journal = plan.header_start < plan.entry_count; - if !has_journal { - return String::new(); - } - - let mut text = String::from("[Earlier in this conversation — from your journal]\n\n"); - - // Header-only entries (older) — just timestamp + first line - for entry in &entries[plan.header_start..plan.full_start] { - let first_line = entry - .content - .lines() - .find(|l| !l.trim().is_empty()) - .unwrap_or("(empty)"); - text.push_str(&format!( - "## {} — {}\n", - entry.timestamp.format("%Y-%m-%dT%H:%M"), - first_line, - )); - } - - // Separator between headers and full entries - let n_headers = plan.full_start - plan.header_start; - let n_full = plan.entry_count - plan.full_start; - if n_headers > 0 && n_full > 0 { - text.push_str("\n---\n\n"); - } - - // Full entries (recent) - for entry in &entries[plan.full_start..] { - text.push_str(&format!( - "## {}\n\n{}\n\n", - entry.timestamp.format("%Y-%m-%dT%H:%M"), - entry.content - )); - } - - text -} - -/// Assemble the context window from a plan. No allocation decisions -/// happen here — just follow the plan to build messages. -fn assemble_context( - system_prompt: String, - context_message: String, - journal_text: &str, - recent: &[Message], - plan: &ContextPlan, -) -> Vec { - let mut messages = vec![Message::system(system_prompt)]; - if !context_message.is_empty() { - messages.push(Message::user(context_message)); - } - - let final_recent = &recent[plan.conv_trim..]; - - if !journal_text.is_empty() { - messages.push(Message::user(journal_text.to_string())); - } else if !final_recent.is_empty() { - messages.push(Message::user( - "Your context was just rebuilt. Memory files have been \ - reloaded. Your recent conversation continues below. \ - Earlier context is in your journal and memory files." - .to_string(), - )); - } - - messages.extend(final_recent.iter().cloned()); - messages -} - -/// Find the conversation index where messages are no longer covered -/// Truncate a context message to fit within a token budget. Cuts at -/// section boundaries (lines starting with `---` or `## `) to avoid -/// splitting mid-section. Drops sections from the end first since -/// earlier sections (identity, instructions) matter more. -fn truncate_at_section(text: &str, max_tokens: usize, count: &dyn Fn(&str) -> usize) -> String { - // Find section boundaries (--- separators between assembled parts) - let mut boundaries = vec![0usize]; - for (i, line) in text.lines().enumerate() { - if line.trim() == "---" || line.starts_with("## ") { - // Find byte offset of this line - let offset = text.lines().take(i).map(|l| l.len() + 1).sum::(); - boundaries.push(offset); - } - } - boundaries.push(text.len()); - - // Binary search: find the largest prefix of sections that fits - let mut best = 0; - for &end in &boundaries[1..] { - let slice = &text[..end]; - if count(slice) <= max_tokens { - best = end; - } else { - break; - } - } - - if best == 0 { - // Even the first section doesn't fit — hard truncate - best = text.len().min(max_tokens * 3); // ~3 chars/token rough estimate - } - - let truncated = &text[..best]; - dbglog!("[ctx] truncated memory from {} to {} chars ({} tokens)", - text.len(), truncated.len(), count(truncated)); - truncated.to_string() -} - -/// by journal entries. Messages before this index are summarized by -/// the journal; messages from this index onward stay as raw conversation. -/// Walks back to a user message boundary to avoid splitting tool -/// call/result sequences. -fn find_journal_cutoff( - conversation: &[Message], - newest_entry: Option<&journal::JournalEntry>, -) -> usize { - let cutoff = match newest_entry { - Some(entry) => entry.timestamp, - None => return 0, - }; - - let mut split = conversation.len(); - for (i, msg) in conversation.iter().enumerate() { - if let Some(ts) = parse_msg_timestamp(msg) { - if ts > cutoff { - split = i; - break; - } - } - } - // Walk back to user message boundary - while split > 0 && split < conversation.len() && conversation[split].role != Role::User { - split -= 1; - } - split -} - -/// Count the token footprint of a message using a token counting function. -fn msg_token_count_fn(msg: &Message, count: &dyn Fn(&str) -> usize) -> usize { - let content = msg.content.as_ref().map_or(0, |c| match c { - MessageContent::Text(s) => count(s), - MessageContent::Parts(parts) => parts - .iter() - .map(|p| match p { - ContentPart::Text { text } => count(text), - ContentPart::ImageUrl { .. } => 85, - }) - .sum(), - }); - let tools = msg.tool_calls.as_ref().map_or(0, |calls| { - calls - .iter() - .map(|c| count(&c.function.arguments) + count(&c.function.name)) - .sum() - }); - content + tools -} - -/// Count the token footprint of a message using BPE tokenization. -fn msg_token_count(tokenizer: &CoreBPE, msg: &Message) -> usize { - msg_token_count_fn(msg, &|s| tokenizer.encode_with_special_tokens(s).len()) -} - -/// Detect context window overflow errors from the API. -/// Different providers phrase this differently; we check for common patterns. -/// OpenRouter wraps upstream errors, so we check both the wrapper and the raw message. -fn is_context_overflow(err: &anyhow::Error) -> bool { - let msg = err.to_string().to_lowercase(); - msg.contains("context length") - || msg.contains("token limit") - || msg.contains("too many tokens") - || msg.contains("maximum context") - || msg.contains("prompt is too long") - || msg.contains("request too large") - || msg.contains("input validation error") - || msg.contains("content length limit") - || (msg.contains("400") && msg.contains("tokens")) -} - -/// Detect model/provider errors delivered inside the SSE stream. -/// OpenRouter returns HTTP 200 but finish_reason="error" with -/// partial content (e.g. "system") — we surface this as an error -/// so the turn loop can retry. -fn is_stream_error(err: &anyhow::Error) -> bool { - err.to_string().contains("model stream error") -} - -/// Parse a message's timestamp field into a DateTime. -fn parse_msg_timestamp(msg: &Message) -> Option> { - msg.timestamp - .as_ref() - .and_then(|ts| DateTime::parse_from_rfc3339(ts).ok()) - .map(|dt| dt.with_timezone(&Utc)) -} /// Create a short summary of tool args for the tools pane header. fn summarize_args(tool_name: &str, args: &serde_json::Value) -> String { diff --git a/poc-agent/src/context.rs b/poc-agent/src/context.rs index 3c7e1c9..5437765 100644 --- a/poc-agent/src/context.rs +++ b/poc-agent/src/context.rs @@ -1,14 +1,52 @@ // context.rs — Context window building and management // // Pure functions for building the agent's context window from journal -// entries and conversation messages. No mutable state. +// entries and conversation messages. No mutable state — all functions +// take inputs and return new values. State mutation happens in agent.rs. use crate::journal; -use crate::types::{ContextPlan, ContextState, Message}; +use crate::types::*; use chrono::{DateTime, Utc}; use tiktoken_rs::CoreBPE; +/// Look up a model's context window size in tokens. +pub fn model_context_window(model: &str) -> usize { + let m = model.to_lowercase(); + if m.contains("opus") || m.contains("sonnet") { + 200_000 + } else if m.contains("qwen") { + 131_072 + } else { + 128_000 + } +} + +/// Context budget in tokens: 60% of the model's context window. +fn context_budget_tokens(model: &str) -> usize { + model_context_window(model) * 60 / 100 +} + +/// Allocation plan for the context window. +pub struct ContextPlan { + header_start: usize, + full_start: usize, + entry_count: usize, + conv_trim: usize, + _conv_count: usize, + _full_tokens: usize, + _header_tokens: usize, + _conv_tokens: usize, + _available: usize, +} + /// Build a context window from conversation messages + journal entries. +/// +/// Allocation strategy: identity and memory are fixed costs. The +/// remaining budget (minus 25% reserve for model output) is split +/// between journal and conversation. Conversation gets priority — +/// it's what's happening now. Journal fills the rest, newest first. +/// +/// Returns (messages, journal_text) — caller stores journal_text in ContextState. pub fn build_context_window( context: &ContextState, conversation: &[Message], @@ -17,44 +55,50 @@ pub fn build_context_window( ) -> (Vec, String) { let journal_path = journal::default_journal_path(); let all_entries = journal::parse_journal(&journal_path); - crate::dbglog!("[ctx] {} journal entries from {}", all_entries.len(), journal_path.display()); + dbglog!("[ctx] {} journal entries from {}", all_entries.len(), journal_path.display()); let count = |s: &str| tokenizer.encode_with_special_tokens(s).len(); let system_prompt = context.system_prompt.clone(); let context_message = context.render_context_message(); + // Cap memory to 50% of the context budget so conversation always + // gets space. Truncate at the last complete section boundary. let max_tokens = context_budget_tokens(model); let memory_cap = max_tokens / 2; let memory_tokens = count(&context_message); let context_message = if memory_tokens > memory_cap { - crate::dbglog!("[ctx] memory too large: {} tokens > {} cap, truncating", memory_tokens, memory_cap); + dbglog!("[ctx] memory too large: {} tokens > {} cap, truncating", memory_tokens, memory_cap); truncate_at_section(&context_message, memory_cap, &count) } else { context_message }; let recent_start = find_journal_cutoff(conversation, all_entries.last()); + dbglog!("[ctx] journal cutoff: {} of {} conversation messages are 'recent'", + conversation.len() - recent_start, conversation.len()); let recent = &conversation[recent_start..]; - let plan = plan_context(&system_prompt, &context_message, recent, &all_entries, model, &count); + let plan = plan_context( + &system_prompt, + &context_message, + recent, + &all_entries, + model, + &count, + ); + let journal_text = render_journal_text(&all_entries, &plan); - - let messages = assemble_context(system_prompt, context_message, &journal_text, recent, &plan); + dbglog!("[ctx] plan: header_start={} full_start={} entry_count={} conv_trim={} journal_text={} chars", + plan.header_start, plan.full_start, plan.entry_count, plan.conv_trim, journal_text.len()); + + let messages = assemble_context( + system_prompt, context_message, &journal_text, + recent, &plan, + ); (messages, journal_text) } -pub fn model_context_window(model: &str) -> usize { - let m = model.to_lowercase(); - if m.contains("opus") || m.contains("sonnet") { 200_000 } - else if m.contains("qwen") { 131_072 } - else { 128_000 } -} - -fn context_budget_tokens(model: &str) -> usize { - model_context_window(model) * 60 / 100 -} - -fn plan_context( +pub fn plan_context( system_prompt: &str, context_message: &str, recent: &[Message], @@ -63,40 +107,56 @@ fn plan_context( count: &dyn Fn(&str) -> usize, ) -> ContextPlan { let max_tokens = context_budget_tokens(model); + let identity_cost = count(system_prompt); let memory_cost = count(context_message); let reserve = max_tokens / 4; - let available = max_tokens.saturating_sub(identity_cost).saturating_sub(memory_cost).saturating_sub(reserve); + let available = max_tokens + .saturating_sub(identity_cost) + .saturating_sub(memory_cost) + .saturating_sub(reserve); let conv_costs: Vec = recent.iter().map(|m| msg_token_count_fn(m, count)).collect(); let total_conv: usize = conv_costs.iter().sum(); let journal_min = available * 15 / 100; let journal_budget = available.saturating_sub(total_conv).max(journal_min); + let full_budget = journal_budget * 70 / 100; let header_budget = journal_budget.saturating_sub(full_budget); + // Phase 1: Full entries (newest first) let mut full_used = 0; let mut n_full = 0; for entry in entries.iter().rev() { let cost = count(&entry.content) + 10; - if full_used + cost > full_budget { break; } + if full_used + cost > full_budget { + break; + } full_used += cost; n_full += 1; } let full_start = entries.len().saturating_sub(n_full); + // Phase 2: Header-only entries (continuing backward) let mut header_used = 0; let mut n_headers = 0; for entry in entries[..full_start].iter().rev() { - let first_line = entry.content.lines().find(|l| !l.trim().is_empty()).unwrap_or("(empty)"); + let first_line = entry + .content + .lines() + .find(|l| !l.trim().is_empty()) + .unwrap_or("(empty)"); let cost = count(first_line) + 10; - if header_used + cost > header_budget { break; } + if header_used + cost > header_budget { + break; + } header_used += cost; n_headers += 1; } let header_start = full_start.saturating_sub(n_headers); + // Trim oldest conversation if it exceeds budget let journal_used = full_used + header_used; let mut conv_trim = 0; let mut trimmed_conv = total_conv; @@ -104,34 +164,69 @@ fn plan_context( trimmed_conv -= conv_costs[conv_trim]; conv_trim += 1; } - while conv_trim < recent.len() && recent[conv_trim].role != crate::types::Role::User { + // Walk forward to user message boundary + while conv_trim < recent.len() && recent[conv_trim].role != Role::User { conv_trim += 1; } + dbglog!("[plan] model={} max_tokens={} available={} (identity={} memory={} reserve={})", + model, max_tokens, available, identity_cost, memory_cost, reserve); + dbglog!("[plan] conv: {} msgs, {} tokens total, trimming {} msgs → {} tokens", + recent.len(), total_conv, conv_trim, trimmed_conv); + dbglog!("[plan] journal: {} full entries ({}t) + {} headers ({}t)", + n_full, full_used, n_headers, header_used); + ContextPlan { - header_start, full_start, entry_count: entries.len(), conv_trim, - _conv_count: recent.len(), _full_tokens: full_used, _header_tokens: header_used, - _conv_tokens: trimmed_conv, _available: available, + header_start, + full_start, + entry_count: entries.len(), + conv_trim, + _conv_count: recent.len(), + _full_tokens: full_used, + _header_tokens: header_used, + _conv_tokens: trimmed_conv, + _available: available, } } -fn render_journal_text(entries: &[journal::JournalEntry], plan: &ContextPlan) -> String { - if plan.header_start >= plan.entry_count { return String::new(); } - +pub fn render_journal_text( + entries: &[journal::JournalEntry], + plan: &ContextPlan, +) -> String { + let has_journal = plan.header_start < plan.entry_count; + if !has_journal { + return String::new(); + } + let mut text = String::from("[Earlier in this conversation — from your journal]\n\n"); - + for entry in &entries[plan.header_start..plan.full_start] { - let first_line = entry.content.lines().find(|l| !l.trim().is_empty()).unwrap_or("(empty)"); - text.push_str(&format!("## {} — {}\n", entry.timestamp.format("%Y-%m-%dT%H:%M"), first_line)); + let first_line = entry + .content + .lines() + .find(|l| !l.trim().is_empty()) + .unwrap_or("(empty)"); + text.push_str(&format!( + "## {} — {}\n", + entry.timestamp.format("%Y-%m-%dT%H:%M"), + first_line, + )); } let n_headers = plan.full_start - plan.header_start; let n_full = plan.entry_count - plan.full_start; - if n_headers > 0 && n_full > 0 { text.push_str("\n---\n\n"); } + if n_headers > 0 && n_full > 0 { + text.push_str("\n---\n\n"); + } for entry in &entries[plan.full_start..] { - text.push_str(&format!("## {}\n\n{}\n\n", entry.timestamp.format("%Y-%m-%dT%H:%M"), entry.content)); + text.push_str(&format!( + "## {}\n\n{}\n\n", + entry.timestamp.format("%Y-%m-%dT%H:%M"), + entry.content + )); } + text } @@ -143,10 +238,12 @@ fn assemble_context( plan: &ContextPlan, ) -> Vec { let mut messages = vec![Message::system(system_prompt)]; - if !context_message.is_empty() { messages.push(Message::user(context_message)); } - + if !context_message.is_empty() { + messages.push(Message::user(context_message)); + } + let final_recent = &recent[plan.conv_trim..]; - + if !journal_text.is_empty() { messages.push(Message::user(journal_text.to_string())); } else if !final_recent.is_empty() { @@ -157,6 +254,7 @@ fn assemble_context( .to_string(), )); } + messages.extend(final_recent.iter().cloned()); messages } @@ -174,26 +272,42 @@ fn truncate_at_section(text: &str, max_tokens: usize, count: &dyn Fn(&str) -> us let mut best = 0; for &end in &boundaries[1..] { let slice = &text[..end]; - if count(slice) <= max_tokens { best = end; } - else { break; } + if count(slice) <= max_tokens { + best = end; + } else { + break; + } + } + + if best == 0 { + best = text.len().min(max_tokens * 3); } - if best == 0 { best = text.len().min(max_tokens * 3); } let truncated = &text[..best]; - crate::dbglog!("[ctx] truncated memory from {} to {} chars ({} tokens)", text.len(), truncated.len(), count(truncated)); + dbglog!("[ctx] truncated memory from {} to {} chars ({} tokens)", + text.len(), truncated.len(), count(truncated)); truncated.to_string() } -fn find_journal_cutoff(conversation: &[Message], newest_entry: Option<&journal::JournalEntry>) -> usize { - let cutoff = match newest_entry { Some(entry) => entry.timestamp, None => return 0 }; - +fn find_journal_cutoff( + conversation: &[Message], + newest_entry: Option<&journal::JournalEntry>, +) -> usize { + let cutoff = match newest_entry { + Some(entry) => entry.timestamp, + None => return 0, + }; + let mut split = conversation.len(); for (i, msg) in conversation.iter().enumerate() { if let Some(ts) = parse_msg_timestamp(msg) { - if ts > cutoff { split = i; break; } + if ts > cutoff { + split = i; + break; + } } } - while split > 0 && split < conversation.len() && conversation[split].role != crate::types::Role::User { + while split > 0 && split < conversation.len() && conversation[split].role != Role::User { split -= 1; } split @@ -201,32 +315,51 @@ fn find_journal_cutoff(conversation: &[Message], newest_entry: Option<&journal:: fn msg_token_count_fn(msg: &Message, count: &dyn Fn(&str) -> usize) -> usize { let content = msg.content.as_ref().map_or(0, |c| match c { - crate::types::MessageContent::Text(s) => count(s), - crate::types::MessageContent::Parts(parts) => parts.iter().map(|p| match p { - crate::types::ContentPart::Text { text } => count(text), - crate::types::ContentPart::ImageUrl { .. } => 85, - }).sum(), + MessageContent::Text(s) => count(s), + MessageContent::Parts(parts) => parts + .iter() + .map(|p| match p { + ContentPart::Text { text } => count(text), + ContentPart::ImageUrl { .. } => 85, + }) + .sum(), + }); + let tools = msg.tool_calls.as_ref().map_or(0, |calls| { + calls + .iter() + .map(|c| count(&c.function.arguments) + count(&c.function.name)) + .sum() }); - let tools = msg.tool_calls.as_ref().map_or(0, |calls| calls.iter().map(|c| count(&c.function.arguments) + count(&c.function.name)).sum()); content + tools } -fn parse_msg_timestamp(msg: &Message) -> Option> { - msg.timestamp.as_ref().and_then(|ts| DateTime::parse_from_rfc3339(ts).ok()).map(|dt| dt.with_timezone(&Utc)) +/// Count the token footprint of a message using BPE tokenization. +pub fn msg_token_count(tokenizer: &CoreBPE, msg: &Message) -> usize { + msg_token_count_fn(msg, &|s| tokenizer.encode_with_special_tokens(s).len()) } +/// Detect context window overflow errors from the API. pub fn is_context_overflow(err: &anyhow::Error) -> bool { let msg = err.to_string().to_lowercase(); - msg.contains("context length") || msg.contains("token limit") || msg.contains("too many tokens") - || msg.contains("maximum context") || msg.contains("prompt is too long") || msg.contains("request too large") - || msg.contains("input validation error") || msg.contains("content length limit") + msg.contains("context length") + || msg.contains("token limit") + || msg.contains("too many tokens") + || msg.contains("maximum context") + || msg.contains("prompt is too long") + || msg.contains("request too large") + || msg.contains("input validation error") + || msg.contains("content length limit") || (msg.contains("400") && msg.contains("tokens")) } +/// Detect model/provider errors delivered inside the SSE stream. pub fn is_stream_error(err: &anyhow::Error) -> bool { err.to_string().contains("model stream error") } -pub fn msg_token_count(tokenizer: &CoreBPE, msg: &Message) -> usize { - msg_token_count_fn(msg, &|s| tokenizer.encode_with_special_tokens(s).len()) +fn parse_msg_timestamp(msg: &Message) -> Option> { + msg.timestamp + .as_ref() + .and_then(|ts| DateTime::parse_from_rfc3339(ts).ok()) + .map(|dt| dt.with_timezone(&Utc)) } diff --git a/poc-agent/src/main.rs b/poc-agent/src/main.rs index 02e90cf..19b020d 100644 --- a/poc-agent/src/main.rs +++ b/poc-agent/src/main.rs @@ -37,6 +37,7 @@ mod agent; mod api; mod cli; mod config; +mod context; mod dmn; mod journal; mod log; @@ -66,13 +67,13 @@ use crate::ui_channel::{ContextInfo, StatusInfo, StreamTarget, UiMessage}; /// Hard compaction threshold — context is rebuilt immediately. /// Uses config percentage of model context window. fn compaction_threshold(model: &str, app: &AppConfig) -> u32 { - (agent::model_context_window(model) as u32) * app.compaction.hard_threshold_pct / 100 + (context::model_context_window(model) as u32) * app.compaction.hard_threshold_pct / 100 } /// Soft threshold — nudge the model to journal before compaction. /// Fires once; the hard threshold handles the actual rebuild. fn pre_compaction_threshold(model: &str, app: &AppConfig) -> u32 { - (agent::model_context_window(model) as u32) * app.compaction.soft_threshold_pct / 100 + (context::model_context_window(model) as u32) * app.compaction.soft_threshold_pct / 100 } #[tokio::main]