diff --git a/src/agent/context.rs b/src/agent/context.rs
index cbb667b..00c1ea5 100644
--- a/src/agent/context.rs
+++ b/src/agent/context.rs
@@ -1096,6 +1096,16 @@ impl ContextState {
         self.section_mut(section).clear();
     }
 
+    /// Total tokens across every section that gets serialized into the prompt.
+    /// Cheap sum over cached `node.tokens()`; call this before assembling to
+    /// decide whether to trim.
+    pub fn total_tokens(&self) -> usize {
+        self.system().iter().map(|n| n.tokens()).sum::<usize>()
+            + self.identity().iter().map(|n| n.tokens()).sum::<usize>()
+            + self.journal().iter().map(|n| n.tokens()).sum::<usize>()
+            + self.conversation().iter().map(|n| n.tokens()).sum::<usize>()
+    }
+
     /// Dedup and trim conversation entries to fit within the context budget.
     ///
     /// Phase 1: Drop duplicate memories (keep last) and DMN entries.
diff --git a/src/agent/mod.rs b/src/agent/mod.rs
index f8ebb24..2c3a98a 100644
--- a/src/agent/mod.rs
+++ b/src/agent/mod.rs
@@ -182,9 +182,6 @@ pub struct AgentState {
     /// vLLM scheduling priority (lower = higher priority).
     /// 0 = interactive, 1 = surface agent, 2 = other subconscious, 10 = unconscious.
     pub priority: Option<usize>,
-    /// Forked agents should not compact on overflow — it blows the
-    /// KV cache prefix and evicts the step prompts.
-    pub no_compact: bool,
     pub changed: Arc<tokio::sync::Notify>,
 }
 
@@ -246,7 +243,6 @@ impl Agent {
                 generation: 0,
                 active_tools,
                 priority: Some(0),
-                no_compact: false,
                 changed: Arc::new(tokio::sync::Notify::new()),
             }),
         });
@@ -315,8 +311,7 @@ impl Agent {
                 generation: 0,
                 active_tools: tools::ActiveTools::new(),
                 priority: None,
-                no_compact: true,
                 changed: Arc::new(tokio::sync::Notify::new()),
            }),
        })
    }
@@ -328,8 +323,18 @@
     /// Assemble a ready-to-send prompt: token stream in wire form (each
     /// image collapsed to a single `<|image_pad|>`) paired with the
     /// images to attach as multi_modal_data.
+    ///
+    /// Pre-send size check: if the context has grown past budget since the
+    /// last compact (accumulation between turns, a fork's context getting
+    /// bigger than expected, etc.), trim here rather than letting vLLM
+    /// reject the request. Client-side tokenization means we already know
+    /// the exact token count, so there's no reason to round-trip an
+    /// oversize request.
     pub async fn assemble_prompt(&self) -> (Vec<u32>, Vec<Image>) {
-        let ctx = self.context.lock().await;
+        let mut ctx = self.context.lock().await;
+        if ctx.total_tokens() > context::context_budget_tokens() {
+            ctx.trim_conversation();
+        }
         let st = self.state.lock().await;
         let (mut tokens, images, _) =
             ctx.wire_prompt(0..ctx.conversation().len(), |_| false);
@@ -451,21 +456,16 @@ impl Agent {
            // Check for stream/parse errors
            match parser_handle.await {
                Ok(Err(e)) => {
-                    if context::is_context_overflow(&e) {
-                        if agent.state.lock().await.no_compact {
-                            return Err(e);
-                        }
-                        if overflow_retries < 2 {
-                            overflow_retries += 1;
-                            let msg = format!("context overflow — compacting ({}/2)", overflow_retries);
-                            match &overflow_activity {
-                                Some(a) => a.update(&msg).await,
-                                None => overflow_activity = Some(
-                                    start_activity(&agent, &msg).await),
-                            }
-                            agent.compact().await;
-                            continue;
-                        }
+                    if context::is_context_overflow(&e) && overflow_retries < 2 {
+                        overflow_retries += 1;
+                        let msg = format!("context overflow — compacting ({}/2)", overflow_retries);
+                        match &overflow_activity {
+                            Some(a) => a.update(&msg).await,
+                            None => overflow_activity = Some(
+                                start_activity(&agent, &msg).await),
+                        }
+                        agent.compact().await;
+                        continue;
                    }
                    return Err(e);
                }
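
For reference, a self-contained model of the pre-send budget check this patch adds to assemble_prompt. Everything here is a hypothetical stand-in: Node, Ctx, BUDGET, and trim_oldest play the roles of the repo's ContextState, context_budget_tokens(), and trim_conversation() (whose real Phase 1 also dedups before trimming).

// Sketch only. Toy stand-ins: Node ~ a context node with a cached token
// count; trim_oldest ~ trim_conversation(); BUDGET ~ context_budget_tokens().
struct Node {
    tokens: usize,
}

struct Ctx {
    conversation: Vec<Node>,
}

const BUDGET: usize = 32_768; // assumed budget, not the repo's real value

impl Ctx {
    // Mirrors total_tokens(): a cheap sum over cached per-node counts,
    // with no re-tokenization on the hot path.
    fn total_tokens(&self) -> usize {
        self.conversation.iter().map(|n| n.tokens).sum::<usize>()
    }

    // Mirrors the assemble_prompt() pre-send step: shed oldest entries
    // until the prompt fits, instead of letting the server reject it.
    fn trim_oldest(&mut self) {
        while self.total_tokens() > BUDGET && !self.conversation.is_empty() {
            self.conversation.remove(0);
        }
    }
}

fn main() {
    let mut ctx = Ctx {
        conversation: (0..10).map(|_| Node { tokens: 5_000 }).collect(),
    };
    assert!(ctx.total_tokens() > BUDGET); // 50_000 > 32_768: would overflow
    ctx.trim_oldest();
    assert!(ctx.total_tokens() <= BUDGET); // 30_000 after dropping 4 nodes
}

The cached per-node counts are what make it reasonable to run this check unconditionally on every send: it is just integer adds over nodes already in memory.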
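
The retry path still keys off context::is_context_overflow(&e), which the patch does not show. A plausible sketch, assuming the backend is vLLM's OpenAI-compatible server and that overflow surfaces as an error message mentioning the model's maximum context length (the exact wording is an assumption, so match loosely):

// Assumption: overflow surfaces as an error string mentioning context
// length. The exact server wording is not guaranteed across versions.
fn looks_like_context_overflow(msg: &str) -> bool {
    let m = msg.to_ascii_lowercase();
    m.contains("maximum context length") || m.contains("context length exceeded")
}

fn main() {
    assert!(looks_like_context_overflow(
        "This model's maximum context length is 32768 tokens. \
         However, you requested 40000 tokens in the prompt."
    ));
    assert!(!looks_like_context_overflow("connection reset by peer"));
}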
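
With no_compact gone, every agent, forks included, gets the same policy: compact and retry at most twice, then propagate the error. A toy model of that control flow, with drive standing in for the step loop and halving standing in for compact():

// Self-contained model of the retry policy after this change: at most two
// compact-and-retry rounds before the overflow error propagates. The
// halving step is a stand-in for the real async compact() call.
fn drive(mut prompt_tokens: usize, budget: usize) -> Result<usize, &'static str> {
    let mut overflow_retries = 0;
    loop {
        if prompt_tokens <= budget {
            return Ok(prompt_tokens); // generation succeeded
        }
        if overflow_retries >= 2 {
            return Err("context overflow"); // give up, propagate
        }
        overflow_retries += 1;
        prompt_tokens /= 2; // compact() stand-in: shrink the context
    }
}

fn main() {
    assert_eq!(drive(20_000, 32_768), Ok(20_000)); // fits on the first try
    assert_eq!(drive(100_000, 32_768), Ok(25_000)); // two compactions needed
    assert_eq!(drive(500_000, 32_768), Err("context overflow")); // still too big
}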