From c7b0052f1d6c097ff12b20fd26fd7d22b4b4ced9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 18 Apr 2026 12:59:30 -0400 Subject: [PATCH] agent: kill no_compact, add pre-send size check in assemble_prompt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related fixes for last night's crash diagnosis: 1. Kill AgentState::no_compact. The reasoning ("forked agents shouldn't compact because it blows the KV cache prefix") wasn't worth the cost — forks with no compact recovery just *died* on any oversize prompt, with no fallback. The KV cache invalidation is a performance loss; failing the request entirely is a correctness loss. Remove the flag, let every agent's overflow- retry path call compact() up to 2 times. 2. Add pre-send size check in Agent::assemble_prompt. If the context has grown past budget (context_window * 80%) since the last compact — accumulation between turns, a fork assembling more than expected, etc. — trim_conversation() is called before wire_prompt. Since we tokenize client-side, we already know the exact count, so there's no reason to round-trip an oversize request to vLLM and get rejected. Together these prevent the failure mode from last night: a subconscious/unconscious agent's prompt exceeded max_model_len, vLLM returned 400, agent had no_compact=true so it couldn't recover, request failed. Now: the trim happens before send, so the request rarely hits the 400 path at all; and if it somehow does, compact+retry works for every agent. Also adds ContextState::total_tokens() as the cheap pre-send budget check. 
Co-Authored-By: Proof of Concept --- src/agent/context.rs | 10 ++++++++++ src/agent/mod.rs | 40 ++++++++++++++++++++-------------------- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/src/agent/context.rs b/src/agent/context.rs index cbb667b..00c1ea5 100644 --- a/src/agent/context.rs +++ b/src/agent/context.rs @@ -1096,6 +1096,16 @@ impl ContextState { self.section_mut(section).clear(); } + /// Total tokens across every section that gets serialized into the prompt. + /// Cheap sum over cached `node.tokens()`; call this before assembling to + /// decide whether to trim. + pub fn total_tokens(&self) -> usize { + self.system().iter().map(|n| n.tokens()).sum::() + + self.identity().iter().map(|n| n.tokens()).sum::() + + self.journal().iter().map(|n| n.tokens()).sum::() + + self.conversation().iter().map(|n| n.tokens()).sum::() + } + /// Dedup and trim conversation entries to fit within the context budget. /// /// Phase 1: Drop duplicate memories (keep last) and DMN entries. diff --git a/src/agent/mod.rs b/src/agent/mod.rs index f8ebb24..2c3a98a 100644 --- a/src/agent/mod.rs +++ b/src/agent/mod.rs @@ -182,9 +182,6 @@ pub struct AgentState { /// vLLM scheduling priority (lower = higher priority). /// 0 = interactive, 1 = surface agent, 2 = other subconscious, 10 = unconscious. pub priority: Option, - /// Forked agents should not compact on overflow — it blows the - /// KV cache prefix and evicts the step prompts. 
- pub no_compact: bool, pub changed: Arc<tokio::sync::Notify>, } @@ -246,7 +243,6 @@ impl Agent { generation: 0, active_tools, priority: Some(0), - no_compact: false, changed: Arc::new(tokio::sync::Notify::new()), }), }); @@ -315,7 +311,6 @@ impl Agent { generation: 0, active_tools: tools::ActiveTools::new(), priority: None, - no_compact: true, changed: Arc::new(tokio::sync::Notify::new()), }), }) @@ -328,8 +323,18 @@ impl Agent { /// Assemble a ready-to-send prompt: token stream in wire form (each /// image collapsed to a single `<|image_pad|>`) paired with the /// images to attach as multi_modal_data. + /// + /// Pre-send size check: if the context has grown past budget since the + /// last compact (accumulation between turns, a fork's context getting + /// bigger than expected, etc.), trim here rather than letting vLLM + /// reject the request. Client-side tokenization means we already know + /// the exact token count so there's no reason to round-trip an + /// oversize request. pub async fn assemble_prompt(&self) -> (Vec, Vec) { - let ctx = self.context.lock().await; + let mut ctx = self.context.lock().await; + if ctx.total_tokens() > context::context_budget_tokens() { + ctx.trim_conversation(); + } let st = self.state.lock().await; let (mut tokens, images, _) = ctx.wire_prompt(0..ctx.conversation().len(), |_| false); @@ -451,21 +456,16 @@ impl Agent { // Check for stream/parse errors match parser_handle.await { Ok(Err(e)) => { - if context::is_context_overflow(&e) { - if agent.state.lock().await.no_compact { - return Err(e); - } - if overflow_retries < 2 { - overflow_retries += 1; - let msg = format!("context overflow — compacting ({}/2)", overflow_retries); - match &overflow_activity { - Some(a) => a.update(&msg).await, - None => overflow_activity = Some( - start_activity(&agent, &msg).await), - } - agent.compact().await; - continue; + if context::is_context_overflow(&e) && overflow_retries < 2 { + overflow_retries += 1; + let msg = format!("context overflow — compacting ({}/2)", 
overflow_retries); + match &overflow_activity { + Some(a) => a.update(&msg).await, + None => overflow_activity = Some( + start_activity(&agent, &msg).await), } + agent.compact().await; + continue; } return Err(e); }