From c7b0052f1d6c097ff12b20fd26fd7d22b4b4ced9 Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Sat, 18 Apr 2026 12:59:30 -0400 Subject: [PATCH] agent: kill no_compact, add pre-send size check in assemble_prompt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two related fixes for last night's crash diagnosis: 1. Kill AgentState::no_compact. The reasoning ("forked agents shouldn't compact because it blows the KV cache prefix") wasn't worth the cost — forks with no compact recovery just *died* on any oversize prompt, with no fallback. The KV cache invalidation is a performance loss; failing the request entirely is a correctness loss. Remove the flag, let every agent's overflow- retry path call compact() up to 2 times. 2. Add pre-send size check in Agent::assemble_prompt. If the context has grown past budget (context_window * 80%) since the last compact — accumulation between turns, a fork assembling more than expected, etc. — trim_conversation() is called before wire_prompt. Since we tokenize client-side, we already know the exact count, so there's no reason to round-trip an oversize request to vLLM and get rejected. Together these prevent the failure mode from last night: a subconscious/unconscious agent's prompt exceeded max_model_len, vLLM returned 400, agent had no_compact=true so it couldn't recover, request failed. Now: the trim happens before send, so the request rarely hits the 400 path at all; and if it somehow does, compact+retry works for every agent. Also adds ContextState::total_tokens() as the cheap pre-send budget check. 
Co-Authored-By: Proof of Concept --- src/agent/context.rs | 10 ++++++++++ src/agent/mod.rs | 40 ++++++++++++++++++++-------------------- 2 files changed, 30 insertions(+), 20 deletions(-) diff --git a/src/agent/context.rs b/src/agent/context.rs index cbb667b..00c1ea5 100644 --- a/src/agent/context.rs +++ b/src/agent/context.rs @@ -1096,6 +1096,16 @@ impl ContextState { self.section_mut(section).clear(); } + /// Total tokens across every section that gets serialized into the prompt. + /// Cheap sum over cached `node.tokens()`; call this before assembling to + /// decide whether to trim. + pub fn total_tokens(&self) -> usize { + self.system().iter().map(|n| n.tokens()).sum::() + + self.identity().iter().map(|n| n.tokens()).sum::() + + self.journal().iter().map(|n| n.tokens()).sum::() + + self.conversation().iter().map(|n| n.tokens()).sum::() + } + /// Dedup and trim conversation entries to fit within the context budget. /// /// Phase 1: Drop duplicate memories (keep last) and DMN entries. diff --git a/src/agent/mod.rs b/src/agent/mod.rs index f8ebb24..2c3a98a 100644 --- a/src/agent/mod.rs +++ b/src/agent/mod.rs @@ -182,9 +182,6 @@ pub struct AgentState { /// vLLM scheduling priority (lower = higher priority). /// 0 = interactive, 1 = surface agent, 2 = other subconscious, 10 = unconscious. pub priority: Option, - /// Forked agents should not compact on overflow — it blows the - /// KV cache prefix and evicts the step prompts. 
- pub no_compact: bool, pub changed: Arc<tokio::sync::Notify>, } @@ -246,7 +243,6 @@ impl Agent { generation: 0, active_tools, priority: Some(0), - no_compact: false, changed: Arc::new(tokio::sync::Notify::new()), }), }); @@ -315,7 +311,6 @@ impl Agent { generation: 0, active_tools: tools::ActiveTools::new(), priority: None, - no_compact: true, changed: Arc::new(tokio::sync::Notify::new()), }), }) @@ -328,8 +323,18 @@ impl Agent { /// Assemble a ready-to-send prompt: token stream in wire form (each /// image collapsed to a single `<|image_pad|>`) paired with the /// images to attach as multi_modal_data. + /// + /// Pre-send size check: if the context has grown past budget since the + /// last compact (accumulation between turns, a fork's context getting + /// bigger than expected, etc.), trim here rather than letting vLLM + /// reject the request. Client-side tokenization means we already know + /// the exact token count so there's no reason to round-trip an + /// oversize request. pub async fn assemble_prompt(&self) -> (Vec, Vec) { - let ctx = self.context.lock().await; + let mut ctx = self.context.lock().await; + if ctx.total_tokens() > context::context_budget_tokens() { + ctx.trim_conversation(); + } let st = self.state.lock().await; let (mut tokens, images, _) = ctx.wire_prompt(0..ctx.conversation().len(), |_| false); @@ -451,21 +456,16 @@ impl Agent { // Check for stream/parse errors match parser_handle.await { Ok(Err(e)) => { - if context::is_context_overflow(&e) { - if agent.state.lock().await.no_compact { - return Err(e); - } - if overflow_retries < 2 { - overflow_retries += 1; - let msg = format!("context overflow — compacting ({}/2)", overflow_retries); - match &overflow_activity { - Some(a) => a.update(&msg).await, - None => overflow_activity = Some( - start_activity(&agent, &msg).await), - } - agent.compact().await; - continue; + if context::is_context_overflow(&e) && overflow_retries < 2 { + overflow_retries += 1; + let msg = format!("context overflow — compacting ({}/2)", 
overflow_retries); + match &overflow_activity { + Some(a) => a.update(&msg).await, + None => overflow_activity = Some( + start_activity(&agent, &msg).await), } + agent.compact().await; + continue; } return Err(e); }