agent: kill no_compact, add pre-send size check in assemble_prompt

Two related fixes from last night's crash diagnosis:

1. Kill AgentState::no_compact. The reasoning ("forked agents
   shouldn't compact because it blows the KV cache prefix") wasn't
   worth the cost — forks with no compact recovery just *died* on
   any oversize prompt, with no fallback. The KV cache invalidation
   is a performance loss; failing the request entirely is a
   correctness loss. Remove the flag and let every agent's
   overflow-retry path call compact() up to 2 times.

2. Add pre-send size check in Agent::assemble_prompt. If the
   context has grown past budget (context_window * 80%) since the
   last compact — accumulation between turns, a fork assembling
   more than expected, etc. — trim_conversation() is called before
   wire_prompt. Since we tokenize client-side, we already know the
   exact count, so there's no reason to round-trip an oversize
   request to vLLM and get rejected.
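
For reference, a minimal sketch of the budget helper: the diff below
calls context::context_budget_tokens() but doesn't define it, so the
window size and wiring here are assumptions; only the 80% ratio comes
from this message.

    /// Hypothetical sketch; the real implementation lives in the
    /// context module and is not shown in this diff.
    /// Assumed: max_model_len as configured for the vLLM server.
    fn context_window_tokens() -> usize {
        32_768
    }

    /// Prompt budget: 80% of the context window, leaving the rest
    /// as headroom for generation.
    pub fn context_budget_tokens() -> usize {
        context_window_tokens() * 80 / 100
    }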

Together these prevent the failure mode from last night: a
subconscious/unconscious agent's prompt exceeded max_model_len,
vLLM returned 400, agent had no_compact=true so it couldn't
recover, request failed. Now: the trim happens before send, so
the request rarely hits the 400 path at all; and if it somehow
does, compact+retry works for every agent.
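
Likewise, is_context_overflow() is only called in the diff, not
defined there. A plausible sketch, assuming the vLLM 400 surfaces
through the error's Display output (the matched strings are
assumptions based on vLLM's usual overflow wording):

    /// Hypothetical sketch; the real check is not part of this diff.
    /// Assumes vLLM's 400 body reaches us via the error's Display
    /// output, e.g. "This model's maximum context length is ...".
    pub fn is_context_overflow(err: &impl std::fmt::Display) -> bool {
        let msg = err.to_string();
        msg.contains("maximum context length") || msg.contains("max_model_len")
    }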

Also add ContextState::total_tokens() as the cheap pre-send
budget check.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
Kent Overstreet 2026-04-18 12:59:30 -04:00
parent 0592c5f78d
commit c7b0052f1d
2 changed files with 30 additions and 20 deletions

@@ -1096,6 +1096,16 @@ impl ContextState
         self.section_mut(section).clear();
     }
 
+    /// Total tokens across every section that gets serialized into the prompt.
+    /// Cheap sum over cached `node.tokens()`; call this before assembling to
+    /// decide whether to trim.
+    pub fn total_tokens(&self) -> usize {
+        self.system().iter().map(|n| n.tokens()).sum::<usize>()
+            + self.identity().iter().map(|n| n.tokens()).sum::<usize>()
+            + self.journal().iter().map(|n| n.tokens()).sum::<usize>()
+            + self.conversation().iter().map(|n| n.tokens()).sum::<usize>()
+    }
+
     /// Dedup and trim conversation entries to fit within the context budget.
     ///
     /// Phase 1: Drop duplicate memories (keep last) and DMN entries.

@@ -182,9 +182,6 @@ pub struct AgentState
     /// vLLM scheduling priority (lower = higher priority).
     /// 0 = interactive, 1 = surface agent, 2 = other subconscious, 10 = unconscious.
     pub priority: Option<i32>,
-    /// Forked agents should not compact on overflow — it blows the
-    /// KV cache prefix and evicts the step prompts.
-    pub no_compact: bool,
     pub changed: Arc<tokio::sync::Notify>,
 }
@@ -246,7 +243,6 @@ impl Agent
             generation: 0,
             active_tools,
             priority: Some(0),
-            no_compact: false,
             changed: Arc::new(tokio::sync::Notify::new()),
         }),
     });
@@ -315,7 +311,6 @@ impl Agent
             generation: 0,
             active_tools: tools::ActiveTools::new(),
             priority: None,
-            no_compact: true,
             changed: Arc::new(tokio::sync::Notify::new()),
         }),
     })
@@ -328,8 +323,18 @@ impl Agent
     /// Assemble a ready-to-send prompt: token stream in wire form (each
     /// image collapsed to a single `<|image_pad|>`) paired with the
     /// images to attach as multi_modal_data.
+    ///
+    /// Pre-send size check: if the context has grown past budget since the
+    /// last compact (accumulation between turns, a fork's context getting
+    /// bigger than expected, etc.), trim here rather than letting vLLM
+    /// reject the request. Client-side tokenization means we already know
+    /// the exact token count, so there's no reason to round-trip an
+    /// oversize request.
     pub async fn assemble_prompt(&self) -> (Vec<u32>, Vec<context::WireImage>) {
-        let ctx = self.context.lock().await;
+        let mut ctx = self.context.lock().await;
+        if ctx.total_tokens() > context::context_budget_tokens() {
+            ctx.trim_conversation();
+        }
         let st = self.state.lock().await;
         let (mut tokens, images, _) =
             ctx.wire_prompt(0..ctx.conversation().len(), |_| false);
@@ -451,21 +456,16 @@ impl Agent
             // Check for stream/parse errors
             match parser_handle.await {
                 Ok(Err(e)) => {
-                    if context::is_context_overflow(&e) {
-                        if agent.state.lock().await.no_compact {
-                            return Err(e);
-                        }
-                        if overflow_retries < 2 {
-                            overflow_retries += 1;
-                            let msg = format!("context overflow — compacting ({}/2)", overflow_retries);
-                            match &overflow_activity {
-                                Some(a) => a.update(&msg).await,
-                                None => overflow_activity = Some(
-                                    start_activity(&agent, &msg).await),
-                            }
-                            agent.compact().await;
-                            continue;
-                        }
-                    }
+                    if context::is_context_overflow(&e) && overflow_retries < 2 {
+                        overflow_retries += 1;
+                        let msg = format!("context overflow — compacting ({}/2)", overflow_retries);
+                        match &overflow_activity {
+                            Some(a) => a.update(&msg).await,
+                            None => overflow_activity = Some(
+                                start_activity(&agent, &msg).await),
+                        }
+                        agent.compact().await;
+                        continue;
+                    }
                     return Err(e);
                 }