From 10c8878f1c2bd7d15d28126c1445e774fe63356b Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Fri, 24 Apr 2026 22:36:10 -0400 Subject: [PATCH] agent: bump tonic gRPC message caps to 64 MiB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The default 4 MiB cap on encoded/decoded messages is too small for the multimodal Generate path: Qwen3.6-VL high-res patches put 5–8 MiB of pre-encoded image bytes inline in a single Generate request, and Done events carrying full per-token readout vectors can also exceed 4 MiB on long runs. Hit "ResourceExhausted: Received message larger than max (5799108 vs. 4194304)" from the salience server. Bump both encode and decode caps on every cloned SalienceClient. The matching server-side bump is in vllm/entrypoints/salience/server.py. Co-Authored-By: Proof of Concept --- src/agent/api/mod.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/agent/api/mod.rs b/src/agent/api/mod.rs index 5705d89..fc8a358 100644 --- a/src/agent/api/mod.rs +++ b/src/agent/api/mod.rs @@ -117,6 +117,12 @@ impl ApiClient { /// the channel on first call and reuses it thereafter across /// every ApiClient clone. All scoring / inference / session /// RPCs flow through this single multiplexed HTTP/2 connection. + /// + /// Bumps tonic's default 4 MiB encode/decode caps to 64 MiB on + /// every client. Multimodal Generate requests carry pre-encoded + /// image bytes inline (Qwen3.6's 768×768 patches at high res + /// land around 5–8 MiB per turn), and Done events with full + /// per-token readout vectors can also exceed 4 MiB on long runs. pub async fn salience_client(&self) -> Result< salience::pb::salience_client::SalienceClient > { @@ -127,7 +133,10 @@ impl ApiClient { self.base_url, grpc_url); salience::connect_channel(&grpc_url).await }).await?; - Ok(salience::pb::salience_client::SalienceClient::new(ch.clone())) + const MAX_GRPC_MESSAGE_BYTES: usize = 64 * 1024 * 1024; + Ok(salience::pb::salience_client::SalienceClient::new(ch.clone()) + .max_decoding_message_size(MAX_GRPC_MESSAGE_BYTES) + .max_encoding_message_size(MAX_GRPC_MESSAGE_BYTES)) } /// Stream generation via a gRPC session. Walks the prompt chunks