From 0bf71b91101c644d03721442b4bfea242ff67a7e Mon Sep 17 00:00:00 2001
From: Kent Overstreet
Date: Thu, 16 Apr 2026 18:00:10 -0400
Subject: [PATCH] agent: add NodeBody::Image for Qwen3-VL vision input
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Images are rendered as `<|vision_start|>` + N × `<|image_pad|>` +
`<|vision_end|>` where N is computed from the image dimensions using
Qwen3-VL's smart_resize rules (patch_size=16, merge_size=2, min=64K,
max=16M pixels). The token count matches what vLLM will produce at
request time, so budget accounting stays accurate.

Bytes are stored inline on the leaf and base64-encoded in the JSON form.
Token IDs are hand-assembled instead of re-running the tokenizer on a
potentially-huge placeholder string.

Follow-ups: view_image tool rewrite, multi_modal_data on the vLLM
request, API-layer plumbing from leaf bytes to request body.

Co-Authored-By: Proof of Concept
---
 src/agent/context.rs   | 223 +++++++++++++++++++++++++++++++++++++----
 src/agent/tokenizer.rs |   3 +
 src/user/chat.rs       |   5 +
 3 files changed, 211 insertions(+), 20 deletions(-)

diff --git a/src/agent/context.rs b/src/agent/context.rs
index 37dbf48..57b2c7a 100644
--- a/src/agent/context.rs
+++ b/src/agent/context.rs
@@ -81,10 +81,33 @@ pub enum NodeBody {
     Memory { key: String, text: String, score: Option<f32> },
     Dmn(String),
 
+    // Vision input — rendered as <|vision_start|> <|image_pad|>×N <|vision_end|>.
+    // `token_count` is N, the count vLLM will compute for this image's grid.
+    Image {
+        #[serde(with = "b64_bytes")]
+        bytes: Vec<u8>,
+        mime: String,
+        orig_height: u32,
+        orig_width: u32,
+        token_count: u32,
+    },
+
     // Non-visible (0 tokens in prompt)
     Log(String),
 }
 
+mod b64_bytes {
+    use base64::{Engine, engine::general_purpose::STANDARD};
+    use serde::{Serializer, Deserializer, Deserialize};
+    pub fn serialize<S: Serializer>(bytes: &[u8], s: S) -> Result<S::Ok, S::Error> {
+        s.serialize_str(&STANDARD.encode(bytes))
+    }
+    pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result<Vec<u8>, D::Error> {
+        let s = String::deserialize(d)?;
+        STANDARD.decode(s).map_err(serde::de::Error::custom)
+    }
+}
+
 /// A leaf node: typed content with cached token IDs.
 /// Token IDs are not serialized — they're recomputed on deserialization.
 #[derive(Debug, Clone, Serialize)]
@@ -103,11 +126,7 @@ impl<'de> Deserialize<'de> for NodeLeaf {
             timestamp: DateTime<Utc>,
         }
         let raw = Raw::deserialize(deserializer)?;
-        let token_ids = if raw.body.is_prompt_visible() {
-            tokenizer::encode(&raw.body.render())
-        } else {
-            vec![]
-        };
+        let token_ids = raw.body.compute_token_ids();
        Ok(NodeLeaf { body: raw.body, token_ids, timestamp: raw.timestamp })
     }
 }
@@ -221,6 +240,13 @@ impl NodeBody {
                 out.push_str(text);
                 out.push_str("<|im_end|>\n");
             }
+            Self::Image { token_count, .. } => {
+                out.push_str("<|vision_start|>");
+                for _ in 0..*token_count {
+                    out.push_str("<|image_pad|>");
+                }
+                out.push_str("<|vision_end|>");
+            }
         }
     }
 
@@ -235,6 +261,26 @@ impl NodeBody {
         !matches!(self, Self::Thinking(_) | Self::Log(_))
     }
 
+    /// Hand-assemble token IDs for body types where running the tokenizer
+    /// on the rendered text would be needlessly expensive (Image). Falls
+    /// back to encoding the rendered text for everything else.
+    fn compute_token_ids(&self) -> Vec<u32> {
+        if !self.is_prompt_visible() {
+            return Vec::new();
+        }
+        match self {
+            Self::Image { token_count, .. } => {
+                let mut ids = Vec::with_capacity(*token_count as usize + 2);
+                ids.push(tokenizer::VISION_START);
+                ids.extend(std::iter::repeat(tokenizer::IMAGE_PAD)
+                    .take(*token_count as usize));
+                ids.push(tokenizer::VISION_END);
+                ids
+            }
+            _ => tokenizer::encode(&self.render()),
+        }
+    }
+
     /// The text content of this leaf (for display, not rendering).
     pub fn text(&self) -> &str {
         match self {
@@ -242,17 +288,14 @@ impl NodeBody {
             | Self::ToolResult(t) | Self::Dmn(t) => t,
             Self::ToolCall { name, .. } => name,
             Self::Memory { text, .. } => text,
+            Self::Image { mime, .. } => mime,
         }
     }
 }
 
 impl NodeLeaf {
     fn new(body: NodeBody) -> Self {
-        let token_ids = if body.is_prompt_visible() {
-            tokenizer::encode(&body.render())
-        } else {
-            vec![]
-        };
+        let token_ids = body.compute_token_ids();
         Self { body, token_ids, timestamp: Utc::now() }
     }
 
@@ -305,6 +348,24 @@ impl AstNode {
         Self::Leaf(NodeLeaf::new(NodeBody::Log(text.into())))
     }
 
+    /// Build an Image leaf. `token_count` is computed from the image
+    /// dimensions using Qwen3-VL's resizing rules.
+    pub fn image(
+        bytes: Vec<u8>,
+        mime: impl Into<String>,
+        orig_height: u32,
+        orig_width: u32,
+    ) -> Self {
+        let token_count = qwen3_image_token_count(orig_height, orig_width);
+        Self::Leaf(NodeLeaf::new(NodeBody::Image {
+            bytes,
+            mime: mime.into(),
+            orig_height,
+            orig_width,
+            token_count,
+        }))
+    }
+
     // -- Branch constructors --------------------------------------------------
 
     pub fn branch(role: Role, children: Vec<AstNode>) -> Self {
@@ -334,11 +395,7 @@ impl AstNode {
     pub fn retokenize(self) -> Self {
         match self {
             Self::Leaf(leaf) => {
-                let token_ids = if leaf.body.is_prompt_visible() {
-                    tokenizer::encode(&leaf.body.render())
-                } else {
-                    vec![]
-                };
+                let token_ids = leaf.body.compute_token_ids();
                 Self::Leaf(NodeLeaf { token_ids, ..leaf })
             }
             Self::Branch { role, children, timestamp, memory_scores } => Self::Branch {
@@ -397,6 +454,8 @@ impl AstNode {
                 None => format!("mem: {}", key),
             },
             NodeBody::Dmn(_) => "dmn".into(),
+            NodeBody::Image { orig_height, orig_width, token_count, .. } =>
+                format!("image: {}x{} ({} tokens)", orig_width, orig_height, token_count),
             NodeBody::Log(t) => format!("log: {}", truncate_preview(t, 60)),
         },
     }
@@ -857,11 +916,7 @@ impl ContextState {
         let node = &mut nodes[index];
         match node {
             AstNode::Leaf(leaf) => {
-                let token_ids = if body.is_prompt_visible() {
-                    tokenizer::encode(&body.render())
-                } else {
-                    vec![]
-                };
+                let token_ids = body.compute_token_ids();
                 leaf.body = body;
                 leaf.token_ids = token_ids;
             }
@@ -991,6 +1046,58 @@ impl ContextState {
     }
 }
 
+// ---------------------------------------------------------------------------
+// Qwen3-VL image token count
+//
+// Port of Qwen2VLImageProcessor.smart_resize + image_token_count. We need the
+// exact same answer that vLLM's Qwen3VL processor will produce, because the
+// token stream in our context must match what vLLM expands `<|image_pad|>`
+// to at request time. Constants come from Qwen3.5-27B's preprocessor_config.
+// ---------------------------------------------------------------------------
+
+const QWEN3_PATCH_SIZE: u32 = 16;
+const QWEN3_MERGE_SIZE: u32 = 2;
+const QWEN3_MIN_PIXELS: u64 = 65_536;
+const QWEN3_MAX_PIXELS: u64 = 16_777_216;
+
+fn smart_resize(h: u32, w: u32, factor: u32, min_pixels: u64, max_pixels: u64) -> (u32, u32) {
+    let max_s = h.max(w) as f64;
+    let min_s = h.min(w) as f64;
+    assert!(max_s / min_s <= 200.0, "aspect ratio too extreme: {}x{}", h, w);
+
+    let fh = h as f64;
+    let fw = w as f64;
+    let ff = factor as f64;
+
+    let h_bar = ((fh / ff).round() as u32) * factor;
+    let w_bar = ((fw / ff).round() as u32) * factor;
+    let total = (h_bar as u64) * (w_bar as u64);
+
+    if total > max_pixels {
+        let beta = ((fh * fw) / max_pixels as f64).sqrt();
+        let hf = ((fh / beta / ff).floor() as u32) * factor;
+        let wf = ((fw / beta / ff).floor() as u32) * factor;
+        (hf.max(factor), wf.max(factor))
+    } else if total < min_pixels {
+        let beta = (min_pixels as f64 / (fh * fw)).sqrt();
+        let hc = ((fh * beta / ff).ceil() as u32) * factor;
+        let wc = ((fw * beta / ff).ceil() as u32) * factor;
+        (hc, wc)
+    } else {
+        (h_bar, w_bar)
+    }
+}
+
+/// Compute how many `<|image_pad|>` tokens vLLM will emit for an image of
+/// the given dimensions. Matches Qwen3VL's feature-size calculation exactly:
+///     (grid_h * grid_w) / merge_size^2
+/// where (grid_h, grid_w) = resized dims / patch_size.
+fn qwen3_image_token_count(orig_h: u32, orig_w: u32) -> u32 {
+    let factor = QWEN3_PATCH_SIZE * QWEN3_MERGE_SIZE;
+    let (rh, rw) = smart_resize(orig_h, orig_w, factor, QWEN3_MIN_PIXELS, QWEN3_MAX_PIXELS);
+    (rh / QWEN3_PATCH_SIZE) * (rw / QWEN3_PATCH_SIZE) / (QWEN3_MERGE_SIZE * QWEN3_MERGE_SIZE)
+}
+
 pub fn context_window() -> usize {
     let app = crate::config::app();
     app.backends.get(&app.default_backend)
@@ -1370,6 +1477,82 @@ mod tests {
         assert!(serde_json::from_str::<AstNode>(json).is_err());
     }
 
+    // -- Image leaf tests ---------------------------------------------------------
+
+    #[test]
+    fn test_smart_resize_within_bounds() {
+        // Typical case: 1024x768 → rounded to multiples of 32, under max.
+        let (h, w) = smart_resize(768, 1024, 32, 65_536, 16_777_216);
+        assert_eq!(h, 768);
+        assert_eq!(w, 1024);
+    }
+
+    #[test]
+    fn test_smart_resize_upscales_tiny() {
+        // 32x32 = 1024 pixels, below min_pixels=65536. Should scale up.
+        let (h, w) = smart_resize(32, 32, 32, 65_536, 16_777_216);
+        assert!((h as u64) * (w as u64) >= 65_536,
+            "resized {}x{} is under min_pixels", h, w);
+        assert_eq!(h % 32, 0);
+        assert_eq!(w % 32, 0);
+    }
+
+    #[test]
+    fn test_smart_resize_downscales_huge() {
+        // 8000x6000 = 48M pixels, above max_pixels=16M. Should scale down.
+        let (h, w) = smart_resize(8000, 6000, 32, 65_536, 16_777_216);
+        assert!((h as u64) * (w as u64) <= 16_777_216,
+            "resized {}x{} exceeds max_pixels", h, w);
+        assert_eq!(h % 32, 0);
+        assert_eq!(w % 32, 0);
+    }
+
+    #[test]
+    fn test_qwen3_token_count_matches_formula() {
+        // 512x512 → resized to 512x512 (already multiple of 32, within bounds).
+        // grid = 32x32, tokens = 32*32/4 = 256.
+        assert_eq!(qwen3_image_token_count(512, 512), 256);
+    }
+
+    #[test]
+    fn test_image_render_and_token_ids() {
+        let node = AstNode::image(vec![0u8, 1, 2, 3], "image/png", 512, 512);
+        let leaf = node.leaf().unwrap();
+        // 2 bookend tokens + 256 image_pad tokens
+        assert_eq!(leaf.token_ids().len(), 258);
+        assert_eq!(leaf.token_ids()[0], tokenizer::VISION_START);
+        assert_eq!(leaf.token_ids()[257], tokenizer::VISION_END);
+        for pad in &leaf.token_ids()[1..257] {
+            assert_eq!(*pad, tokenizer::IMAGE_PAD);
+        }
+        // Rendered text has the expected bookends.
+        let rendered = leaf.body().render();
+        assert!(rendered.starts_with("<|vision_start|>"));
+        assert!(rendered.ends_with("<|vision_end|>"));
+    }
+
+    #[test]
+    fn test_image_serde_roundtrip() {
+        let node = AstNode::image(vec![0xDE, 0xAD, 0xBE, 0xEF], "image/png", 64, 64);
+        let json = serde_json::to_string(&node).unwrap();
+        // bytes must be base64-encoded in the JSON form
+        assert!(json.contains("3q2+7w=="));
+        let back: AstNode = serde_json::from_str(&json).unwrap();
+        let leaf = back.leaf().unwrap();
+        match leaf.body() {
+            NodeBody::Image { bytes, mime, orig_height, orig_width, token_count } => {
+                assert_eq!(bytes, &[0xDE, 0xAD, 0xBE, 0xEF]);
+                assert_eq!(mime, "image/png");
+                assert_eq!(*orig_height, 64);
+                assert_eq!(*orig_width, 64);
+                assert_eq!(*token_count, qwen3_image_token_count(64, 64));
+            }
+            other => panic!("expected Image, got {:?}", other),
+        }
+        // token_ids are recomputed on deserialization
+        assert_eq!(leaf.token_ids().len(), leaf.tokens());
+    }
+
     #[test]
     fn test_timestamp_present_accepted() {
         let json = r#"{"Leaf":{"body":{"Content":"hi"},"timestamp":"2026-04-16T12:00:00Z"}}"#;
diff --git a/src/agent/tokenizer.rs b/src/agent/tokenizer.rs
index 85ac823..cd0acaf 100644
--- a/src/agent/tokenizer.rs
+++ b/src/agent/tokenizer.rs
@@ -16,6 +16,9 @@ static TOKENIZER: OnceLock<Tokenizer> = OnceLock::new();
 /// Special token IDs for Qwen 3.5
 pub const IM_START: u32 = 248045;
 pub const IM_END: u32 = 248046;
+pub const VISION_START: u32 = 248053;
+pub const VISION_END: u32 = 248054;
+pub const IMAGE_PAD: u32 = 248056;
 
 /// Initialize the global tokenizer from a file path.
 /// Call once at startup. Panics if the file can't be loaded.
diff --git a/src/user/chat.rs b/src/user/chat.rs
index 47c5d56..fe3db5b 100644
--- a/src/user/chat.rs
+++ b/src/user/chat.rs
@@ -486,6 +486,11 @@ impl InteractScreen {
                 if t.is_empty() { vec![] }
                 else { vec![(PaneTarget::ToolResult, text, Marker::None)] }
             }
+            NodeBody::Image { orig_height, orig_width, .. } => {
+                vec![(PaneTarget::Conversation,
+                    format!("[image {}x{}]", orig_width, orig_height),
+                    Marker::None)]
+            }
         }
     }
     AstNode::Branch { role, children, .. } => {