agent: send images as multi_modal_data on completion requests

Split the prompt assembly into two forms: the AST keeps the fully-expanded representation (N image_pads per image, for accurate context budget accounting), while the request wire form collapses each image to a single <|image_pad|> bookended by vision_start/end and ships the raw bytes out-of-band as a base64 data URI in a new `multi_modal_data.image` field on /v1/completions. vLLM's Qwen3VL processor uses PromptReplacement with target=single <|image_pad|> and replacement=N image_pads, so the wire form matches what the processor expects, and the processor re-expands it to N server-side. The server side still needs /v1/completions to accept multi_modal_data before images land end-to-end — that's the next piece. Co-Authored-By: Proof of Concept <poc@bcachefs.org>
2026-04-16 18:08:26 -04:00 · 2026-04-16 18:08:26 -04:00 · 204ba5570a
commit 204ba5570a
parent 91106deaa1
3 changed files with 115 additions and 5 deletions
--- a/src/agent/context.rs
+++ b/src/agent/context.rs
@ -884,6 +884,58 @@ impl Ast for ContextState {
    }
 }

+/// One image's payload, gathered from the AST while assembling a request body.
+///
+/// The AST keeps the pre-expanded token form (N image_pads) so budget accounting
+/// stays accurate; the wire form collapses each Image to one `<|image_pad|>`
+/// between vision bookends and ships the bytes separately as multi_modal_data.
+pub struct WireImage {
+    pub bytes: Vec<u8>, // image bytes, cloned from the `NodeBody::Image` leaf
+    pub mime: String, // MIME type cloned from the same leaf, e.g. "image/png"
+}
+
+fn wire_into(node: &AstNode, tokens: &mut Vec<u32>, images: &mut Vec<WireImage>) {
+    match node { // Append `node`'s wire form to `tokens`; image payloads go to `images`.
+        AstNode::Leaf(leaf) => match leaf.body() {
+            NodeBody::Image { bytes, mime, .. } => {
+                tokens.push(tokenizer::VISION_START);
+                tokens.push(tokenizer::IMAGE_PAD); // exactly one pad per image in wire form
+                tokens.push(tokenizer::VISION_END);
+                images.push(WireImage { // pushed in the same order as the pads in `tokens`
+                    bytes: bytes.clone(),
+                    mime: mime.clone(),
+                });
+            }
+            _ => tokens.extend_from_slice(leaf.token_ids()), // other leaves: ids copied unchanged
+        },
+        AstNode::Branch { role, children, .. } => {
+            tokens.push(tokenizer::IM_START); // frame: IM_START, "<role>\n", children, IM_END, "\n"
+            tokens.extend(tokenizer::encode(&format!("{}\n", role.as_str())));
+            for c in children {
+                wire_into(c, tokens, images); // recurse so nested nodes append in document order
+            }
+            tokens.push(tokenizer::IM_END); // NOTE(review): presumably mirrors token_ids() framing — confirm
+            tokens.extend(tokenizer::encode("\n"));
+        }
+    }
+}
+
+impl ContextState {
+    /// Builds the prompt in wire form. Returns the token stream, carrying one
+    /// `<|image_pad|>` per image (vLLM expands it back to N), and the images to send
+    /// as multi_modal_data, in the order their pads appear in the stream.
+    pub fn wire_prompt(&self) -> (Vec<u32>, Vec<WireImage>) {
+        let mut tokens = Vec::new();
+        let mut images = Vec::new();
+        for section in self.sections() { // visited in the order `sections()` yields them
+            for node in section {
+                wire_into(node, &mut tokens, &mut images); // appends to both accumulators
+            }
+        }
+        (tokens, images)
+    }
+}
+
 impl ContextState {
    fn section_mut(&mut self, section: Section) -> &mut Vec<AstNode> {
        match section {
@ -1531,6 +1583,34 @@ mod tests {
        assert!(rendered.ends_with("<|vision_end|>"));
    }

+    #[test]
+    fn test_wire_prompt_collapses_image_pads() {
+        let mut ctx = ContextState::new();
+        ctx.push_no_log(Section::Conversation, AstNode::branch(Role::User, vec![
+            AstNode::content("look:"),
+            AstNode::image(vec![0xDE, 0xAD], "image/png", 512, 512), // two dummy bytes as payload
+        ]));
+
+        // AST side: the full rendering carries N image_pads (plus bookends) for budget accounting.
+        let full = ctx.token_ids();
+        let n_image_pads_full = full.iter()
+            .filter(|&&t| t == tokenizer::IMAGE_PAD).count();
+        assert_eq!(n_image_pads_full, qwen3_image_token_count(512, 512) as usize);
+
+        // Wire side: exactly one image_pad remains and the payload moves to the images list.
+        let (wire, images) = ctx.wire_prompt();
+        let n_image_pads_wire = wire.iter()
+            .filter(|&&t| t == tokenizer::IMAGE_PAD).count();
+        assert_eq!(n_image_pads_wire, 1);
+        assert_eq!(images.len(), 1);
+        assert_eq!(images[0].bytes, vec![0xDE, 0xAD]); // bytes arrive unmodified
+        assert_eq!(images[0].mime, "image/png");
+
+        // The vision_start/vision_end bookends survive the collapse, once each.
+        assert_eq!(wire.iter().filter(|&&t| t == tokenizer::VISION_START).count(), 1);
+        assert_eq!(wire.iter().filter(|&&t| t == tokenizer::VISION_END).count(), 1);
+    }
+
    #[test]
    fn test_image_serde_roundtrip() {
        let node = AstNode::image(vec![0xDE, 0xAD, 0xBE, 0xEF], "image/png", 64, 64);