agent: send images as multi_modal_data on completion requests

Split the prompt assembly into two forms: the AST keeps the
fully-expanded representation (N image_pads per image, for accurate
context budget accounting), while the request wire form collapses
each image to a single <|image_pad|> bookended by vision_start/end
and ships the raw bytes out-of-band as a base64 data URI in a new
`multi_modal_data.image` field on /v1/completions.

vLLM's Qwen3VL processor uses PromptReplacement with target=single
<|image_pad|> and replacement=N image_pads, so the wire form matches
what the processor expects and is re-expanded to N pads server-side.

The server side needs /v1/completions to accept multi_modal_data for
images to land end-to-end — that's the next piece.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-04-16 18:08:26 -04:00
parent 91106deaa1
commit 204ba5570a
3 changed files with 115 additions and 5 deletions

View file

@ -78,18 +78,31 @@ impl ApiClient {
prompt_tokens: &[u32],
sampling: SamplingParams,
priority: Option<i32>,
) -> (mpsc::UnboundedReceiver<StreamToken>, AbortOnDrop) {
self.stream_completion_mm(prompt_tokens, &[], sampling, priority)
}
pub(crate) fn stream_completion_mm(
&self,
prompt_tokens: &[u32],
images: &[super::context::WireImage],
sampling: SamplingParams,
priority: Option<i32>,
) -> (mpsc::UnboundedReceiver<StreamToken>, AbortOnDrop) {
let (tx, rx) = mpsc::unbounded_channel();
let client = self.client.clone();
let api_key = self.api_key.clone();
let model = self.model.clone();
let prompt_tokens = prompt_tokens.to_vec();
let images: Vec<(Vec<u8>, String)> = images.iter()
.map(|i| (i.bytes.clone(), i.mime.clone()))
.collect();
let base_url = self.base_url.clone();
let handle = tokio::spawn(async move {
let result = stream_completions(
&client, &base_url, &api_key, &model,
&prompt_tokens, &tx, sampling, priority,
&prompt_tokens, &images, &tx, sampling, priority,
).await;
if let Err(e) = result {
let _ = tx.send(StreamToken::Error(e.to_string()));
@ -110,6 +123,7 @@ async fn stream_completions(
api_key: &str,
model: &str,
prompt_tokens: &[u32],
images: &[(Vec<u8>, String)],
tx: &mpsc::UnboundedSender<StreamToken>,
sampling: SamplingParams,
priority: Option<i32>,
@ -126,6 +140,14 @@ async fn stream_completions(
"skip_special_tokens": false,
"stop_token_ids": [super::tokenizer::IM_END],
});
if !images.is_empty() {
use base64::Engine;
let b64 = base64::engine::general_purpose::STANDARD;
let uris: Vec<String> = images.iter()
.map(|(bytes, mime)| format!("data:{};base64,{}", mime, b64.encode(bytes)))
.collect();
request["multi_modal_data"] = serde_json::json!({ "image": uris });
}
if let Some(p) = priority {
request["priority"] = serde_json::json!(p);
}