agent: send images as multi_modal_data on completion requests

Split the prompt assembly into two forms: the AST keeps the
fully-expanded representation (N image_pads per image, for accurate
context budget accounting), while the request wire form collapses
each image to a single <|image_pad|> bookended by vision_start/end
and ships the raw bytes out-of-band as a base64 data URI in a new
`multi_modal_data.image` field on /v1/completions.

vLLM's Qwen3VL processor uses PromptReplacement with target=single
<|image_pad|> and replacement=N image_pads, so the wire form matches
what the processor expects and is re-expanded to N pads server-side.

The server side needs /v1/completions to accept multi_modal_data for
images to land end-to-end — that's the next piece.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-04-16 18:08:26 -04:00
parent 91106deaa1
commit 204ba5570a
3 changed files with 115 additions and 5 deletions

View file

@ -78,18 +78,31 @@ impl ApiClient {
prompt_tokens: &[u32],
sampling: SamplingParams,
priority: Option<i32>,
) -> (mpsc::UnboundedReceiver<StreamToken>, AbortOnDrop) {
self.stream_completion_mm(prompt_tokens, &[], sampling, priority)
}
pub(crate) fn stream_completion_mm(
&self,
prompt_tokens: &[u32],
images: &[super::context::WireImage],
sampling: SamplingParams,
priority: Option<i32>,
) -> (mpsc::UnboundedReceiver<StreamToken>, AbortOnDrop) {
let (tx, rx) = mpsc::unbounded_channel();
let client = self.client.clone();
let api_key = self.api_key.clone();
let model = self.model.clone();
let prompt_tokens = prompt_tokens.to_vec();
let images: Vec<(Vec<u8>, String)> = images.iter()
.map(|i| (i.bytes.clone(), i.mime.clone()))
.collect();
let base_url = self.base_url.clone();
let handle = tokio::spawn(async move {
let result = stream_completions(
&client, &base_url, &api_key, &model,
&prompt_tokens, &tx, sampling, priority,
&prompt_tokens, &images, &tx, sampling, priority,
).await;
if let Err(e) = result {
let _ = tx.send(StreamToken::Error(e.to_string()));
@ -110,6 +123,7 @@ async fn stream_completions(
api_key: &str,
model: &str,
prompt_tokens: &[u32],
images: &[(Vec<u8>, String)],
tx: &mpsc::UnboundedSender<StreamToken>,
sampling: SamplingParams,
priority: Option<i32>,
@ -126,6 +140,14 @@ async fn stream_completions(
"skip_special_tokens": false,
"stop_token_ids": [super::tokenizer::IM_END],
});
if !images.is_empty() {
use base64::Engine;
let b64 = base64::engine::general_purpose::STANDARD;
let uris: Vec<String> = images.iter()
.map(|(bytes, mime)| format!("data:{};base64,{}", mime, b64.encode(bytes)))
.collect();
request["multi_modal_data"] = serde_json::json!({ "image": uris });
}
if let Some(p) = priority {
request["priority"] = serde_json::json!(p);
}