agent: add NodeBody::Image for Qwen3-VL vision input
Images are rendered as `<|vision_start|>` + N × `<|image_pad|>` + `<|vision_end|>` where N is computed from the image dimensions using Qwen3-VL's smart_resize rules (patch_size=16, merge_size=2, min=64K, max=16M pixels). The token count matches what vLLM will produce at request time, so budget accounting stays accurate. Bytes are stored inline on the leaf and base64-encoded in the JSON form. Token IDs are hand-assembled instead of re-running the tokenizer on a potentially-huge placeholder string. Follow-ups: view_image tool rewrite, multi_modal_data on the vLLM request, API-layer plumbing from leaf bytes to request body. Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
parent
592a3e2e52
commit
0bf71b9110
3 changed files with 211 additions and 20 deletions
|
|
@ -81,10 +81,33 @@ pub enum NodeBody {
|
|||
Memory { key: String, text: String, score: Option<f64> },
|
||||
Dmn(String),
|
||||
|
||||
// Vision input — rendered as <|vision_start|> <|image_pad|>×N <|vision_end|>.
|
||||
// `token_count` is N, the count vLLM will compute for this image's grid.
|
||||
Image {
|
||||
#[serde(with = "b64_bytes")]
|
||||
bytes: Vec<u8>,
|
||||
mime: String,
|
||||
orig_height: u32,
|
||||
orig_width: u32,
|
||||
token_count: u32,
|
||||
},
|
||||
|
||||
// Non-visible (0 tokens in prompt)
|
||||
Log(String),
|
||||
}
|
||||
|
||||
mod b64_bytes {
|
||||
use base64::{Engine, engine::general_purpose::STANDARD};
|
||||
use serde::{Serializer, Deserializer, Deserialize};
|
||||
pub fn serialize<S: Serializer>(bytes: &[u8], s: S) -> Result<S::Ok, S::Error> {
|
||||
s.serialize_str(&STANDARD.encode(bytes))
|
||||
}
|
||||
pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result<Vec<u8>, D::Error> {
|
||||
let s = String::deserialize(d)?;
|
||||
STANDARD.decode(s).map_err(serde::de::Error::custom)
|
||||
}
|
||||
}
|
||||
|
||||
/// A leaf node: typed content with cached token IDs.
|
||||
/// Token IDs are not serialized — they're recomputed on deserialization.
|
||||
#[derive(Debug, Clone, Serialize)]
|
||||
|
|
@ -103,11 +126,7 @@ impl<'de> Deserialize<'de> for NodeLeaf {
|
|||
timestamp: DateTime<Utc>,
|
||||
}
|
||||
let raw = Raw::deserialize(deserializer)?;
|
||||
let token_ids = if raw.body.is_prompt_visible() {
|
||||
tokenizer::encode(&raw.body.render())
|
||||
} else {
|
||||
vec![]
|
||||
};
|
||||
let token_ids = raw.body.compute_token_ids();
|
||||
Ok(NodeLeaf { body: raw.body, token_ids, timestamp: raw.timestamp })
|
||||
}
|
||||
}
|
||||
|
|
@ -221,6 +240,13 @@ impl NodeBody {
|
|||
out.push_str(text);
|
||||
out.push_str("<|im_end|>\n");
|
||||
}
|
||||
Self::Image { token_count, .. } => {
|
||||
out.push_str("<|vision_start|>");
|
||||
for _ in 0..*token_count {
|
||||
out.push_str("<|image_pad|>");
|
||||
}
|
||||
out.push_str("<|vision_end|>");
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -235,6 +261,26 @@ impl NodeBody {
|
|||
!matches!(self, Self::Thinking(_) | Self::Log(_))
|
||||
}
|
||||
|
||||
/// Hand-assemble token IDs for body types where running the tokenizer
|
||||
/// on the rendered text would be needlessly expensive (Image). Falls
|
||||
/// back to encoding the rendered text for everything else.
|
||||
fn compute_token_ids(&self) -> Vec<u32> {
|
||||
if !self.is_prompt_visible() {
|
||||
return Vec::new();
|
||||
}
|
||||
match self {
|
||||
Self::Image { token_count, .. } => {
|
||||
let mut ids = Vec::with_capacity(*token_count as usize + 2);
|
||||
ids.push(tokenizer::VISION_START);
|
||||
ids.extend(std::iter::repeat(tokenizer::IMAGE_PAD)
|
||||
.take(*token_count as usize));
|
||||
ids.push(tokenizer::VISION_END);
|
||||
ids
|
||||
}
|
||||
_ => tokenizer::encode(&self.render()),
|
||||
}
|
||||
}
|
||||
|
||||
/// The text content of this leaf (for display, not rendering).
|
||||
pub fn text(&self) -> &str {
|
||||
match self {
|
||||
|
|
@ -242,17 +288,14 @@ impl NodeBody {
|
|||
| Self::ToolResult(t) | Self::Dmn(t) => t,
|
||||
Self::ToolCall { name, .. } => name,
|
||||
Self::Memory { text, .. } => text,
|
||||
Self::Image { mime, .. } => mime,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl NodeLeaf {
|
||||
fn new(body: NodeBody) -> Self {
|
||||
let token_ids = if body.is_prompt_visible() {
|
||||
tokenizer::encode(&body.render())
|
||||
} else {
|
||||
vec![]
|
||||
};
|
||||
let token_ids = body.compute_token_ids();
|
||||
Self { body, token_ids, timestamp: Utc::now() }
|
||||
}
|
||||
|
||||
|
|
@ -305,6 +348,24 @@ impl AstNode {
|
|||
Self::Leaf(NodeLeaf::new(NodeBody::Log(text.into())))
|
||||
}
|
||||
|
||||
/// Build an Image leaf. `token_count` is computed from the image
|
||||
/// dimensions using Qwen3-VL's resizing rules.
|
||||
pub fn image(
|
||||
bytes: Vec<u8>,
|
||||
mime: impl Into<String>,
|
||||
orig_height: u32,
|
||||
orig_width: u32,
|
||||
) -> Self {
|
||||
let token_count = qwen3_image_token_count(orig_height, orig_width);
|
||||
Self::Leaf(NodeLeaf::new(NodeBody::Image {
|
||||
bytes,
|
||||
mime: mime.into(),
|
||||
orig_height,
|
||||
orig_width,
|
||||
token_count,
|
||||
}))
|
||||
}
|
||||
|
||||
// -- Branch constructors --------------------------------------------------
|
||||
|
||||
pub fn branch(role: Role, children: Vec<AstNode>) -> Self {
|
||||
|
|
@ -334,11 +395,7 @@ impl AstNode {
|
|||
pub fn retokenize(self) -> Self {
|
||||
match self {
|
||||
Self::Leaf(leaf) => {
|
||||
let token_ids = if leaf.body.is_prompt_visible() {
|
||||
tokenizer::encode(&leaf.body.render())
|
||||
} else {
|
||||
vec![]
|
||||
};
|
||||
let token_ids = leaf.body.compute_token_ids();
|
||||
Self::Leaf(NodeLeaf { token_ids, ..leaf })
|
||||
}
|
||||
Self::Branch { role, children, timestamp, memory_scores } => Self::Branch {
|
||||
|
|
@ -397,6 +454,8 @@ impl AstNode {
|
|||
None => format!("mem: {}", key),
|
||||
},
|
||||
NodeBody::Dmn(_) => "dmn".into(),
|
||||
NodeBody::Image { orig_height, orig_width, token_count, .. } =>
|
||||
format!("image: {}x{} ({} tokens)", orig_width, orig_height, token_count),
|
||||
NodeBody::Log(t) => format!("log: {}", truncate_preview(t, 60)),
|
||||
},
|
||||
}
|
||||
|
|
@ -857,11 +916,7 @@ impl ContextState {
|
|||
let node = &mut nodes[index];
|
||||
match node {
|
||||
AstNode::Leaf(leaf) => {
|
||||
let token_ids = if body.is_prompt_visible() {
|
||||
tokenizer::encode(&body.render())
|
||||
} else {
|
||||
vec![]
|
||||
};
|
||||
let token_ids = body.compute_token_ids();
|
||||
leaf.body = body;
|
||||
leaf.token_ids = token_ids;
|
||||
}
|
||||
|
|
@ -991,6 +1046,58 @@ impl ContextState {
|
|||
}
|
||||
}
|
||||
|
||||
// ---------------------------------------------------------------------------
|
||||
// Qwen3-VL image token count
|
||||
//
|
||||
// Port of Qwen2VLImageProcessor.smart_resize + image_token_count. We need the
|
||||
// exact same answer that vLLM's Qwen3VL processor will produce, because the
|
||||
// token stream in our context must match what vLLM expands `<|image_pad|>`
|
||||
// to at request time. Constants come from Qwen3.5-27B's preprocessor_config.
|
||||
// ---------------------------------------------------------------------------
|
||||
|
||||
// Each ViT patch covers 16x16 pixels.
const QWEN3_PATCH_SIZE: u32 = 16;
// Spatial merge: a 2x2 block of patches collapses into one <|image_pad|> token.
const QWEN3_MERGE_SIZE: u32 = 2;
// Pixel-count bounds enforced by smart_resize: 65_536 = 256^2, 16_777_216 = 4096^2.
// NOTE(review): all four values are stated to come from the model's
// preprocessor_config — re-verify if the deployed model changes.
const QWEN3_MIN_PIXELS: u64 = 65_536;
const QWEN3_MAX_PIXELS: u64 = 16_777_216;
|
||||
|
||||
/// Round to the nearest integer, ties to even (banker's rounding).
///
/// Python's built-in `round` — which the HF/vLLM Qwen image processor
/// uses in smart_resize — rounds half-to-even, while Rust's `f64::round`
/// rounds half-away-from-zero. E.g. 2.5 must round to 2 (Python), not 3
/// (Rust), or our token count can disagree with vLLM's whenever a
/// dimension is an exact half-multiple of the resize factor.
fn round_half_to_even(x: f64) -> f64 {
    debug_assert!(x >= 0.0, "only defined for non-negative inputs here");
    let lower = x.floor();
    match x - lower {
        f if f > 0.5 => lower + 1.0,
        f if f < 0.5 => lower,
        _ if (lower as u64) % 2 == 0 => lower,
        _ => lower + 1.0,
    }
}

/// Rescale (h, w) so both dimensions are multiples of `factor` and the
/// total pixel count lands in [min_pixels, max_pixels], mirroring Qwen's
/// smart_resize exactly.
///
/// # Panics
/// If either dimension is zero, or the aspect ratio exceeds 200:1 —
/// matching the upstream processor, which rejects such images outright.
fn smart_resize(h: u32, w: u32, factor: u32, min_pixels: u64, max_pixels: u64) -> (u32, u32) {
    // Guard explicitly: a zero dimension previously fell through to a
    // division producing inf/NaN and a misleading aspect-ratio panic.
    assert!(h > 0 && w > 0, "degenerate image dimensions: {}x{}", h, w);
    let max_s = h.max(w) as f64;
    let min_s = h.min(w) as f64;
    assert!(max_s / min_s <= 200.0, "aspect ratio too extreme: {}x{}", h, w);

    let fh = h as f64;
    let fw = w as f64;
    let ff = factor as f64;

    // Python round() semantics, not Rust's — see round_half_to_even.
    let h_bar = (round_half_to_even(fh / ff) as u32) * factor;
    let w_bar = (round_half_to_even(fw / ff) as u32) * factor;
    let total = (h_bar as u64) * (w_bar as u64);

    if total > max_pixels {
        // Shrink uniformly, flooring so we can never overshoot max_pixels.
        let beta = ((fh * fw) / max_pixels as f64).sqrt();
        let hf = ((fh / beta / ff).floor() as u32) * factor;
        let wf = ((fw / beta / ff).floor() as u32) * factor;
        // Clamp to one factor unit so an extreme ratio can't floor to zero.
        (hf.max(factor), wf.max(factor))
    } else if total < min_pixels {
        // Grow uniformly, ceiling so we always reach min_pixels.
        let beta = (min_pixels as f64 / (fh * fw)).sqrt();
        let hc = ((fh * beta / ff).ceil() as u32) * factor;
        let wc = ((fw * beta / ff).ceil() as u32) * factor;
        (hc, wc)
    } else {
        (h_bar, w_bar)
    }
}
|
||||
|
||||
/// Compute how many `<|image_pad|>` tokens vLLM will emit for an image of
|
||||
/// the given dimensions. Matches Qwen3VL's feature-size calculation exactly:
|
||||
/// (grid_h * grid_w) / merge_size^2
|
||||
/// where (grid_h, grid_w) = resized dims / patch_size.
|
||||
fn qwen3_image_token_count(orig_h: u32, orig_w: u32) -> u32 {
|
||||
let factor = QWEN3_PATCH_SIZE * QWEN3_MERGE_SIZE;
|
||||
let (rh, rw) = smart_resize(orig_h, orig_w, factor, QWEN3_MIN_PIXELS, QWEN3_MAX_PIXELS);
|
||||
(rh / QWEN3_PATCH_SIZE) * (rw / QWEN3_PATCH_SIZE) / (QWEN3_MERGE_SIZE * QWEN3_MERGE_SIZE)
|
||||
}
|
||||
|
||||
pub fn context_window() -> usize {
|
||||
let app = crate::config::app();
|
||||
app.backends.get(&app.default_backend)
|
||||
|
|
@ -1370,6 +1477,82 @@ mod tests {
|
|||
assert!(serde_json::from_str::<AstNode>(json).is_err());
|
||||
}
|
||||
|
||||
// -- Image leaf tests ---------------------------------------------------------
|
||||
|
||||
#[test]
fn test_smart_resize_within_bounds() {
    // 1024x768 is already factor-aligned and inside the pixel budget, so
    // smart_resize must return it unchanged.
    assert_eq!(smart_resize(768, 1024, 32, 65_536, 16_777_216), (768, 1024));
}
||||
|
||||
#[test]
fn test_smart_resize_upscales_tiny() {
    // 32x32 is only 1024 pixels, far below min_pixels = 65536 — it must
    // be scaled up past the floor while staying factor-aligned.
    let (h, w) = smart_resize(32, 32, 32, 65_536, 16_777_216);
    let pixels = u64::from(h) * u64::from(w);
    assert!(pixels >= 65_536, "resized {}x{} is under min_pixels", h, w);
    assert_eq!((h % 32, w % 32), (0, 0));
}
|
||||
|
||||
#[test]
fn test_smart_resize_downscales_huge() {
    // 8000x6000 is 48M pixels, over max_pixels = 16M — it must shrink
    // under the ceiling while staying factor-aligned.
    let (h, w) = smart_resize(8000, 6000, 32, 65_536, 16_777_216);
    let pixels = u64::from(h) * u64::from(w);
    assert!(pixels <= 16_777_216, "resized {}x{} exceeds max_pixels", h, w);
    assert_eq!((h % 32, w % 32), (0, 0));
}
|
||||
|
||||
#[test]
fn test_qwen3_token_count_matches_formula() {
    // 512x512 needs no resizing (multiple of 32, within bounds). The
    // grid is 32x32 patches, merged 2x2: 32 * 32 / 4 = 256 tokens.
    let expected = (512 / 16) * (512 / 16) / (2 * 2);
    assert_eq!(qwen3_image_token_count(512, 512), expected);
}
|
||||
|
||||
#[test]
fn test_image_render_and_token_ids() {
    let node = AstNode::image(vec![0u8, 1, 2, 3], "image/png", 512, 512);
    let leaf = node.leaf().unwrap();
    let ids = leaf.token_ids();

    // vision_start + 256 image_pad + vision_end = 258 IDs total.
    assert_eq!(ids.len(), 258);
    assert_eq!(ids.first(), Some(&tokenizer::VISION_START));
    assert_eq!(ids.last(), Some(&tokenizer::VISION_END));
    assert!(ids[1..257].iter().all(|&id| id == tokenizer::IMAGE_PAD));

    // The rendered text carries the matching bookends.
    let rendered = leaf.body().render();
    assert!(rendered.starts_with("<|vision_start|>"));
    assert!(rendered.ends_with("<|vision_end|>"));
}
|
||||
|
||||
#[test]
fn test_image_serde_roundtrip() {
    let node = AstNode::image(vec![0xDE, 0xAD, 0xBE, 0xEF], "image/png", 64, 64);
    let json = serde_json::to_string(&node).unwrap();
    // bytes must be base64-encoded in the JSON form
    // ("3q2+7w==" is the standard-alphabet base64 of DE AD BE EF).
    assert!(json.contains("3q2+7w=="));
    let back: AstNode = serde_json::from_str(&json).unwrap();
    let leaf = back.leaf().unwrap();
    match leaf.body() {
        // Every stored field must survive the round trip, including the
        // precomputed token_count (which is serialized, not rederived).
        NodeBody::Image { bytes, mime, orig_height, orig_width, token_count } => {
            assert_eq!(bytes, &[0xDE, 0xAD, 0xBE, 0xEF]);
            assert_eq!(mime, "image/png");
            assert_eq!(*orig_height, 64);
            assert_eq!(*orig_width, 64);
            assert_eq!(*token_count, qwen3_image_token_count(64, 64));
        }
        other => panic!("expected Image, got {:?}", other),
    }
    // token_ids are recomputed on deserialization
    assert_eq!(leaf.token_ids().len(), leaf.tokens());
}
|
||||
|
||||
#[test]
|
||||
fn test_timestamp_present_accepted() {
|
||||
let json = r#"{"Leaf":{"body":{"Content":"hi"},"timestamp":"2026-04-16T12:00:00Z"}}"#;
|
||||
|
|
|
|||
|
|
@ -16,6 +16,9 @@ static TOKENIZER: OnceLock<Tokenizer> = OnceLock::new();
|
|||
/// Special token IDs for Qwen 3.5
pub const IM_START: u32 = 248045;
pub const IM_END: u32 = 248046;
// Vision bookends plus the per-merged-patch placeholder, hand-assembled
// into Image leaves instead of round-tripping through the tokenizer.
// NOTE(review): IDs are hard-coded from the tokenizer vocab — confirm
// against tokenizer.json if the model or tokenizer is ever swapped.
pub const VISION_START: u32 = 248053;
pub const VISION_END: u32 = 248054;
pub const IMAGE_PAD: u32 = 248056;
|
||||
|
||||
/// Initialize the global tokenizer from a file path.
|
||||
/// Call once at startup. Panics if the file can't be loaded.
|
||||
|
|
|
|||
|
|
@ -486,6 +486,11 @@ impl InteractScreen {
|
|||
if t.is_empty() { vec![] }
|
||||
else { vec![(PaneTarget::ToolResult, text, Marker::None)] }
|
||||
}
|
||||
NodeBody::Image { orig_height, orig_width, .. } => {
|
||||
vec![(PaneTarget::Conversation,
|
||||
format!("[image {}x{}]", orig_width, orig_height),
|
||||
Marker::None)]
|
||||
}
|
||||
}
|
||||
}
|
||||
AstNode::Branch { role, children, .. } => {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue