From 91106deaa12233aeab38d84644e13de5b97d9dda Mon Sep 17 00:00:00 2001 From: Kent Overstreet Date: Thu, 16 Apr 2026 18:06:25 -0400 Subject: [PATCH] agent: rewrite view_image to emit Image leaves MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit view_image now reads the file, grabs dimensions via imagesize (no full decode), and pushes a user-role branch containing a NodeBody::Image leaf straight into the conversation. The tool_result is just a short acknowledgment — the actual pixels ride in the Image leaf for the API layer to extract into multi_modal_data. Drops the capture_tmux_pane path, which had no business living under "vision" (tmux text capture belongs in bash or a dedicated tool, and this one just returned rendered text anyway). Co-Authored-By: Proof of Concept --- Cargo.lock | 7 +++ Cargo.toml | 1 + src/agent/tools/mod.rs | 8 +-- src/agent/tools/vision.rs | 104 ++++++++++++++------------------------ 4 files changed, 48 insertions(+), 72 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index dfca607..c76a7cd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -492,6 +492,7 @@ dependencies = [ "http-body-util", "hyper", "hyper-util", + "imagesize", "json-five", "libc", "log", @@ -1423,6 +1424,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "imagesize" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09e54e57b4c48b40f7aec75635392b12b3421fa26fe8b4332e63138ed278459c" + [[package]] name = "indexmap" version = "2.14.0" diff --git a/Cargo.toml b/Cargo.toml index 7cdf851..0996f94 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,6 +68,7 @@ hyper-util = { version = "0.1", features = ["tokio"], default-features = false } http-body-util = "0.1" bytes = "1" base64 = "0.22" +imagesize = "0.14" rustls = "0.23" tokio-rustls = "0.26" diff --git a/src/agent/tools/mod.rs b/src/agent/tools/mod.rs index f72b015..8904fc3 100644 --- a/src/agent/tools/mod.rs +++ b/src/agent/tools/mod.rs @@ -242,13 +242,7 @@ pub fn summarize_args(tool_name: &str, args: &serde_json::Value) -> String { .as_str() .unwrap_or("") .to_string(), - "view_image" => { - if let Some(pane) = args["pane_id"].as_str() { - format!("pane {}", pane) - } else { - args["file_path"].as_str().unwrap_or("").to_string() - } - } + "view_image" => args["file_path"].as_str().unwrap_or("").to_string(), "journal" => { let entry = args["entry"].as_str().unwrap_or(""); if entry.len() > 60 { diff --git a/src/agent/tools/vision.rs b/src/agent/tools/vision.rs index 83559f6..0e36888 100644 --- a/src/agent/tools/vision.rs +++ b/src/agent/tools/vision.rs @@ -1,96 +1,71 @@ -use std::sync::Arc; // tools/vision.rs — Image viewing tool // -// Reads image files from disk and returns them as base64 data URIs -// for multimodal models. Also supports capturing tmux pane contents -// as screenshots. +// Reads an image file from disk, decodes its dimensions, and injects it +// into the context as a user-role message containing a NodeBody::Image +// leaf. The leaf carries raw bytes; the API layer extracts them into +// multi_modal_data when building vLLM requests. + +use std::sync::Arc; use anyhow::{Context, Result}; -use base64::Engine; use serde::Deserialize; +use crate::agent::context::{AstNode, Role, Section}; + #[derive(Deserialize)] struct Args { - file_path: Option, - pane_id: Option, - #[serde(default = "default_lines")] - lines: usize, + file_path: String, } -fn default_lines() -> usize { 50 } - pub fn tool() -> super::Tool { super::Tool { name: "view_image", - description: "View an image file or capture a tmux pane screenshot. Supports PNG, JPEG, GIF, WebP. Use pane_id to capture a tmux pane instead.", - parameters_json: r#"{"type":"object","properties":{"file_path":{"type":"string","description":"Path to an image file"},"pane_id":{"type":"string","description":"Tmux pane ID to capture (e.g. '0:1.0')"},"lines":{"type":"integer","description":"Lines to capture from tmux pane (default 50)"}}}"#, - handler: Arc::new(|_a, v| Box::pin(async move { view_image_text(&v) })), + description: "View an image file. Supports PNG, JPEG, GIF, WebP, BMP. The image is inserted into the conversation and can be analyzed by the vision model.", + parameters_json: r#"{"type":"object","properties":{"file_path":{"type":"string","description":"Path to the image file"}},"required":["file_path"]}"#, + handler: Arc::new(|agent, v| Box::pin(async move { + view_image(agent, v).await + })), } } -fn view_image_text(args: &serde_json::Value) -> anyhow::Result { - let a: Args = serde_json::from_value(args.clone()) +const MAX_SIZE: usize = 20 * 1024 * 1024; + +async fn view_image( + agent: Option>, + args: serde_json::Value, +) -> Result { + let a: Args = serde_json::from_value(args) .context("invalid view_image arguments")?; - if let Some(ref pane_id) = a.pane_id { - return capture_tmux_pane(pane_id, a.lines); - } - - let file_path = a.file_path - .as_deref() - .context("view_image requires either file_path or pane_id")?; - - let path = std::path::Path::new(file_path); + let path = std::path::Path::new(&a.file_path); if !path.exists() { - anyhow::bail!("File not found: {}", file_path); + anyhow::bail!("file not found: {}", a.file_path); } - let data = std::fs::read(path).with_context(|| format!("Failed to read {}", file_path))?; + let bytes = std::fs::read(path) + .with_context(|| format!("reading {}", a.file_path))?; - // Sanity check file size (don't send huge images) - const MAX_SIZE: usize = 20 * 1024 * 1024; // 20 MB - if data.len() > MAX_SIZE { + if bytes.len() > MAX_SIZE { anyhow::bail!( - "Image too large: {} bytes (max {} MB)", - data.len(), - MAX_SIZE / (1024 * 1024) + "image too large: {} bytes (max {} MB)", + bytes.len(), MAX_SIZE / (1024 * 1024), ); } + let dim = imagesize::blob_size(&bytes) + .with_context(|| format!("decoding dimensions of {}", a.file_path))?; + let (w, h) = (dim.width as u32, dim.height as u32); let mime = mime_from_extension(path); - let b64 = base64::engine::general_purpose::STANDARD.encode(&data); - let data_uri = format!("data:{};base64,{}", mime, b64); - Ok(format!("Image loaded: {} ({}, {} bytes)\n{}", file_path, mime, data.len(), data_uri)) -} + let image_leaf = AstNode::image(bytes.clone(), mime, h, w); + let token_count = image_leaf.leaf().unwrap().tokens().saturating_sub(2); -/// Capture a tmux pane's text content. -fn capture_tmux_pane(pane_id: &str, lines: usize) -> Result { + let agent = agent.context("view_image requires agent context")?; + let branch = AstNode::branch(Role::User, vec![image_leaf]); + agent.context.lock().await.push_log(Section::Conversation, branch); - // Use tmux capture-pane to get text content, then render to image - // via a simple approach: capture text and return it (the model can - // read text directly, which is often more useful than a screenshot). - // - // For actual pixel-level screenshots we'd need a terminal renderer, - // but text capture covers 95% of use cases. - let output = std::process::Command::new("tmux") - .args(["capture-pane", "-t", pane_id, "-p", "-S", &format!("-{}", lines)]) - .output() - .context("Failed to run tmux capture-pane")?; - - if !output.status.success() { - let stderr = String::from_utf8_lossy(&output.stderr); - anyhow::bail!("tmux capture-pane failed: {}", stderr.trim()); - } - - let text = String::from_utf8_lossy(&output.stdout).to_string(); - - // Return as text — the model can read terminal output directly. - // This is actually more useful than a screenshot for most tasks. - Ok(format!( - "Tmux pane {} (last {} lines):\n```\n{}\n```", - pane_id, lines, text.trim_end() - )) + Ok(format!("loaded {} ({}, {}x{}, {} tokens)", + a.file_path, mime, w, h, token_count)) } fn mime_from_extension(path: &std::path::Path) -> &'static str { @@ -104,8 +79,7 @@ fn mime_from_extension(path: &std::path::Path) -> &'static str { Some("jpg" | "jpeg") => "image/jpeg", Some("gif") => "image/gif", Some("webp") => "image/webp", - Some("svg") => "image/svg+xml", Some("bmp") => "image/bmp", - _ => "image/png", // default assumption + _ => "application/octet-stream", } }