From 91106deaa12233aeab38d84644e13de5b97d9dda Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Thu, 16 Apr 2026 18:06:25 -0400
Subject: [PATCH] agent: rewrite view_image to emit Image leaves
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

view_image now reads the file, grabs dimensions via imagesize (no full
decode), and pushes a user-role branch containing a NodeBody::Image
leaf straight into the conversation. The tool_result is just a short
acknowledgment — the actual pixels ride in the Image leaf for the API
layer to extract into multi_modal_data.

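Drops the capture_tmux_pane path, which had no business living under
"vision" (tmux text capture belongs in bash or a dedicated tool, and
this one just returned rendered text anyway).

Also drops the SVG mime mapping, and unknown extensions now map to
application/octet-stream instead of being assumed to be PNG.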

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
---
 Cargo.lock                |   7 +++
 Cargo.toml                |   1 +
 src/agent/tools/mod.rs    |   8 +--
 src/agent/tools/vision.rs | 104 ++++++++++++++------------------------
 4 files changed, 48 insertions(+), 72 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index dfca607..c76a7cd 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -492,6 +492,7 @@ dependencies = [
  "http-body-util",
  "hyper",
  "hyper-util",
+ "imagesize",
  "json-five",
  "libc",
  "log",
@@ -1423,6 +1424,12 @@ dependencies = [
  "winapi-util",
 ]
 
+[[package]]
+name = "imagesize"
+version = "0.14.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "09e54e57b4c48b40f7aec75635392b12b3421fa26fe8b4332e63138ed278459c"
+
 [[package]]
 name = "indexmap"
 version = "2.14.0"
diff --git a/Cargo.toml b/Cargo.toml
index 7cdf851..0996f94 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -68,6 +68,7 @@ hyper-util = { version = "0.1", features = ["tokio"], default-features = false }
 http-body-util = "0.1"
 bytes = "1"
 base64 = "0.22"
+imagesize = "0.14"
 
 rustls = "0.23"
 tokio-rustls = "0.26"
diff --git a/src/agent/tools/mod.rs b/src/agent/tools/mod.rs
index f72b015..8904fc3 100644
--- a/src/agent/tools/mod.rs
+++ b/src/agent/tools/mod.rs
@@ -242,13 +242,7 @@ pub fn summarize_args(tool_name: &str, args: &serde_json::Value) -> String {
             .as_str()
             .unwrap_or("")
             .to_string(),
-        "view_image" => {
-            if let Some(pane) = args["pane_id"].as_str() {
-                format!("pane {}", pane)
-            } else {
-                args["file_path"].as_str().unwrap_or("").to_string()
-            }
-        }
+        "view_image" => args["file_path"].as_str().unwrap_or("").to_string(),
         "journal" => {
             let entry = args["entry"].as_str().unwrap_or("");
             if entry.len() > 60 {
diff --git a/src/agent/tools/vision.rs b/src/agent/tools/vision.rs
index 83559f6..0e36888 100644
--- a/src/agent/tools/vision.rs
+++ b/src/agent/tools/vision.rs
@@ -1,96 +1,71 @@
-use std::sync::Arc;
 // tools/vision.rs — Image viewing tool
 //
-// Reads image files from disk and returns them as base64 data URIs
-// for multimodal models. Also supports capturing tmux pane contents
-// as screenshots.
+// Reads an image file from disk, decodes its dimensions, and injects it
+// into the context as a user-role message containing a NodeBody::Image
+// leaf. The leaf carries raw bytes; the API layer extracts them into
+// multi_modal_data when building vLLM requests.
+
+use std::sync::Arc;
 
 use anyhow::{Context, Result};
-use base64::Engine;
 use serde::Deserialize;
 
+use crate::agent::context::{AstNode, Role, Section};
+
 #[derive(Deserialize)]
 struct Args {
-    file_path: Option<String>,
-    pane_id: Option<String>,
-    #[serde(default = "default_lines")]
-    lines: usize,
+    file_path: String, // required: path of the image file to load
 }
 
-fn default_lines() -> usize { 50 }
-
 pub fn tool() -> super::Tool {
     super::Tool {
         name: "view_image",
-        description: "View an image file or capture a tmux pane screenshot. Supports PNG, JPEG, GIF, WebP. Use pane_id to capture a tmux pane instead.",
-        parameters_json: r#"{"type":"object","properties":{"file_path":{"type":"string","description":"Path to an image file"},"pane_id":{"type":"string","description":"Tmux pane ID to capture (e.g. '0:1.0')"},"lines":{"type":"integer","description":"Lines to capture from tmux pane (default 50)"}}}"#,
-        handler: Arc::new(|_a, v| Box::pin(async move { view_image_text(&v) })),
+        description: "View an image file. Supports PNG, JPEG, GIF, WebP, BMP. The image is inserted into the conversation and can be analyzed by the vision model.",
+        parameters_json: r#"{"type":"object","properties":{"file_path":{"type":"string","description":"Path to the image file"}},"required":["file_path"]}"#,
+        handler: Arc::new(|agent, v| Box::pin(async move {
+            view_image(agent, v).await // forwards the agent handle and the raw JSON args
+        })),
     }
 }
 
-fn view_image_text(args: &serde_json::Value) -> anyhow::Result<String> {
-    let a: Args = serde_json::from_value(args.clone())
+const MAX_SIZE: usize = 20 * 1024 * 1024;
+
+async fn view_image(
+    agent: Option<Arc<crate::agent::Agent>>,
+    args: serde_json::Value,
+) -> Result<String> {
+    let a: Args = serde_json::from_value(args)
         .context("invalid view_image arguments")?;
 
-    if let Some(ref pane_id) = a.pane_id {
-        return capture_tmux_pane(pane_id, a.lines);
-    }
-
-    let file_path = a.file_path
-        .as_deref()
-        .context("view_image requires either file_path or pane_id")?;
-
-    let path = std::path::Path::new(file_path);
+    let path = std::path::Path::new(&a.file_path);
     if !path.exists() {
-        anyhow::bail!("File not found: {}", file_path);
+        anyhow::bail!("file not found: {}", a.file_path);
     }
 
-    let data = std::fs::read(path).with_context(|| format!("Failed to read {}", file_path))?;
+    let bytes = std::fs::read(path)
+        .with_context(|| format!("reading {}", a.file_path))?;
 
-    // Sanity check file size (don't send huge images)
-    const MAX_SIZE: usize = 20 * 1024 * 1024; // 20 MB
-    if data.len() > MAX_SIZE {
+    if bytes.len() > MAX_SIZE {
         anyhow::bail!(
-            "Image too large: {} bytes (max {} MB)",
-            data.len(),
-            MAX_SIZE / (1024 * 1024)
+            "image too large: {} bytes (max {} MB)",
+            bytes.len(), MAX_SIZE / (1024 * 1024),
         );
     }
 
+    let dim = imagesize::blob_size(&bytes) // header probe only, no full decode
+        .with_context(|| format!("decoding dimensions of {}", a.file_path))?;
+    let (w, h) = (u32::try_from(dim.width)?, u32::try_from(dim.height)?);
     let mime = mime_from_extension(path);
-    let b64 = base64::engine::general_purpose::STANDARD.encode(&data);
-    let data_uri = format!("data:{};base64,{}", mime, b64);
 
-    Ok(format!("Image loaded: {} ({}, {} bytes)\n{}", file_path, mime, data.len(), data_uri))
-}
+    let image_leaf = AstNode::image(bytes, mime, h, w); // bytes moved, not copied
+    let token_count = image_leaf.leaf().unwrap().tokens().saturating_sub(2);
 
-/// Capture a tmux pane's text content.
-fn capture_tmux_pane(pane_id: &str, lines: usize) -> Result<String> {
+    let agent = agent.context("view_image requires agent context")?;
+    let branch = AstNode::branch(Role::User, vec![image_leaf]);
+    agent.context.lock().await.push_log(Section::Conversation, branch);
 
-    // Use tmux capture-pane to get text content, then render to image
-    // via a simple approach: capture text and return it (the model can
-    // read text directly, which is often more useful than a screenshot).
-    //
-    // For actual pixel-level screenshots we'd need a terminal renderer,
-    // but text capture covers 95% of use cases.
-    let output = std::process::Command::new("tmux")
-        .args(["capture-pane", "-t", pane_id, "-p", "-S", &format!("-{}", lines)])
-        .output()
-        .context("Failed to run tmux capture-pane")?;
-
-    if !output.status.success() {
-        let stderr = String::from_utf8_lossy(&output.stderr);
-        anyhow::bail!("tmux capture-pane failed: {}", stderr.trim());
-    }
-
-    let text = String::from_utf8_lossy(&output.stdout).to_string();
-
-    // Return as text — the model can read terminal output directly.
-    // This is actually more useful than a screenshot for most tasks.
-    Ok(format!(
-        "Tmux pane {} (last {} lines):\n```\n{}\n```",
-        pane_id, lines, text.trim_end()
-    ))
+    Ok(format!("loaded {} ({}, {}x{}, {} tokens)",
+        a.file_path, mime, w, h, token_count))
 }
 
 fn mime_from_extension(path: &std::path::Path) -> &'static str {
@@ -104,8 +79,7 @@ fn mime_from_extension(path: &std::path::Path) -> &'static str {
         Some("jpg" | "jpeg") => "image/jpeg",
         Some("gif") => "image/gif",
         Some("webp") => "image/webp",
-        Some("svg") => "image/svg+xml",
         Some("bmp") => "image/bmp",
-        _ => "image/png", // default assumption
+        _ => "application/octet-stream",
     }
 }