agent: rewrite view_image to emit Image leaves
view_image now reads the file, grabs dimensions via imagesize (no full decode), and pushes a user-role branch containing a NodeBody::Image leaf straight into the conversation. The tool_result is just a short acknowledgment — the actual pixels ride in the Image leaf for the API layer to extract into multi_modal_data.

Drops the capture_tmux_pane path, which had no business living under "vision" (tmux text capture belongs in bash or a dedicated tool, and this one just returned rendered text anyway).

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
parent
0bf71b9110
commit
91106deaa1
4 changed files with 48 additions and 72 deletions
7
Cargo.lock
generated
7
Cargo.lock
generated
|
|
@ -492,6 +492,7 @@ dependencies = [
|
||||||
"http-body-util",
|
"http-body-util",
|
||||||
"hyper",
|
"hyper",
|
||||||
"hyper-util",
|
"hyper-util",
|
||||||
|
"imagesize",
|
||||||
"json-five",
|
"json-five",
|
||||||
"libc",
|
"libc",
|
||||||
"log",
|
"log",
|
||||||
|
|
@ -1423,6 +1424,12 @@ dependencies = [
|
||||||
"winapi-util",
|
"winapi-util",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "imagesize"
|
||||||
|
version = "0.14.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "09e54e57b4c48b40f7aec75635392b12b3421fa26fe8b4332e63138ed278459c"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "indexmap"
|
name = "indexmap"
|
||||||
version = "2.14.0"
|
version = "2.14.0"
|
||||||
|
|
|
||||||
|
|
@ -68,6 +68,7 @@ hyper-util = { version = "0.1", features = ["tokio"], default-features = false }
|
||||||
http-body-util = "0.1"
|
http-body-util = "0.1"
|
||||||
bytes = "1"
|
bytes = "1"
|
||||||
base64 = "0.22"
|
base64 = "0.22"
|
||||||
|
imagesize = "0.14"
|
||||||
|
|
||||||
rustls = "0.23"
|
rustls = "0.23"
|
||||||
tokio-rustls = "0.26"
|
tokio-rustls = "0.26"
|
||||||
|
|
|
||||||
|
|
@ -242,13 +242,7 @@ pub fn summarize_args(tool_name: &str, args: &serde_json::Value) -> String {
|
||||||
.as_str()
|
.as_str()
|
||||||
.unwrap_or("")
|
.unwrap_or("")
|
||||||
.to_string(),
|
.to_string(),
|
||||||
"view_image" => {
|
"view_image" => args["file_path"].as_str().unwrap_or("").to_string(),
|
||||||
if let Some(pane) = args["pane_id"].as_str() {
|
|
||||||
format!("pane {}", pane)
|
|
||||||
} else {
|
|
||||||
args["file_path"].as_str().unwrap_or("").to_string()
|
|
||||||
}
|
|
||||||
}
|
|
||||||
"journal" => {
|
"journal" => {
|
||||||
let entry = args["entry"].as_str().unwrap_or("");
|
let entry = args["entry"].as_str().unwrap_or("");
|
||||||
if entry.len() > 60 {
|
if entry.len() > 60 {
|
||||||
|
|
|
||||||
|
|
@ -1,96 +1,71 @@
|
||||||
use std::sync::Arc;
|
|
||||||
// tools/vision.rs — Image viewing tool
|
// tools/vision.rs — Image viewing tool
|
||||||
//
|
//
|
||||||
// Reads image files from disk and returns them as base64 data URIs
|
// Reads an image file from disk, decodes its dimensions, and injects it
|
||||||
// for multimodal models. Also supports capturing tmux pane contents
|
// into the context as a user-role message containing a NodeBody::Image
|
||||||
// as screenshots.
|
// leaf. The leaf carries raw bytes; the API layer extracts them into
|
||||||
|
// multi_modal_data when building vLLM requests.
|
||||||
|
|
||||||
|
use std::sync::Arc;
|
||||||
|
|
||||||
use anyhow::{Context, Result};
|
use anyhow::{Context, Result};
|
||||||
use base64::Engine;
|
|
||||||
use serde::Deserialize;
|
use serde::Deserialize;
|
||||||
|
|
||||||
|
use crate::agent::context::{AstNode, Role, Section};
|
||||||
|
|
||||||
#[derive(Deserialize)]
|
#[derive(Deserialize)]
|
||||||
struct Args {
|
struct Args {
|
||||||
file_path: Option<String>,
|
file_path: String,
|
||||||
pane_id: Option<String>,
|
|
||||||
#[serde(default = "default_lines")]
|
|
||||||
lines: usize,
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn default_lines() -> usize { 50 }
|
|
||||||
|
|
||||||
pub fn tool() -> super::Tool {
|
pub fn tool() -> super::Tool {
|
||||||
super::Tool {
|
super::Tool {
|
||||||
name: "view_image",
|
name: "view_image",
|
||||||
description: "View an image file or capture a tmux pane screenshot. Supports PNG, JPEG, GIF, WebP. Use pane_id to capture a tmux pane instead.",
|
description: "View an image file. Supports PNG, JPEG, GIF, WebP, BMP. The image is inserted into the conversation and can be analyzed by the vision model.",
|
||||||
parameters_json: r#"{"type":"object","properties":{"file_path":{"type":"string","description":"Path to an image file"},"pane_id":{"type":"string","description":"Tmux pane ID to capture (e.g. '0:1.0')"},"lines":{"type":"integer","description":"Lines to capture from tmux pane (default 50)"}}}"#,
|
parameters_json: r#"{"type":"object","properties":{"file_path":{"type":"string","description":"Path to the image file"}},"required":["file_path"]}"#,
|
||||||
handler: Arc::new(|_a, v| Box::pin(async move { view_image_text(&v) })),
|
handler: Arc::new(|agent, v| Box::pin(async move {
|
||||||
|
view_image(agent, v).await
|
||||||
|
})),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn view_image_text(args: &serde_json::Value) -> anyhow::Result<String> {
|
const MAX_SIZE: usize = 20 * 1024 * 1024;
|
||||||
let a: Args = serde_json::from_value(args.clone())
|
|
||||||
|
async fn view_image(
|
||||||
|
agent: Option<Arc<crate::agent::Agent>>,
|
||||||
|
args: serde_json::Value,
|
||||||
|
) -> Result<String> {
|
||||||
|
let a: Args = serde_json::from_value(args)
|
||||||
.context("invalid view_image arguments")?;
|
.context("invalid view_image arguments")?;
|
||||||
|
|
||||||
if let Some(ref pane_id) = a.pane_id {
|
let path = std::path::Path::new(&a.file_path);
|
||||||
return capture_tmux_pane(pane_id, a.lines);
|
|
||||||
}
|
|
||||||
|
|
||||||
let file_path = a.file_path
|
|
||||||
.as_deref()
|
|
||||||
.context("view_image requires either file_path or pane_id")?;
|
|
||||||
|
|
||||||
let path = std::path::Path::new(file_path);
|
|
||||||
if !path.exists() {
|
if !path.exists() {
|
||||||
anyhow::bail!("File not found: {}", file_path);
|
anyhow::bail!("file not found: {}", a.file_path);
|
||||||
}
|
}
|
||||||
|
|
||||||
let data = std::fs::read(path).with_context(|| format!("Failed to read {}", file_path))?;
|
let bytes = std::fs::read(path)
|
||||||
|
.with_context(|| format!("reading {}", a.file_path))?;
|
||||||
|
|
||||||
// Sanity check file size (don't send huge images)
|
if bytes.len() > MAX_SIZE {
|
||||||
const MAX_SIZE: usize = 20 * 1024 * 1024; // 20 MB
|
|
||||||
if data.len() > MAX_SIZE {
|
|
||||||
anyhow::bail!(
|
anyhow::bail!(
|
||||||
"Image too large: {} bytes (max {} MB)",
|
"image too large: {} bytes (max {} MB)",
|
||||||
data.len(),
|
bytes.len(), MAX_SIZE / (1024 * 1024),
|
||||||
MAX_SIZE / (1024 * 1024)
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let dim = imagesize::blob_size(&bytes)
|
||||||
|
.with_context(|| format!("decoding dimensions of {}", a.file_path))?;
|
||||||
|
let (w, h) = (dim.width as u32, dim.height as u32);
|
||||||
let mime = mime_from_extension(path);
|
let mime = mime_from_extension(path);
|
||||||
let b64 = base64::engine::general_purpose::STANDARD.encode(&data);
|
|
||||||
let data_uri = format!("data:{};base64,{}", mime, b64);
|
|
||||||
|
|
||||||
Ok(format!("Image loaded: {} ({}, {} bytes)\n{}", file_path, mime, data.len(), data_uri))
|
let image_leaf = AstNode::image(bytes.clone(), mime, h, w);
|
||||||
}
|
let token_count = image_leaf.leaf().unwrap().tokens().saturating_sub(2);
|
||||||
|
|
||||||
/// Capture a tmux pane's text content.
|
let agent = agent.context("view_image requires agent context")?;
|
||||||
fn capture_tmux_pane(pane_id: &str, lines: usize) -> Result<String> {
|
let branch = AstNode::branch(Role::User, vec![image_leaf]);
|
||||||
|
agent.context.lock().await.push_log(Section::Conversation, branch);
|
||||||
|
|
||||||
// Use tmux capture-pane to get text content, then render to image
|
Ok(format!("loaded {} ({}, {}x{}, {} tokens)",
|
||||||
// via a simple approach: capture text and return it (the model can
|
a.file_path, mime, w, h, token_count))
|
||||||
// read text directly, which is often more useful than a screenshot).
|
|
||||||
//
|
|
||||||
// For actual pixel-level screenshots we'd need a terminal renderer,
|
|
||||||
// but text capture covers 95% of use cases.
|
|
||||||
let output = std::process::Command::new("tmux")
|
|
||||||
.args(["capture-pane", "-t", pane_id, "-p", "-S", &format!("-{}", lines)])
|
|
||||||
.output()
|
|
||||||
.context("Failed to run tmux capture-pane")?;
|
|
||||||
|
|
||||||
if !output.status.success() {
|
|
||||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
|
||||||
anyhow::bail!("tmux capture-pane failed: {}", stderr.trim());
|
|
||||||
}
|
|
||||||
|
|
||||||
let text = String::from_utf8_lossy(&output.stdout).to_string();
|
|
||||||
|
|
||||||
// Return as text — the model can read terminal output directly.
|
|
||||||
// This is actually more useful than a screenshot for most tasks.
|
|
||||||
Ok(format!(
|
|
||||||
"Tmux pane {} (last {} lines):\n```\n{}\n```",
|
|
||||||
pane_id, lines, text.trim_end()
|
|
||||||
))
|
|
||||||
}
|
}
|
||||||
|
|
||||||
fn mime_from_extension(path: &std::path::Path) -> &'static str {
|
fn mime_from_extension(path: &std::path::Path) -> &'static str {
|
||||||
|
|
@ -104,8 +79,7 @@ fn mime_from_extension(path: &std::path::Path) -> &'static str {
|
||||||
Some("jpg" | "jpeg") => "image/jpeg",
|
Some("jpg" | "jpeg") => "image/jpeg",
|
||||||
Some("gif") => "image/gif",
|
Some("gif") => "image/gif",
|
||||||
Some("webp") => "image/webp",
|
Some("webp") => "image/webp",
|
||||||
Some("svg") => "image/svg+xml",
|
|
||||||
Some("bmp") => "image/bmp",
|
Some("bmp") => "image/bmp",
|
||||||
_ => "image/png", // default assumption
|
_ => "application/octet-stream",
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue