2026-03-25 00:52:41 -04:00
|
|
|
// tools/vision.rs — Image viewing tool
|
|
|
|
|
//
|
|
|
|
|
// Reads image files from disk and returns them as base64 data URIs
|
|
|
|
|
// for multimodal models. Also supports capturing tmux pane contents,
// which are returned as plain text rather than as a rendered screenshot.
|
|
|
|
|
|
|
|
|
|
use anyhow::{Context, Result};
|
|
|
|
|
use base64::Engine;
|
|
|
|
|
use serde::Deserialize;
|
|
|
|
|
|
|
|
|
|
#[derive(Deserialize)]
|
|
|
|
|
struct Args {
|
|
|
|
|
file_path: Option<String>,
|
|
|
|
|
pane_id: Option<String>,
|
|
|
|
|
#[serde(default = "default_lines")]
|
|
|
|
|
lines: usize,
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Serde default for the `lines` argument when the caller omits it.
fn default_lines() -> usize {
    const DEFAULT_CAPTURE_LINES: usize = 50;
    DEFAULT_CAPTURE_LINES
}
|
|
|
|
|
|
2026-04-04 15:34:07 -04:00
|
|
|
pub fn tool() -> super::Tool {
|
2026-04-04 15:50:14 -04:00
|
|
|
super::Tool {
|
|
|
|
|
name: "view_image",
|
|
|
|
|
description: "View an image file or capture a tmux pane screenshot. Supports PNG, JPEG, GIF, WebP. Use pane_id to capture a tmux pane instead.",
|
|
|
|
|
parameters_json: r#"{"type":"object","properties":{"file_path":{"type":"string","description":"Path to an image file"},"pane_id":{"type":"string","description":"Tmux pane ID to capture (e.g. '0:1.0')"},"lines":{"type":"integer","description":"Lines to capture from tmux pane (default 50)"}}}"#,
|
|
|
|
|
handler: |_a, v| Box::pin(async move { view_image_text(&v) }),
|
|
|
|
|
}
|
2026-04-04 15:34:07 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn view_image_text(args: &serde_json::Value) -> anyhow::Result<String> {
|
2026-03-25 00:52:41 -04:00
|
|
|
let a: Args = serde_json::from_value(args.clone())
|
|
|
|
|
.context("invalid view_image arguments")?;
|
|
|
|
|
|
|
|
|
|
if let Some(ref pane_id) = a.pane_id {
|
|
|
|
|
return capture_tmux_pane(pane_id, a.lines);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let file_path = a.file_path
|
|
|
|
|
.as_deref()
|
|
|
|
|
.context("view_image requires either file_path or pane_id")?;
|
|
|
|
|
|
|
|
|
|
let path = std::path::Path::new(file_path);
|
|
|
|
|
if !path.exists() {
|
|
|
|
|
anyhow::bail!("File not found: {}", file_path);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let data = std::fs::read(path).with_context(|| format!("Failed to read {}", file_path))?;
|
|
|
|
|
|
|
|
|
|
// Sanity check file size (don't send huge images)
|
|
|
|
|
const MAX_SIZE: usize = 20 * 1024 * 1024; // 20 MB
|
|
|
|
|
if data.len() > MAX_SIZE {
|
|
|
|
|
anyhow::bail!(
|
|
|
|
|
"Image too large: {} bytes (max {} MB)",
|
|
|
|
|
data.len(),
|
|
|
|
|
MAX_SIZE / (1024 * 1024)
|
|
|
|
|
);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let mime = mime_from_extension(path);
|
|
|
|
|
let b64 = base64::engine::general_purpose::STANDARD.encode(&data);
|
|
|
|
|
let data_uri = format!("data:{};base64,{}", mime, b64);
|
|
|
|
|
|
2026-04-04 16:05:33 -04:00
|
|
|
Ok(format!("Image loaded: {} ({}, {} bytes)\n{}", file_path, mime, data.len(), data_uri))
|
2026-03-25 00:52:41 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Capture a tmux pane's text content.
|
2026-04-04 16:05:33 -04:00
|
|
|
fn capture_tmux_pane(pane_id: &str, lines: usize) -> Result<String> {
|
2026-03-25 00:52:41 -04:00
|
|
|
|
|
|
|
|
// Use tmux capture-pane to get text content, then render to image
|
|
|
|
|
// via a simple approach: capture text and return it (the model can
|
|
|
|
|
// read text directly, which is often more useful than a screenshot).
|
|
|
|
|
//
|
|
|
|
|
// For actual pixel-level screenshots we'd need a terminal renderer,
|
|
|
|
|
// but text capture covers 95% of use cases.
|
|
|
|
|
let output = std::process::Command::new("tmux")
|
|
|
|
|
.args(["capture-pane", "-t", pane_id, "-p", "-S", &format!("-{}", lines)])
|
|
|
|
|
.output()
|
|
|
|
|
.context("Failed to run tmux capture-pane")?;
|
|
|
|
|
|
|
|
|
|
if !output.status.success() {
|
|
|
|
|
let stderr = String::from_utf8_lossy(&output.stderr);
|
|
|
|
|
anyhow::bail!("tmux capture-pane failed: {}", stderr.trim());
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
let text = String::from_utf8_lossy(&output.stdout).to_string();
|
|
|
|
|
|
|
|
|
|
// Return as text — the model can read terminal output directly.
|
|
|
|
|
// This is actually more useful than a screenshot for most tasks.
|
2026-04-04 16:05:33 -04:00
|
|
|
Ok(format!(
|
|
|
|
|
"Tmux pane {} (last {} lines):\n```\n{}\n```",
|
|
|
|
|
pane_id, lines, text.trim_end()
|
|
|
|
|
))
|
2026-03-25 00:52:41 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Map a path's file extension to an image MIME type.
///
/// The extension is compared case-insensitively. Paths with no extension, a
/// non-UTF-8 extension, or an unrecognized extension fall back to
/// `image/png`.
fn mime_from_extension(path: &std::path::Path) -> &'static str {
    // Default assumption when the extension tells us nothing.
    const FALLBACK: &str = "image/png";

    let ext = match path.extension().and_then(|os| os.to_str()) {
        Some(raw) => raw.to_lowercase(),
        None => return FALLBACK,
    };

    match ext.as_str() {
        "png" => "image/png",
        "jpg" | "jpeg" => "image/jpeg",
        "gif" => "image/gif",
        "webp" => "image/webp",
        "svg" => "image/svg+xml",
        "bmp" => "image/bmp",
        _ => FALLBACK,
    }
}
|