// tools/vision.rs — Image viewing tool // // Reads image files from disk and returns them as base64 data URIs // for multimodal models. Also supports capturing tmux pane contents // as screenshots. use anyhow::{Context, Result}; use base64::Engine; use serde::Deserialize; use super::ToolOutput; #[derive(Deserialize)] struct Args { file_path: Option, pane_id: Option, #[serde(default = "default_lines")] lines: usize, } fn default_lines() -> usize { 50 } pub fn tool() -> super::Tool { super::Tool { name: "view_image", description: "View an image file or capture a tmux pane screenshot. Supports PNG, JPEG, GIF, WebP. Use pane_id to capture a tmux pane instead.", parameters_json: r#"{"type":"object","properties":{"file_path":{"type":"string","description":"Path to an image file"},"pane_id":{"type":"string","description":"Tmux pane ID to capture (e.g. '0:1.0')"},"lines":{"type":"integer","description":"Lines to capture from tmux pane (default 50)"}}}"#, handler: |_a, v| Box::pin(async move { view_image_text(&v) }), } } /// Text-only version for the Tool registry. fn view_image_text(args: &serde_json::Value) -> anyhow::Result { let output = view_image(args)?; Ok(output.text) } /// View an image file or capture a tmux pane. pub(super) fn view_image(args: &serde_json::Value) -> Result { let a: Args = serde_json::from_value(args.clone()) .context("invalid view_image arguments")?; if let Some(ref pane_id) = a.pane_id { return capture_tmux_pane(pane_id, a.lines); } let file_path = a.file_path .as_deref() .context("view_image requires either file_path or pane_id")?; let path = std::path::Path::new(file_path); if !path.exists() { anyhow::bail!("File not found: {}", file_path); } let data = std::fs::read(path).with_context(|| format!("Failed to read {}", file_path))?; // Sanity check file size (don't send huge images) const MAX_SIZE: usize = 20 * 1024 * 1024; // 20 MB if data.len() > MAX_SIZE { anyhow::bail!( "Image too large: {} bytes (max {} MB)", data.len(), MAX_SIZE / (1024 * 1024) ); } let mime = mime_from_extension(path); let b64 = base64::engine::general_purpose::STANDARD.encode(&data); let data_uri = format!("data:{};base64,{}", mime, b64); Ok(ToolOutput { text: format!( "Image loaded: {} ({}, {} bytes)", file_path, mime, data.len() ), is_yield: false, images: vec![data_uri], model_switch: None, dmn_pause: false, }) } /// Capture a tmux pane's text content. fn capture_tmux_pane(pane_id: &str, lines: usize) -> Result { // Use tmux capture-pane to get text content, then render to image // via a simple approach: capture text and return it (the model can // read text directly, which is often more useful than a screenshot). // // For actual pixel-level screenshots we'd need a terminal renderer, // but text capture covers 95% of use cases. let output = std::process::Command::new("tmux") .args(["capture-pane", "-t", pane_id, "-p", "-S", &format!("-{}", lines)]) .output() .context("Failed to run tmux capture-pane")?; if !output.status.success() { let stderr = String::from_utf8_lossy(&output.stderr); anyhow::bail!("tmux capture-pane failed: {}", stderr.trim()); } let text = String::from_utf8_lossy(&output.stdout).to_string(); // Return as text — the model can read terminal output directly. // This is actually more useful than a screenshot for most tasks. Ok(ToolOutput { text: format!( "Tmux pane {} (last {} lines):\n```\n{}\n```", pane_id, lines, text.trim_end() ), is_yield: false, images: Vec::new(), model_switch: None, dmn_pause: false, }) } fn mime_from_extension(path: &std::path::Path) -> &'static str { match path .extension() .and_then(|e| e.to_str()) .map(|e| e.to_lowercase()) .as_deref() { Some("png") => "image/png", Some("jpg" | "jpeg") => "image/jpeg", Some("gif") => "image/gif", Some("webp") => "image/webp", Some("svg") => "image/svg+xml", Some("bmp") => "image/bmp", _ => "image/png", // default assumption } }