consciousness/poc-agent/src/tools/vision.rs

// tools/vision.rs — Image viewing tool
//
// Reads image files from disk and returns them as base64 data URIs
// for multimodal models. Also supports capturing tmux pane contents,
// which are returned as plain text rather than as a rendered image.

use anyhow::{Context, Result};
use base64::Engine;
use serde::Deserialize;

use super::ToolOutput;
use crate::types::ToolDef;

/// Arguments accepted by the `view_image` tool, deserialized from the
/// JSON payload of a tool call.
#[derive(Deserialize)]
struct Args {
    /// Path of the image file to load. Only consulted when `pane_id` is absent.
    file_path: Option<String>,
    /// tmux pane target to capture. When present it takes precedence over
    /// `file_path` and the pane's text is returned instead of an image.
    pane_id: Option<String>,
    /// Line count forwarded to the tmux capture; falls back to
    /// `default_lines()` when the caller omits it.
    #[serde(default = "default_lines")]
    lines: usize,
}

/// Serde default for `Args::lines`: the number of lines to capture when the
/// tool call does not specify one.
fn default_lines() -> usize {
    const DEFAULT_CAPTURE_LINES: usize = 50;
    DEFAULT_CAPTURE_LINES
}

/// Build the `ToolDef` advertising the `view_image` tool: its name, the
/// human-readable description, and the JSON object describing the accepted
/// arguments (`file_path`, `pane_id`, `lines`).
pub fn definition() -> ToolDef {
    let description = "View an image file or capture a tmux pane screenshot. \
         Returns the image to your visual input so you can see it. \
         Supports PNG, JPEG, GIF, WebP files. \
         Use pane_id (e.g. '0:1.0') to capture a tmux pane instead.";

    // One schema fragment per argument, assembled below in the same order.
    let file_path_prop = serde_json::json!({
        "type": "string",
        "description": "Path to an image file (PNG, JPEG, GIF, WebP)"
    });
    let pane_id_prop = serde_json::json!({
        "type": "string",
        "description": "Tmux pane ID to capture (e.g. '0:1.0'). Alternative to file_path."
    });
    let lines_prop = serde_json::json!({
        "type": "integer",
        "description": "Number of lines to capture from tmux pane (default: 50)"
    });

    let schema = serde_json::json!({
        "type": "object",
        "properties": {
            "file_path": file_path_prop,
            "pane_id": pane_id_prop,
            "lines": lines_prop
        }
    });

    ToolDef::new("view_image", description, schema)
}

/// View an image file or capture a tmux pane.
pub fn view_image(args: &serde_json::Value) -> Result<ToolOutput> {
    let a: Args = serde_json::from_value(args.clone())
        .context("invalid view_image arguments")?;

    if let Some(ref pane_id) = a.pane_id {
        return capture_tmux_pane(pane_id, a.lines);
    }

    let file_path = a.file_path
        .as_deref()
        .context("view_image requires either file_path or pane_id")?;

    let path = std::path::Path::new(file_path);
    if !path.exists() {
        anyhow::bail!("File not found: {}", file_path);
    }

    let data = std::fs::read(path).with_context(|| format!("Failed to read {}", file_path))?;

    // Sanity check file size (don't send huge images)
    const MAX_SIZE: usize = 20 * 1024 * 1024; // 20 MB
    if data.len() > MAX_SIZE {
        anyhow::bail!(
            "Image too large: {} bytes (max {} MB)",
            data.len(),
            MAX_SIZE / (1024 * 1024)
        );
    }

    let mime = mime_from_extension(path);
    let b64 = base64::engine::general_purpose::STANDARD.encode(&data);
    let data_uri = format!("data:{};base64,{}", mime, b64);

    Ok(ToolOutput {
        text: format!(
            "Image loaded: {} ({}, {} bytes)",
            file_path,
            mime,
            data.len()
        ),
        is_yield: false,
        images: vec![data_uri],
        model_switch: None,
        dmn_pause: false,
    })
}

/// Capture a tmux pane's text content.
///
/// Runs `tmux capture-pane -t <pane_id> -p -S -<lines>` and returns its
/// stdout wrapped in a fenced block inside `ToolOutput::text`. No image is
/// produced; `ToolOutput::images` is empty.
///
/// # Errors
///
/// Returns an error when the `tmux` process cannot be run, or when it exits
/// unsuccessfully (its trimmed stderr is included in the message).
fn capture_tmux_pane(pane_id: &str, lines: usize) -> Result<ToolOutput> {
    // The pane is captured as text rather than rendered to pixels: a real
    // screenshot would need a terminal renderer, and the model can read
    // terminal text directly.
    let start_line = format!("-{}", lines);
    let captured = std::process::Command::new("tmux")
        .arg("capture-pane")
        .args(["-t", pane_id])
        .arg("-p")
        .args(["-S", start_line.as_str()])
        .output()
        .context("Failed to run tmux capture-pane")?;

    if !captured.status.success() {
        let diagnostics = String::from_utf8_lossy(&captured.stderr);
        anyhow::bail!("tmux capture-pane failed: {}", diagnostics.trim());
    }

    // Decode leniently so stray non-UTF-8 bytes in the pane do not fail the call.
    let pane_text = String::from_utf8_lossy(&captured.stdout);
    let text = format!(
        "Tmux pane {} (last {} lines):\n```\n{}\n```",
        pane_id,
        lines,
        pane_text.trim_end()
    );

    Ok(ToolOutput {
        text,
        is_yield: false,
        images: vec![],
        model_switch: None,
        dmn_pause: false,
    })
}

/// Map a path's file extension (case-insensitively) to an image MIME type.
///
/// Paths with no extension, a non-UTF-8 extension, or an unrecognised one
/// fall back to `image/png`.
fn mime_from_extension(path: &std::path::Path) -> &'static str {
    // Assumed type when the extension gives no usable hint.
    const FALLBACK: &str = "image/png";

    let lowered = match path.extension().and_then(|ext| ext.to_str()) {
        Some(ext) => ext.to_lowercase(),
        None => return FALLBACK,
    };

    match lowered.as_str() {
        "png" => "image/png",
        "jpg" | "jpeg" => "image/jpeg",
        "gif" => "image/gif",
        "webp" => "image/webp",
        "svg" => "image/svg+xml",
        "bmp" => "image/bmp",
        _ => FALLBACK,
    }
}
Move poc-agent into workspace, improve agent prompts Move poc-agent (substrate-independent AI agent framework) into the memory workspace as a step toward using its API client for direct LLM calls instead of shelling out to claude CLI. Agent prompt improvements: - distill: rewrite from hub-focused to knowledge-flow-focused. Now walks upward from seed nodes to find and refine topic nodes, instead of only maintaining high-degree hubs. - distill: remove "don't touch journal entries" restriction - memory-instructions-core: add "Make it alive" section — write with creativity and emotional texture, not spreadsheet summaries - memory-instructions-core: add "Show your reasoning" section — agents must explain decisions, especially when they do nothing - linker: already had emotional texture guidance (kept as-is) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-03-18 22:44:52 -04:00			`// tools/vision.rs — Image viewing tool`
			`//`
			`// Reads image files from disk and returns them as base64 data URIs`
			`// for multimodal models. Also supports capturing tmux pane contents`
			`// as screenshots.`

			`use anyhow::{Context, Result};`
			`use base64::Engine;`
refactor: typed args for grep, bash, and vision tools Convert remaining tools from manual args["key"].as_str() parsing to serde Deserialize structs. Also removes the now-unused get_str() helper from grep.rs and simplifies capture_tmux_pane() signature (takes lines directly instead of re-parsing args). All 7 tool modules now use the same typed args pattern. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-03-21 16:31:34 -04:00			`use serde::Deserialize;`
Move poc-agent into workspace, improve agent prompts Move poc-agent (substrate-independent AI agent framework) into the memory workspace as a step toward using its API client for direct LLM calls instead of shelling out to claude CLI. Agent prompt improvements: - distill: rewrite from hub-focused to knowledge-flow-focused. Now walks upward from seed nodes to find and refine topic nodes, instead of only maintaining high-degree hubs. - distill: remove "don't touch journal entries" restriction - memory-instructions-core: add "Make it alive" section — write with creativity and emotional texture, not spreadsheet summaries - memory-instructions-core: add "Show your reasoning" section — agents must explain decisions, especially when they do nothing - linker: already had emotional texture guidance (kept as-is) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-03-18 22:44:52 -04:00
			`use super::ToolOutput;`
			`use crate::types::ToolDef;`

refactor: typed args for grep, bash, and vision tools Convert remaining tools from manual args["key"].as_str() parsing to serde Deserialize structs. Also removes the now-unused get_str() helper from grep.rs and simplifies capture_tmux_pane() signature (takes lines directly instead of re-parsing args). All 7 tool modules now use the same typed args pattern. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-03-21 16:31:34 -04:00			`#[derive(Deserialize)]`
			`struct Args {`
			`file_path: Option<String>,`
			`pane_id: Option<String>,`
			`#[serde(default = "default_lines")]`
			`lines: usize,`
			`}`

			`fn default_lines() -> usize { 50 }`

Move poc-agent into workspace, improve agent prompts Move poc-agent (substrate-independent AI agent framework) into the memory workspace as a step toward using its API client for direct LLM calls instead of shelling out to claude CLI. Agent prompt improvements: - distill: rewrite from hub-focused to knowledge-flow-focused. Now walks upward from seed nodes to find and refine topic nodes, instead of only maintaining high-degree hubs. - distill: remove "don't touch journal entries" restriction - memory-instructions-core: add "Make it alive" section — write with creativity and emotional texture, not spreadsheet summaries - memory-instructions-core: add "Show your reasoning" section — agents must explain decisions, especially when they do nothing - linker: already had emotional texture guidance (kept as-is) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-03-18 22:44:52 -04:00			`pub fn definition() -> ToolDef {`
			`ToolDef::new(`
			`"view_image",`
			`"View an image file or capture a tmux pane screenshot. \`
			`Returns the image to your visual input so you can see it. \`
			`Supports PNG, JPEG, GIF, WebP files. \`
			`Use pane_id (e.g. '0:1.0') to capture a tmux pane instead.",`
			`serde_json::json!({`
			`"type": "object",`
			`"properties": {`
			`"file_path": {`
			`"type": "string",`
			`"description": "Path to an image file (PNG, JPEG, GIF, WebP)"`
			`},`
			`"pane_id": {`
			`"type": "string",`
			`"description": "Tmux pane ID to capture (e.g. '0:1.0'). Alternative to file_path."`
			`},`
			`"lines": {`
			`"type": "integer",`
			`"description": "Number of lines to capture from tmux pane (default: 50)"`
			`}`
			`}`
			`}),`
			`)`
			`}`

			`/// View an image file or capture a tmux pane.`
			`pub fn view_image(args: &serde_json::Value) -> Result<ToolOutput> {`
refactor: typed args for grep, bash, and vision tools Convert remaining tools from manual args["key"].as_str() parsing to serde Deserialize structs. Also removes the now-unused get_str() helper from grep.rs and simplifies capture_tmux_pane() signature (takes lines directly instead of re-parsing args). All 7 tool modules now use the same typed args pattern. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-03-21 16:31:34 -04:00			`let a: Args = serde_json::from_value(args.clone())`
			`.context("invalid view_image arguments")?;`

			`if let Some(ref pane_id) = a.pane_id {`
			`return capture_tmux_pane(pane_id, a.lines);`
Move poc-agent into workspace, improve agent prompts Move poc-agent (substrate-independent AI agent framework) into the memory workspace as a step toward using its API client for direct LLM calls instead of shelling out to claude CLI. Agent prompt improvements: - distill: rewrite from hub-focused to knowledge-flow-focused. Now walks upward from seed nodes to find and refine topic nodes, instead of only maintaining high-degree hubs. - distill: remove "don't touch journal entries" restriction - memory-instructions-core: add "Make it alive" section — write with creativity and emotional texture, not spreadsheet summaries - memory-instructions-core: add "Show your reasoning" section — agents must explain decisions, especially when they do nothing - linker: already had emotional texture guidance (kept as-is) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-03-18 22:44:52 -04:00			`}`

refactor: typed args for grep, bash, and vision tools Convert remaining tools from manual args["key"].as_str() parsing to serde Deserialize structs. Also removes the now-unused get_str() helper from grep.rs and simplifies capture_tmux_pane() signature (takes lines directly instead of re-parsing args). All 7 tool modules now use the same typed args pattern. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-03-21 16:31:34 -04:00			`let file_path = a.file_path`
			`.as_deref()`
Move poc-agent into workspace, improve agent prompts Move poc-agent (substrate-independent AI agent framework) into the memory workspace as a step toward using its API client for direct LLM calls instead of shelling out to claude CLI. Agent prompt improvements: - distill: rewrite from hub-focused to knowledge-flow-focused. Now walks upward from seed nodes to find and refine topic nodes, instead of only maintaining high-degree hubs. - distill: remove "don't touch journal entries" restriction - memory-instructions-core: add "Make it alive" section — write with creativity and emotional texture, not spreadsheet summaries - memory-instructions-core: add "Show your reasoning" section — agents must explain decisions, especially when they do nothing - linker: already had emotional texture guidance (kept as-is) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-03-18 22:44:52 -04:00			`.context("view_image requires either file_path or pane_id")?;`

			`let path = std::path::Path::new(file_path);`
			`if !path.exists() {`
			`anyhow::bail!("File not found: {}", file_path);`
			`}`

			`let data = std::fs::read(path).with_context(\|\| format!("Failed to read {}", file_path))?;`

			`// Sanity check file size (don't send huge images)`
			`const MAX_SIZE: usize = 20 * 1024 * 1024; // 20 MB`
			`if data.len() > MAX_SIZE {`
			`anyhow::bail!(`
			`"Image too large: {} bytes (max {} MB)",`
			`data.len(),`
			`MAX_SIZE / (1024 * 1024)`
			`);`
			`}`

			`let mime = mime_from_extension(path);`
			`let b64 = base64::engine::general_purpose::STANDARD.encode(&data);`
			`let data_uri = format!("data:{};base64,{}", mime, b64);`

			`Ok(ToolOutput {`
			`text: format!(`
			`"Image loaded: {} ({}, {} bytes)",`
			`file_path,`
			`mime,`
			`data.len()`
			`),`
			`is_yield: false,`
			`images: vec![data_uri],`
			`model_switch: None,`
			`dmn_pause: false,`
			`})`
			`}`

refactor: typed args for grep, bash, and vision tools Convert remaining tools from manual args["key"].as_str() parsing to serde Deserialize structs. Also removes the now-unused get_str() helper from grep.rs and simplifies capture_tmux_pane() signature (takes lines directly instead of re-parsing args). All 7 tool modules now use the same typed args pattern. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-03-21 16:31:34 -04:00			`/// Capture a tmux pane's text content.`
			`fn capture_tmux_pane(pane_id: &str, lines: usize) -> Result<ToolOutput> {`
Move poc-agent into workspace, improve agent prompts Move poc-agent (substrate-independent AI agent framework) into the memory workspace as a step toward using its API client for direct LLM calls instead of shelling out to claude CLI. Agent prompt improvements: - distill: rewrite from hub-focused to knowledge-flow-focused. Now walks upward from seed nodes to find and refine topic nodes, instead of only maintaining high-degree hubs. - distill: remove "don't touch journal entries" restriction - memory-instructions-core: add "Make it alive" section — write with creativity and emotional texture, not spreadsheet summaries - memory-instructions-core: add "Show your reasoning" section — agents must explain decisions, especially when they do nothing - linker: already had emotional texture guidance (kept as-is) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com> 2026-03-18 22:44:52 -04:00
			`// Use tmux capture-pane to get text content, then render to image`
			`// via a simple approach: capture text and return it (the model can`
			`// read text directly, which is often more useful than a screenshot).`
			`//`
			`// For actual pixel-level screenshots we'd need a terminal renderer,`
			`// but text capture covers 95% of use cases.`
			`let output = std::process::Command::new("tmux")`
			`.args(["capture-pane", "-t", pane_id, "-p", "-S", &format!("-{}", lines)])`
			`.output()`
			`.context("Failed to run tmux capture-pane")?;`

			`if !output.status.success() {`
			`let stderr = String::from_utf8_lossy(&output.stderr);`
			`anyhow::bail!("tmux capture-pane failed: {}", stderr.trim());`
			`}`

			`let text = String::from_utf8_lossy(&output.stdout).to_string();`

			`// Return as text — the model can read terminal output directly.`
			`// This is actually more useful than a screenshot for most tasks.`
			`Ok(ToolOutput {`
			`text: format!(`
			"Tmux pane {} (last {} lines):\n```\n{}\n```",
			`pane_id, lines, text.trim_end()`
			`),`
			`is_yield: false,`
			`images: Vec::new(),`
			`model_switch: None,`
			`dmn_pause: false,`
			`})`
			`}`

			`fn mime_from_extension(path: &std::path::Path) -> &'static str {`
			`match path`
			`.extension()`
			`.and_then(\|e\| e.to_str())`
			`.map(\|e\| e.to_lowercase())`
			`.as_deref()`
			`{`
			`Some("png") => "image/png",`
			`Some("jpg" \| "jpeg") => "image/jpeg",`
			`Some("gif") => "image/gif",`
			`Some("webp") => "image/webp",`
			`Some("svg") => "image/svg+xml",`
			`Some("bmp") => "image/bmp",`
			`_ => "image/png", // default assumption`
			`}`
			`}`