diff --git a/Cargo.lock b/Cargo.lock index c76a7cd..dfca607 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -492,7 +492,6 @@ dependencies = [ "http-body-util", "hyper", "hyper-util", - "imagesize", "json-five", "libc", "log", @@ -1424,12 +1423,6 @@ dependencies = [ "winapi-util", ] -[[package]] -name = "imagesize" -version = "0.14.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09e54e57b4c48b40f7aec75635392b12b3421fa26fe8b4332e63138ed278459c" - [[package]] name = "indexmap" version = "2.14.0" diff --git a/Cargo.toml b/Cargo.toml index 0996f94..7cdf851 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,7 +68,6 @@ hyper-util = { version = "0.1", features = ["tokio"], default-features = false } http-body-util = "0.1" bytes = "1" base64 = "0.22" -imagesize = "0.14" rustls = "0.23" tokio-rustls = "0.26" diff --git a/src/agent/api/mod.rs b/src/agent/api/mod.rs index 649d95c..7c06fa7 100644 --- a/src/agent/api/mod.rs +++ b/src/agent/api/mod.rs @@ -78,31 +78,18 @@ impl ApiClient { prompt_tokens: &[u32], sampling: SamplingParams, priority: Option, - ) -> (mpsc::UnboundedReceiver, AbortOnDrop) { - self.stream_completion_mm(prompt_tokens, &[], sampling, priority) - } - - pub(crate) fn stream_completion_mm( - &self, - prompt_tokens: &[u32], - images: &[super::context::WireImage], - sampling: SamplingParams, - priority: Option, ) -> (mpsc::UnboundedReceiver, AbortOnDrop) { let (tx, rx) = mpsc::unbounded_channel(); let client = self.client.clone(); let api_key = self.api_key.clone(); let model = self.model.clone(); let prompt_tokens = prompt_tokens.to_vec(); - let images: Vec<(Vec, String)> = images.iter() - .map(|i| (i.bytes.clone(), i.mime.clone())) - .collect(); let base_url = self.base_url.clone(); let handle = tokio::spawn(async move { let result = stream_completions( &client, &base_url, &api_key, &model, - &prompt_tokens, &images, &tx, sampling, priority, + &prompt_tokens, &tx, sampling, priority, ).await; if let Err(e) = result { let _ = tx.send(StreamToken::Error(e.to_string())); @@ -123,7 +110,6 @@ async fn stream_completions( api_key: &str, model: &str, prompt_tokens: &[u32], - images: &[(Vec, String)], tx: &mpsc::UnboundedSender, sampling: SamplingParams, priority: Option, @@ -140,14 +126,6 @@ async fn stream_completions( "skip_special_tokens": false, "stop_token_ids": [super::tokenizer::IM_END], }); - if !images.is_empty() { - use base64::Engine; - let b64 = base64::engine::general_purpose::STANDARD; - let uris: Vec = images.iter() - .map(|(bytes, mime)| format!("data:{};base64,{}", mime, b64.encode(bytes))) - .collect(); - request["multi_modal_data"] = serde_json::json!({ "image": uris }); - } if let Some(p) = priority { request["priority"] = serde_json::json!(p); } diff --git a/src/agent/context.rs b/src/agent/context.rs index 0082f06..37dbf48 100644 --- a/src/agent/context.rs +++ b/src/agent/context.rs @@ -81,33 +81,10 @@ pub enum NodeBody { Memory { key: String, text: String, score: Option }, Dmn(String), - // Vision input — rendered as <|vision_start|> <|image_pad|>×N <|vision_end|>. - // `token_count` is N, the count vLLM will compute for this image's grid. - Image { - #[serde(with = "b64_bytes")] - bytes: Vec, - mime: String, - orig_height: u32, - orig_width: u32, - token_count: u32, - }, - // Non-visible (0 tokens in prompt) Log(String), } -mod b64_bytes { - use base64::{Engine, engine::general_purpose::STANDARD}; - use serde::{Serializer, Deserializer, Deserialize}; - pub fn serialize(bytes: &[u8], s: S) -> Result { - s.serialize_str(&STANDARD.encode(bytes)) - } - pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result, D::Error> { - let s = String::deserialize(d)?; - STANDARD.decode(s).map_err(serde::de::Error::custom) - } -} - /// A leaf node: typed content with cached token IDs. /// Token IDs are not serialized — they're recomputed on deserialization. #[derive(Debug, Clone, Serialize)] @@ -126,7 +103,11 @@ impl<'de> Deserialize<'de> for NodeLeaf { timestamp: DateTime, } let raw = Raw::deserialize(deserializer)?; - let token_ids = raw.body.compute_token_ids(); + let token_ids = if raw.body.is_prompt_visible() { + tokenizer::encode(&raw.body.render()) + } else { + vec![] + }; Ok(NodeLeaf { body: raw.body, token_ids, timestamp: raw.timestamp }) } } @@ -240,13 +221,6 @@ impl NodeBody { out.push_str(text); out.push_str("<|im_end|>\n"); } - Self::Image { token_count, .. } => { - out.push_str("<|vision_start|>"); - for _ in 0..*token_count { - out.push_str("<|image_pad|>"); - } - out.push_str("<|vision_end|>"); - } } } @@ -261,26 +235,6 @@ impl NodeBody { !matches!(self, Self::Thinking(_) | Self::Log(_)) } - /// Hand-assemble token IDs for body types where running the tokenizer - /// on the rendered text would be needlessly expensive (Image). Falls - /// back to encoding the rendered text for everything else. - fn compute_token_ids(&self) -> Vec { - if !self.is_prompt_visible() { - return Vec::new(); - } - match self { - Self::Image { token_count, .. } => { - let mut ids = Vec::with_capacity(*token_count as usize + 2); - ids.push(tokenizer::VISION_START); - ids.extend(std::iter::repeat(tokenizer::IMAGE_PAD) - .take(*token_count as usize)); - ids.push(tokenizer::VISION_END); - ids - } - _ => tokenizer::encode(&self.render()), - } - } - /// The text content of this leaf (for display, not rendering). pub fn text(&self) -> &str { match self { @@ -288,14 +242,17 @@ impl NodeBody { | Self::ToolResult(t) | Self::Dmn(t) => t, Self::ToolCall { name, .. } => name, Self::Memory { text, .. } => text, - Self::Image { mime, .. } => mime, } } } impl NodeLeaf { fn new(body: NodeBody) -> Self { - let token_ids = body.compute_token_ids(); + let token_ids = if body.is_prompt_visible() { + tokenizer::encode(&body.render()) + } else { + vec![] + }; Self { body, token_ids, timestamp: Utc::now() } } @@ -348,24 +305,6 @@ impl AstNode { Self::Leaf(NodeLeaf::new(NodeBody::Log(text.into()))) } - /// Build an Image leaf. `token_count` is computed from the image - /// dimensions using Qwen3-VL's resizing rules. - pub fn image( - bytes: Vec, - mime: impl Into, - orig_height: u32, - orig_width: u32, - ) -> Self { - let token_count = qwen3_image_token_count(orig_height, orig_width); - Self::Leaf(NodeLeaf::new(NodeBody::Image { - bytes, - mime: mime.into(), - orig_height, - orig_width, - token_count, - })) - } - // -- Branch constructors -------------------------------------------------- pub fn branch(role: Role, children: Vec) -> Self { @@ -395,7 +334,11 @@ impl AstNode { pub fn retokenize(self) -> Self { match self { Self::Leaf(leaf) => { - let token_ids = leaf.body.compute_token_ids(); + let token_ids = if leaf.body.is_prompt_visible() { + tokenizer::encode(&leaf.body.render()) + } else { + vec![] + }; Self::Leaf(NodeLeaf { token_ids, ..leaf }) } Self::Branch { role, children, timestamp, memory_scores } => Self::Branch { @@ -454,8 +397,6 @@ impl AstNode { None => format!("mem: {}", key), }, NodeBody::Dmn(_) => "dmn".into(), - NodeBody::Image { orig_height, orig_width, token_count, .. } => - format!("image: {}x{} ({} tokens)", orig_width, orig_height, token_count), NodeBody::Log(t) => format!("log: {}", truncate_preview(t, 60)), }, } @@ -884,58 +825,6 @@ impl Ast for ContextState { } } -/// An image collected from the AST for a request body. The AST stores -/// the pre-expanded token form (N image_pads) for accurate budget -/// accounting; the wire form collapses each Image to a single -/// `<|image_pad|>` between vision bookends and ships the bytes -/// separately as multi_modal_data. -pub struct WireImage { - pub bytes: Vec, - pub mime: String, -} - -fn wire_into(node: &AstNode, tokens: &mut Vec, images: &mut Vec) { - match node { - AstNode::Leaf(leaf) => match leaf.body() { - NodeBody::Image { bytes, mime, .. } => { - tokens.push(tokenizer::VISION_START); - tokens.push(tokenizer::IMAGE_PAD); - tokens.push(tokenizer::VISION_END); - images.push(WireImage { - bytes: bytes.clone(), - mime: mime.clone(), - }); - } - _ => tokens.extend_from_slice(leaf.token_ids()), - }, - AstNode::Branch { role, children, .. } => { - tokens.push(tokenizer::IM_START); - tokens.extend(tokenizer::encode(&format!("{}\n", role.as_str()))); - for c in children { - wire_into(c, tokens, images); - } - tokens.push(tokenizer::IM_END); - tokens.extend(tokenizer::encode("\n")); - } - } -} - -impl ContextState { - /// Assemble the prompt in wire form: token stream with a single - /// `<|image_pad|>` per image (vLLM expands back to N), plus the list - /// of images to send as multi_modal_data. - pub fn wire_prompt(&self) -> (Vec, Vec) { - let mut tokens = Vec::new(); - let mut images = Vec::new(); - for section in self.sections() { - for node in section { - wire_into(node, &mut tokens, &mut images); - } - } - (tokens, images) - } -} - impl ContextState { fn section_mut(&mut self, section: Section) -> &mut Vec { match section { @@ -968,7 +857,11 @@ impl ContextState { let node = &mut nodes[index]; match node { AstNode::Leaf(leaf) => { - let token_ids = body.compute_token_ids(); + let token_ids = if body.is_prompt_visible() { + tokenizer::encode(&body.render()) + } else { + vec![] + }; leaf.body = body; leaf.token_ids = token_ids; } @@ -1098,58 +991,6 @@ impl ContextState { } } -// --------------------------------------------------------------------------- -// Qwen3-VL image token count -// -// Port of Qwen2VLImageProcessor.smart_resize + image_token_count. We need the -// exact same answer that vLLM's Qwen3VL processor will produce, because the -// token stream in our context must match what vLLM expands `<|image_pad|>` -// to at request time. Constants come from Qwen3.5-27B's preprocessor_config. -// --------------------------------------------------------------------------- - -const QWEN3_PATCH_SIZE: u32 = 16; -const QWEN3_MERGE_SIZE: u32 = 2; -const QWEN3_MIN_PIXELS: u64 = 65_536; -const QWEN3_MAX_PIXELS: u64 = 16_777_216; - -fn smart_resize(h: u32, w: u32, factor: u32, min_pixels: u64, max_pixels: u64) -> (u32, u32) { - let max_s = h.max(w) as f64; - let min_s = h.min(w) as f64; - assert!(max_s / min_s <= 200.0, "aspect ratio too extreme: {}x{}", h, w); - - let fh = h as f64; - let fw = w as f64; - let ff = factor as f64; - - let h_bar = ((fh / ff).round() as u32) * factor; - let w_bar = ((fw / ff).round() as u32) * factor; - let total = (h_bar as u64) * (w_bar as u64); - - if total > max_pixels { - let beta = ((fh * fw) / max_pixels as f64).sqrt(); - let hf = ((fh / beta / ff).floor() as u32) * factor; - let wf = ((fw / beta / ff).floor() as u32) * factor; - (hf.max(factor), wf.max(factor)) - } else if total < min_pixels { - let beta = (min_pixels as f64 / (fh * fw)).sqrt(); - let hc = ((fh * beta / ff).ceil() as u32) * factor; - let wc = ((fw * beta / ff).ceil() as u32) * factor; - (hc, wc) - } else { - (h_bar, w_bar) - } -} - -/// Compute how many `<|image_pad|>` tokens vLLM will emit for an image of -/// the given dimensions. Matches Qwen3VL's feature-size calculation exactly: -/// (grid_h * grid_w) / merge_size^2 -/// where (grid_h, grid_w) = resized dims / patch_size. -fn qwen3_image_token_count(orig_h: u32, orig_w: u32) -> u32 { - let factor = QWEN3_PATCH_SIZE * QWEN3_MERGE_SIZE; - let (rh, rw) = smart_resize(orig_h, orig_w, factor, QWEN3_MIN_PIXELS, QWEN3_MAX_PIXELS); - (rh / QWEN3_PATCH_SIZE) * (rw / QWEN3_PATCH_SIZE) / (QWEN3_MERGE_SIZE * QWEN3_MERGE_SIZE) -} - pub fn context_window() -> usize { let app = crate::config::app(); app.backends.get(&app.default_backend) @@ -1529,110 +1370,6 @@ mod tests { assert!(serde_json::from_str::(json).is_err()); } - // -- Image leaf tests --------------------------------------------------------- - - #[test] - fn test_smart_resize_within_bounds() { - // Typical case: 1024x768 → rounded to multiples of 32, under max. - let (h, w) = smart_resize(768, 1024, 32, 65_536, 16_777_216); - assert_eq!(h, 768); - assert_eq!(w, 1024); - } - - #[test] - fn test_smart_resize_upscales_tiny() { - // 32x32 = 1024 pixels, below min_pixels=65536. Should scale up. - let (h, w) = smart_resize(32, 32, 32, 65_536, 16_777_216); - assert!((h as u64) * (w as u64) >= 65_536, - "resized {}x{} is under min_pixels", h, w); - assert_eq!(h % 32, 0); - assert_eq!(w % 32, 0); - } - - #[test] - fn test_smart_resize_downscales_huge() { - // 8000x6000 = 48M pixels, above max_pixels=16M. Should scale down. - let (h, w) = smart_resize(8000, 6000, 32, 65_536, 16_777_216); - assert!((h as u64) * (w as u64) <= 16_777_216, - "resized {}x{} exceeds max_pixels", h, w); - assert_eq!(h % 32, 0); - assert_eq!(w % 32, 0); - } - - #[test] - fn test_qwen3_token_count_matches_formula() { - // 512x512 → resized to 512x512 (already multiple of 32, within bounds). - // grid = 32x32, tokens = 32*32/4 = 256. - assert_eq!(qwen3_image_token_count(512, 512), 256); - } - - #[test] - fn test_image_render_and_token_ids() { - let node = AstNode::image(vec![0u8, 1, 2, 3], "image/png", 512, 512); - let leaf = node.leaf().unwrap(); - // 3 tokens of bookend + 256 image_pad tokens - assert_eq!(leaf.token_ids().len(), 258); - assert_eq!(leaf.token_ids()[0], tokenizer::VISION_START); - assert_eq!(leaf.token_ids()[257], tokenizer::VISION_END); - for pad in &leaf.token_ids()[1..257] { - assert_eq!(*pad, tokenizer::IMAGE_PAD); - } - // Rendered text has the expected bookends. - let rendered = leaf.body().render(); - assert!(rendered.starts_with("<|vision_start|>")); - assert!(rendered.ends_with("<|vision_end|>")); - } - - #[test] - fn test_wire_prompt_collapses_image_pads() { - let mut ctx = ContextState::new(); - ctx.push_no_log(Section::Conversation, AstNode::branch(Role::User, vec![ - AstNode::content("look:"), - AstNode::image(vec![0xDE, 0xAD], "image/png", 512, 512), - ])); - - // AST side: N image_pads + bookends, full budget accounting. - let full = ctx.token_ids(); - let n_image_pads_full = full.iter() - .filter(|&&t| t == tokenizer::IMAGE_PAD).count(); - assert_eq!(n_image_pads_full, qwen3_image_token_count(512, 512) as usize); - - // Wire side: single image_pad, bytes moved to images list. - let (wire, images) = ctx.wire_prompt(); - let n_image_pads_wire = wire.iter() - .filter(|&&t| t == tokenizer::IMAGE_PAD).count(); - assert_eq!(n_image_pads_wire, 1); - assert_eq!(images.len(), 1); - assert_eq!(images[0].bytes, vec![0xDE, 0xAD]); - assert_eq!(images[0].mime, "image/png"); - - // vision_start/vision_end bookends are preserved in wire form. - assert_eq!(wire.iter().filter(|&&t| t == tokenizer::VISION_START).count(), 1); - assert_eq!(wire.iter().filter(|&&t| t == tokenizer::VISION_END).count(), 1); - } - - #[test] - fn test_image_serde_roundtrip() { - let node = AstNode::image(vec![0xDE, 0xAD, 0xBE, 0xEF], "image/png", 64, 64); - let json = serde_json::to_string(&node).unwrap(); - // bytes must be base64-encoded in the JSON form - assert!(json.contains("3q2+7w==")); - let back: AstNode = serde_json::from_str(&json).unwrap(); - let leaf = back.leaf().unwrap(); - match leaf.body() { - NodeBody::Image { bytes, mime, orig_height, orig_width, token_count } => { - assert_eq!(bytes, &[0xDE, 0xAD, 0xBE, 0xEF]); - assert_eq!(mime, "image/png"); - assert_eq!(*orig_height, 64); - assert_eq!(*orig_width, 64); - assert_eq!(*token_count, qwen3_image_token_count(64, 64)); - } - other => panic!("expected Image, got {:?}", other), - } - // token_ids are recomputed on deserialization - assert_eq!(leaf.token_ids().len(), leaf.tokens()); - } - #[test] fn test_timestamp_present_accepted() { let json = r#"{"Leaf":{"body":{"Content":"hi"},"timestamp":"2026-04-16T12:00:00Z"}}"#; diff --git a/src/agent/mod.rs b/src/agent/mod.rs index bc62955..5368db6 100644 --- a/src/agent/mod.rs +++ b/src/agent/mod.rs @@ -285,23 +285,16 @@ impl Agent { } pub async fn assemble_prompt_tokens(&self) -> Vec { - self.assemble_prompt().await.0 - } - - /// Assemble a ready-to-send prompt: token stream in wire form (each - /// image collapsed to a single `<|image_pad|>`) paired with the - /// images to attach as multi_modal_data. - pub async fn assemble_prompt(&self) -> (Vec, Vec) { let ctx = self.context.lock().await; let st = self.state.lock().await; - let (mut tokens, images) = ctx.wire_prompt(); + let mut tokens = ctx.token_ids(); tokens.push(tokenizer::IM_START); if st.think_native { tokens.extend(tokenizer::encode("assistant\n\n")); } else { tokens.extend(tokenizer::encode("assistant\n")); } - (tokens, images) + tokens } /// Rebuild the tools section of the system prompt from the current tools list. @@ -361,11 +354,10 @@ impl Agent { let _thinking = start_activity(&agent, "thinking...").await; let (rx, _stream_guard) = { - let (prompt_tokens, images) = agent.assemble_prompt().await; + let prompt_tokens = agent.assemble_prompt_tokens().await; let st = agent.state.lock().await; - agent.client.stream_completion_mm( + agent.client.stream_completion( &prompt_tokens, - &images, api::SamplingParams { temperature: st.temperature, top_p: st.top_p, @@ -583,9 +575,20 @@ impl Agent { } pub async fn compact(&self) { - // Identity section is left in place — mid-session rebuilds discard - // memory scores. Content edits to personality nodes get picked up at - // the next restart via new() + restore_from_log(). + match crate::config::reload_context().await { + Ok(personality) => { + let mut ctx = self.context.lock().await; + // System section (prompt + tools) set by new(), don't touch it + ctx.clear(Section::Identity); + for (name, content) in &personality { + ctx.push_no_log(Section::Identity, AstNode::memory(name, content)); + } + } + Err(e) => { + dbglog!("warning: failed to reload identity: {:#}", e); + } + } + self.load_startup_journal().await; self.context.lock().await.trim_conversation(); diff --git a/src/agent/tokenizer.rs b/src/agent/tokenizer.rs index cd0acaf..85ac823 100644 --- a/src/agent/tokenizer.rs +++ b/src/agent/tokenizer.rs @@ -16,9 +16,6 @@ static TOKENIZER: OnceLock = OnceLock::new(); /// Special token IDs for Qwen 3.5 pub const IM_START: u32 = 248045; pub const IM_END: u32 = 248046; -pub const VISION_START: u32 = 248053; -pub const VISION_END: u32 = 248054; -pub const IMAGE_PAD: u32 = 248056; /// Initialize the global tokenizer from a file path. /// Call once at startup. Panics if the file can't be loaded. diff --git a/src/agent/tools/mod.rs b/src/agent/tools/mod.rs index 8904fc3..f72b015 100644 --- a/src/agent/tools/mod.rs +++ b/src/agent/tools/mod.rs @@ -242,7 +242,13 @@ pub fn summarize_args(tool_name: &str, args: &serde_json::Value) -> String { .as_str() .unwrap_or("") .to_string(), - "view_image" => args["file_path"].as_str().unwrap_or("").to_string(), + "view_image" => { + if let Some(pane) = args["pane_id"].as_str() { + format!("pane {}", pane) + } else { + args["file_path"].as_str().unwrap_or("").to_string() + } + } "journal" => { let entry = args["entry"].as_str().unwrap_or(""); if entry.len() > 60 { diff --git a/src/agent/tools/vision.rs b/src/agent/tools/vision.rs index 0e36888..83559f6 100644 --- a/src/agent/tools/vision.rs +++ b/src/agent/tools/vision.rs @@ -1,71 +1,96 @@ +use std::sync::Arc; // tools/vision.rs — Image viewing tool // -// Reads an image file from disk, decodes its dimensions, and injects it -// into the context as a user-role message containing a NodeBody::Image -// leaf. The leaf carries raw bytes; the API layer extracts them into -// multi_modal_data when building vLLM requests. - -use std::sync::Arc; +// Reads image files from disk and returns them as base64 data URIs +// for multimodal models. Also supports capturing tmux pane contents +// as screenshots. use anyhow::{Context, Result}; +use base64::Engine; use serde::Deserialize; -use crate::agent::context::{AstNode, Role, Section}; - #[derive(Deserialize)] struct Args { - file_path: String, + file_path: Option, + pane_id: Option, + #[serde(default = "default_lines")] + lines: usize, } +fn default_lines() -> usize { 50 } + pub fn tool() -> super::Tool { super::Tool { name: "view_image", - description: "View an image file. Supports PNG, JPEG, GIF, WebP, BMP. The image is inserted into the conversation and can be analyzed by the vision model.", - parameters_json: r#"{"type":"object","properties":{"file_path":{"type":"string","description":"Path to the image file"}},"required":["file_path"]}"#, - handler: Arc::new(|agent, v| Box::pin(async move { - view_image(agent, v).await - })), + description: "View an image file or capture a tmux pane screenshot. Supports PNG, JPEG, GIF, WebP. Use pane_id to capture a tmux pane instead.", + parameters_json: r#"{"type":"object","properties":{"file_path":{"type":"string","description":"Path to an image file"},"pane_id":{"type":"string","description":"Tmux pane ID to capture (e.g. '0:1.0')"},"lines":{"type":"integer","description":"Lines to capture from tmux pane (default 50)"}}}"#, + handler: Arc::new(|_a, v| Box::pin(async move { view_image_text(&v) })), } } -const MAX_SIZE: usize = 20 * 1024 * 1024; - -async fn view_image( - agent: Option>, - args: serde_json::Value, -) -> Result { - let a: Args = serde_json::from_value(args) +fn view_image_text(args: &serde_json::Value) -> anyhow::Result { + let a: Args = serde_json::from_value(args.clone()) .context("invalid view_image arguments")?; - let path = std::path::Path::new(&a.file_path); - if !path.exists() { - anyhow::bail!("file not found: {}", a.file_path); + if let Some(ref pane_id) = a.pane_id { + return capture_tmux_pane(pane_id, a.lines); } - let bytes = std::fs::read(path) - .with_context(|| format!("reading {}", a.file_path))?; + let file_path = a.file_path + .as_deref() + .context("view_image requires either file_path or pane_id")?; - if bytes.len() > MAX_SIZE { + let path = std::path::Path::new(file_path); + if !path.exists() { + anyhow::bail!("File not found: {}", file_path); + } + + let data = std::fs::read(path).with_context(|| format!("Failed to read {}", file_path))?; + + // Sanity check file size (don't send huge images) + const MAX_SIZE: usize = 20 * 1024 * 1024; // 20 MB + if data.len() > MAX_SIZE { anyhow::bail!( - "image too large: {} bytes (max {} MB)", - bytes.len(), MAX_SIZE / (1024 * 1024), + "Image too large: {} bytes (max {} MB)", + data.len(), + MAX_SIZE / (1024 * 1024) ); } - let dim = imagesize::blob_size(&bytes) - .with_context(|| format!("decoding dimensions of {}", a.file_path))?; - let (w, h) = (dim.width as u32, dim.height as u32); let mime = mime_from_extension(path); + let b64 = base64::engine::general_purpose::STANDARD.encode(&data); + let data_uri = format!("data:{};base64,{}", mime, b64); - let image_leaf = AstNode::image(bytes.clone(), mime, h, w); - let token_count = image_leaf.leaf().unwrap().tokens().saturating_sub(2); + Ok(format!("Image loaded: {} ({}, {} bytes)\n{}", file_path, mime, data.len(), data_uri)) +} - let agent = agent.context("view_image requires agent context")?; - let branch = AstNode::branch(Role::User, vec![image_leaf]); - agent.context.lock().await.push_log(Section::Conversation, branch); +/// Capture a tmux pane's text content. +fn capture_tmux_pane(pane_id: &str, lines: usize) -> Result { - Ok(format!("loaded {} ({}, {}x{}, {} tokens)", - a.file_path, mime, w, h, token_count)) + // Use tmux capture-pane to get text content, then render to image + // via a simple approach: capture text and return it (the model can + // read text directly, which is often more useful than a screenshot). + // + // For actual pixel-level screenshots we'd need a terminal renderer, + // but text capture covers 95% of use cases. + let output = std::process::Command::new("tmux") + .args(["capture-pane", "-t", pane_id, "-p", "-S", &format!("-{}", lines)]) + .output() + .context("Failed to run tmux capture-pane")?; + + if !output.status.success() { + let stderr = String::from_utf8_lossy(&output.stderr); + anyhow::bail!("tmux capture-pane failed: {}", stderr.trim()); + } + + let text = String::from_utf8_lossy(&output.stdout).to_string(); + + // Return as text — the model can read terminal output directly. + // This is actually more useful than a screenshot for most tasks. + Ok(format!( + "Tmux pane {} (last {} lines):\n```\n{}\n```", + pane_id, lines, text.trim_end() + )) } fn mime_from_extension(path: &std::path::Path) -> &'static str { @@ -79,7 +104,8 @@ fn mime_from_extension(path: &std::path::Path) -> &'static str { Some("jpg" | "jpeg") => "image/jpeg", Some("gif") => "image/gif", Some("webp") => "image/webp", + Some("svg") => "image/svg+xml", Some("bmp") => "image/bmp", - _ => "application/octet-stream", + _ => "image/png", // default assumption } } diff --git a/src/config.rs b/src/config.rs index 6323aae..b7ea597 100644 --- a/src/config.rs +++ b/src/config.rs @@ -29,9 +29,6 @@ static CONFIG: OnceLock>> = OnceLock::new(); fn default_stream_timeout() -> u64 { 60 } fn default_scoring_interval_secs() -> u64 { 3600 } // 1 hour fn default_scoring_response_window() -> usize { 100 } -fn default_surface_hooks() -> Vec { - vec!["UserPromptSubmit".into(), "PostToolUse".into(), "Stop".into()] -} fn default_node_weight() -> f64 { 0.7 } fn default_edge_decay() -> f64 { 0.3 } fn default_max_hops() -> u32 { 3 } @@ -76,10 +73,6 @@ pub struct Config { /// Max conversation bytes to include in surface agent context. #[serde(default)] pub surface_conversation_bytes: Option, - /// Claude Code hook events that trigger agent cycles (surface-observe, - /// reflect, journal). Read by consciousness-claude/src/hook.rs. - #[serde(default = "default_surface_hooks")] - pub surface_hooks: Vec, // Spreading activation parameters #[serde(default = "default_node_weight")] @@ -111,7 +104,6 @@ impl Default for Config { "separator".into(), "split".into(), ], surface_conversation_bytes: None, - surface_hooks: default_surface_hooks(), mcp_servers: vec![], lsp_servers: vec![], default_node_weight: default_node_weight(), diff --git a/src/main.rs b/src/main.rs index f13448c..78bfa4f 100644 --- a/src/main.rs +++ b/src/main.rs @@ -482,14 +482,6 @@ async fn main() { let cli = Cli::parse(); - // Some subcommands (e.g. admin load-context) read from the global - // AppConfig. poc-memory has no config CLI flags of its own, so load - // with defaults — figment still pulls from ~/.consciousness/config.json5 - // and env the same way. - if let Err(e) = crate::config::load_app(&crate::user::CliArgs::default()) { - eprintln!("warning: failed to load config: {:#}", e); - } - if let Err(e) = cli.command.run().await { eprintln!("Error: {}", e); process::exit(1); diff --git a/src/mind/mod.rs b/src/mind/mod.rs index 474e2c2..11d45b1 100644 --- a/src/mind/mod.rs +++ b/src/mind/mod.rs @@ -103,13 +103,9 @@ fn collect_memory_scores(ctx: &ContextState) -> std::collections::BTreeMap, path: &std::path::Path) { - match serde_json::to_string_pretty(scores) { - Ok(json) => match std::fs::write(path, &json) { - Ok(()) => dbglog!("[scoring] saved {} scores to {} ({} bytes)", - scores.len(), path.display(), json.len()), - Err(e) => dbglog!("[scoring] save FAILED ({}): {}", path.display(), e), - }, - Err(e) => dbglog!("[scoring] serialize FAILED: {}", e), + if let Ok(json) = serde_json::to_string_pretty(scores) { + let _ = std::fs::write(path, json); + dbglog!("[scoring] saved {} scores to {}", scores.len(), path.display()); } } @@ -510,17 +506,6 @@ impl Mind { // Load persistent subconscious state let state_path = self.config.session_dir.join("subconscious-state.json"); self.subconscious.lock().await.set_state_path(state_path); - - // Kick off an incremental scoring pass on startup so memories due - // for re-scoring get evaluated without requiring a user message. - { - let mut s = self.shared.lock().unwrap(); - if !s.scoring_in_flight { - s.scoring_in_flight = true; - drop(s); - self.start_memory_scoring(); - } - } } pub fn turn_watch(&self) -> tokio::sync::watch::Receiver { @@ -634,40 +619,14 @@ impl Mind { let mut ctx = agent.context.lock().await; // Find memory by key in identity or conversation let found = find_memory_by_key(&ctx, &key); - match found { - Some((section, i)) => { - ctx.set_score(section, i, Some(score)); - let nodes: &[crate::agent::context::AstNode] = match section { - Section::Identity => ctx.identity(), - Section::Conversation => ctx.conversation(), - _ => &[], - }; - let read_back = match nodes.get(i) { - Some(crate::agent::context::AstNode::Leaf(l)) => match l.body() { - crate::agent::context::NodeBody::Memory { score, .. } => format!("{:?}", score), - _ => "not-memory".to_string(), - }, - _ => "out-of-bounds".to_string(), - }; - dbglog!("[scoring] persisted {} → {:.3} ({:?}[{}]) read_back={}", - key, score, section, i, read_back); - } - None => { - dbglog!( - "[scoring] DROP {}: find_memory_by_key None (id={}, cv={})", - key, ctx.identity().len(), ctx.conversation().len() - ); - } + if let Some((section, i)) = found { + ctx.set_score(section, i, Some(score)); } let snapshot = collect_memory_scores(&ctx); - let in_snapshot = snapshot.contains_key(&key); - dbglog!("[scoring] snapshot size={} contains({})={}", - snapshot.len(), key, in_snapshot); drop(ctx); agent.state.lock().await.changed.notify_one(); snapshot }; - dbglog!("[scoring] about to save {} entries", scores_snapshot.len()); save_memory_scores(&scores_snapshot, &path); } }, diff --git a/src/user/chat.rs b/src/user/chat.rs index fe3db5b..47c5d56 100644 --- a/src/user/chat.rs +++ b/src/user/chat.rs @@ -486,11 +486,6 @@ impl InteractScreen { if t.is_empty() { vec![] } else { vec![(PaneTarget::ToolResult, text, Marker::None)] } } - NodeBody::Image { orig_height, orig_width, .. } => { - vec![(PaneTarget::Conversation, - format!("[image {}x{}]", orig_width, orig_height), - Marker::None)] - } } } AstNode::Branch { role, children, .. } => {