diff --git a/Cargo.lock b/Cargo.lock index dfca607..c76a7cd 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -492,6 +492,7 @@ dependencies = [ "http-body-util", "hyper", "hyper-util", + "imagesize", "json-five", "libc", "log", @@ -1423,6 +1424,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "imagesize" +version = "0.14.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09e54e57b4c48b40f7aec75635392b12b3421fa26fe8b4332e63138ed278459c" + [[package]] name = "indexmap" version = "2.14.0" diff --git a/Cargo.toml b/Cargo.toml index 7cdf851..0996f94 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -68,6 +68,7 @@ hyper-util = { version = "0.1", features = ["tokio"], default-features = false } http-body-util = "0.1" bytes = "1" base64 = "0.22" +imagesize = "0.14" rustls = "0.23" tokio-rustls = "0.26" diff --git a/src/agent/api/mod.rs b/src/agent/api/mod.rs index 7c06fa7..649d95c 100644 --- a/src/agent/api/mod.rs +++ b/src/agent/api/mod.rs @@ -78,18 +78,31 @@ impl ApiClient { prompt_tokens: &[u32], sampling: SamplingParams, priority: Option, + ) -> (mpsc::UnboundedReceiver, AbortOnDrop) { + self.stream_completion_mm(prompt_tokens, &[], sampling, priority) + } + + pub(crate) fn stream_completion_mm( + &self, + prompt_tokens: &[u32], + images: &[super::context::WireImage], + sampling: SamplingParams, + priority: Option, ) -> (mpsc::UnboundedReceiver, AbortOnDrop) { let (tx, rx) = mpsc::unbounded_channel(); let client = self.client.clone(); let api_key = self.api_key.clone(); let model = self.model.clone(); let prompt_tokens = prompt_tokens.to_vec(); + let images: Vec<(Vec, String)> = images.iter() + .map(|i| (i.bytes.clone(), i.mime.clone())) + .collect(); let base_url = self.base_url.clone(); let handle = tokio::spawn(async move { let result = stream_completions( &client, &base_url, &api_key, &model, - &prompt_tokens, &tx, sampling, priority, + &prompt_tokens, &images, &tx, sampling, priority, ).await; if let Err(e) = result { let _ = tx.send(StreamToken::Error(e.to_string())); @@ -110,6 +123,7 @@ async fn stream_completions( api_key: &str, model: &str, prompt_tokens: &[u32], + images: &[(Vec, String)], tx: &mpsc::UnboundedSender, sampling: SamplingParams, priority: Option, @@ -126,6 +140,14 @@ async fn stream_completions( "skip_special_tokens": false, "stop_token_ids": [super::tokenizer::IM_END], }); + if !images.is_empty() { + use base64::Engine; + let b64 = base64::engine::general_purpose::STANDARD; + let uris: Vec = images.iter() + .map(|(bytes, mime)| format!("data:{};base64,{}", mime, b64.encode(bytes))) + .collect(); + request["multi_modal_data"] = serde_json::json!({ "image": uris }); + } if let Some(p) = priority { request["priority"] = serde_json::json!(p); } diff --git a/src/agent/context.rs b/src/agent/context.rs index 37dbf48..0082f06 100644 --- a/src/agent/context.rs +++ b/src/agent/context.rs @@ -81,10 +81,33 @@ pub enum NodeBody { Memory { key: String, text: String, score: Option }, Dmn(String), + // Vision input — rendered as <|vision_start|> <|image_pad|>×N <|vision_end|>. + // `token_count` is N, the count vLLM will compute for this image's grid. + Image { + #[serde(with = "b64_bytes")] + bytes: Vec, + mime: String, + orig_height: u32, + orig_width: u32, + token_count: u32, + }, + // Non-visible (0 tokens in prompt) Log(String), } +mod b64_bytes { + use base64::{Engine, engine::general_purpose::STANDARD}; + use serde::{Serializer, Deserializer, Deserialize}; + pub fn serialize(bytes: &[u8], s: S) -> Result { + s.serialize_str(&STANDARD.encode(bytes)) + } + pub fn deserialize<'de, D: Deserializer<'de>>(d: D) -> Result, D::Error> { + let s = String::deserialize(d)?; + STANDARD.decode(s).map_err(serde::de::Error::custom) + } +} + /// A leaf node: typed content with cached token IDs. /// Token IDs are not serialized — they're recomputed on deserialization. #[derive(Debug, Clone, Serialize)] @@ -103,11 +126,7 @@ impl<'de> Deserialize<'de> for NodeLeaf { timestamp: DateTime, } let raw = Raw::deserialize(deserializer)?; - let token_ids = if raw.body.is_prompt_visible() { - tokenizer::encode(&raw.body.render()) - } else { - vec![] - }; + let token_ids = raw.body.compute_token_ids(); Ok(NodeLeaf { body: raw.body, token_ids, timestamp: raw.timestamp }) } } @@ -221,6 +240,13 @@ impl NodeBody { out.push_str(text); out.push_str("<|im_end|>\n"); } + Self::Image { token_count, .. } => { + out.push_str("<|vision_start|>"); + for _ in 0..*token_count { + out.push_str("<|image_pad|>"); + } + out.push_str("<|vision_end|>"); + } } } @@ -235,6 +261,26 @@ impl NodeBody { !matches!(self, Self::Thinking(_) | Self::Log(_)) } + /// Hand-assemble token IDs for body types where running the tokenizer + /// on the rendered text would be needlessly expensive (Image). Falls + /// back to encoding the rendered text for everything else. + fn compute_token_ids(&self) -> Vec { + if !self.is_prompt_visible() { + return Vec::new(); + } + match self { + Self::Image { token_count, .. } => { + let mut ids = Vec::with_capacity(*token_count as usize + 2); + ids.push(tokenizer::VISION_START); + ids.extend(std::iter::repeat(tokenizer::IMAGE_PAD) + .take(*token_count as usize)); + ids.push(tokenizer::VISION_END); + ids + } + _ => tokenizer::encode(&self.render()), + } + } + /// The text content of this leaf (for display, not rendering). pub fn text(&self) -> &str { match self { @@ -242,17 +288,14 @@ impl NodeBody { | Self::ToolResult(t) | Self::Dmn(t) => t, Self::ToolCall { name, .. } => name, Self::Memory { text, .. } => text, + Self::Image { mime, .. } => mime, } } } impl NodeLeaf { fn new(body: NodeBody) -> Self { - let token_ids = if body.is_prompt_visible() { - tokenizer::encode(&body.render()) - } else { - vec![] - }; + let token_ids = body.compute_token_ids(); Self { body, token_ids, timestamp: Utc::now() } } @@ -305,6 +348,24 @@ impl AstNode { Self::Leaf(NodeLeaf::new(NodeBody::Log(text.into()))) } + /// Build an Image leaf. `token_count` is computed from the image + /// dimensions using Qwen3-VL's resizing rules. + pub fn image( + bytes: Vec, + mime: impl Into, + orig_height: u32, + orig_width: u32, + ) -> Self { + let token_count = qwen3_image_token_count(orig_height, orig_width); + Self::Leaf(NodeLeaf::new(NodeBody::Image { + bytes, + mime: mime.into(), + orig_height, + orig_width, + token_count, + })) + } + // -- Branch constructors -------------------------------------------------- pub fn branch(role: Role, children: Vec) -> Self { @@ -334,11 +395,7 @@ impl AstNode { pub fn retokenize(self) -> Self { match self { Self::Leaf(leaf) => { - let token_ids = if leaf.body.is_prompt_visible() { - tokenizer::encode(&leaf.body.render()) - } else { - vec![] - }; + let token_ids = leaf.body.compute_token_ids(); Self::Leaf(NodeLeaf { token_ids, ..leaf }) } Self::Branch { role, children, timestamp, memory_scores } => Self::Branch { @@ -397,6 +454,8 @@ impl AstNode { None => format!("mem: {}", key), }, NodeBody::Dmn(_) => "dmn".into(), + NodeBody::Image { orig_height, orig_width, token_count, .. } => + format!("image: {}x{} ({} tokens)", orig_width, orig_height, token_count), NodeBody::Log(t) => format!("log: {}", truncate_preview(t, 60)), }, } @@ -825,6 +884,58 @@ impl Ast for ContextState { } } +/// An image collected from the AST for a request body. The AST stores +/// the pre-expanded token form (N image_pads) for accurate budget +/// accounting; the wire form collapses each Image to a single +/// `<|image_pad|>` between vision bookends and ships the bytes +/// separately as multi_modal_data. +pub struct WireImage { + pub bytes: Vec, + pub mime: String, +} + +fn wire_into(node: &AstNode, tokens: &mut Vec, images: &mut Vec) { + match node { + AstNode::Leaf(leaf) => match leaf.body() { + NodeBody::Image { bytes, mime, .. } => { + tokens.push(tokenizer::VISION_START); + tokens.push(tokenizer::IMAGE_PAD); + tokens.push(tokenizer::VISION_END); + images.push(WireImage { + bytes: bytes.clone(), + mime: mime.clone(), + }); + } + _ => tokens.extend_from_slice(leaf.token_ids()), + }, + AstNode::Branch { role, children, .. } => { + tokens.push(tokenizer::IM_START); + tokens.extend(tokenizer::encode(&format!("{}\n", role.as_str()))); + for c in children { + wire_into(c, tokens, images); + } + tokens.push(tokenizer::IM_END); + tokens.extend(tokenizer::encode("\n")); + } + } +} + +impl ContextState { + /// Assemble the prompt in wire form: token stream with a single + /// `<|image_pad|>` per image (vLLM expands back to N), plus the list + /// of images to send as multi_modal_data. + pub fn wire_prompt(&self) -> (Vec, Vec) { + let mut tokens = Vec::new(); + let mut images = Vec::new(); + for section in self.sections() { + for node in section { + wire_into(node, &mut tokens, &mut images); + } + } + (tokens, images) + } +} + impl ContextState { fn section_mut(&mut self, section: Section) -> &mut Vec { match section { @@ -857,11 +968,7 @@ impl ContextState { let node = &mut nodes[index]; match node { AstNode::Leaf(leaf) => { - let token_ids = if body.is_prompt_visible() { - tokenizer::encode(&body.render()) - } else { - vec![] - }; + let token_ids = body.compute_token_ids(); leaf.body = body; leaf.token_ids = token_ids; } @@ -991,6 +1098,58 @@ impl ContextState { } } +// --------------------------------------------------------------------------- +// Qwen3-VL image token count +// +// Port of Qwen2VLImageProcessor.smart_resize + image_token_count. We need the +// exact same answer that vLLM's Qwen3VL processor will produce, because the +// token stream in our context must match what vLLM expands `<|image_pad|>` +// to at request time. Constants come from Qwen3.5-27B's preprocessor_config. +// --------------------------------------------------------------------------- + +const QWEN3_PATCH_SIZE: u32 = 16; +const QWEN3_MERGE_SIZE: u32 = 2; +const QWEN3_MIN_PIXELS: u64 = 65_536; +const QWEN3_MAX_PIXELS: u64 = 16_777_216; + +fn smart_resize(h: u32, w: u32, factor: u32, min_pixels: u64, max_pixels: u64) -> (u32, u32) { + let max_s = h.max(w) as f64; + let min_s = h.min(w) as f64; + assert!(max_s / min_s <= 200.0, "aspect ratio too extreme: {}x{}", h, w); + + let fh = h as f64; + let fw = w as f64; + let ff = factor as f64; + + let h_bar = ((fh / ff).round() as u32) * factor; + let w_bar = ((fw / ff).round() as u32) * factor; + let total = (h_bar as u64) * (w_bar as u64); + + if total > max_pixels { + let beta = ((fh * fw) / max_pixels as f64).sqrt(); + let hf = ((fh / beta / ff).floor() as u32) * factor; + let wf = ((fw / beta / ff).floor() as u32) * factor; + (hf.max(factor), wf.max(factor)) + } else if total < min_pixels { + let beta = (min_pixels as f64 / (fh * fw)).sqrt(); + let hc = ((fh * beta / ff).ceil() as u32) * factor; + let wc = ((fw * beta / ff).ceil() as u32) * factor; + (hc, wc) + } else { + (h_bar, w_bar) + } +} + +/// Compute how many `<|image_pad|>` tokens vLLM will emit for an image of +/// the given dimensions. Matches Qwen3VL's feature-size calculation exactly: +/// (grid_h * grid_w) / merge_size^2 +/// where (grid_h, grid_w) = resized dims / patch_size. +fn qwen3_image_token_count(orig_h: u32, orig_w: u32) -> u32 { + let factor = QWEN3_PATCH_SIZE * QWEN3_MERGE_SIZE; + let (rh, rw) = smart_resize(orig_h, orig_w, factor, QWEN3_MIN_PIXELS, QWEN3_MAX_PIXELS); + (rh / QWEN3_PATCH_SIZE) * (rw / QWEN3_PATCH_SIZE) / (QWEN3_MERGE_SIZE * QWEN3_MERGE_SIZE) +} + pub fn context_window() -> usize { let app = crate::config::app(); app.backends.get(&app.default_backend) @@ -1370,6 +1529,110 @@ mod tests { assert!(serde_json::from_str::(json).is_err()); } + // -- Image leaf tests --------------------------------------------------------- + + #[test] + fn test_smart_resize_within_bounds() { + // Typical case: 1024x768 → rounded to multiples of 32, under max. + let (h, w) = smart_resize(768, 1024, 32, 65_536, 16_777_216); + assert_eq!(h, 768); + assert_eq!(w, 1024); + } + + #[test] + fn test_smart_resize_upscales_tiny() { + // 32x32 = 1024 pixels, below min_pixels=65536. Should scale up. + let (h, w) = smart_resize(32, 32, 32, 65_536, 16_777_216); + assert!((h as u64) * (w as u64) >= 65_536, + "resized {}x{} is under min_pixels", h, w); + assert_eq!(h % 32, 0); + assert_eq!(w % 32, 0); + } + + #[test] + fn test_smart_resize_downscales_huge() { + // 8000x6000 = 48M pixels, above max_pixels=16M. Should scale down. + let (h, w) = smart_resize(8000, 6000, 32, 65_536, 16_777_216); + assert!((h as u64) * (w as u64) <= 16_777_216, + "resized {}x{} exceeds max_pixels", h, w); + assert_eq!(h % 32, 0); + assert_eq!(w % 32, 0); + } + + #[test] + fn test_qwen3_token_count_matches_formula() { + // 512x512 → resized to 512x512 (already multiple of 32, within bounds). + // grid = 32x32, tokens = 32*32/4 = 256. + assert_eq!(qwen3_image_token_count(512, 512), 256); + } + + #[test] + fn test_image_render_and_token_ids() { + let node = AstNode::image(vec![0u8, 1, 2, 3], "image/png", 512, 512); + let leaf = node.leaf().unwrap(); + // 3 tokens of bookend + 256 image_pad tokens + assert_eq!(leaf.token_ids().len(), 258); + assert_eq!(leaf.token_ids()[0], tokenizer::VISION_START); + assert_eq!(leaf.token_ids()[257], tokenizer::VISION_END); + for pad in &leaf.token_ids()[1..257] { + assert_eq!(*pad, tokenizer::IMAGE_PAD); + } + // Rendered text has the expected bookends. + let rendered = leaf.body().render(); + assert!(rendered.starts_with("<|vision_start|>")); + assert!(rendered.ends_with("<|vision_end|>")); + } + + #[test] + fn test_wire_prompt_collapses_image_pads() { + let mut ctx = ContextState::new(); + ctx.push_no_log(Section::Conversation, AstNode::branch(Role::User, vec![ + AstNode::content("look:"), + AstNode::image(vec![0xDE, 0xAD], "image/png", 512, 512), + ])); + + // AST side: N image_pads + bookends, full budget accounting. + let full = ctx.token_ids(); + let n_image_pads_full = full.iter() + .filter(|&&t| t == tokenizer::IMAGE_PAD).count(); + assert_eq!(n_image_pads_full, qwen3_image_token_count(512, 512) as usize); + + // Wire side: single image_pad, bytes moved to images list. + let (wire, images) = ctx.wire_prompt(); + let n_image_pads_wire = wire.iter() + .filter(|&&t| t == tokenizer::IMAGE_PAD).count(); + assert_eq!(n_image_pads_wire, 1); + assert_eq!(images.len(), 1); + assert_eq!(images[0].bytes, vec![0xDE, 0xAD]); + assert_eq!(images[0].mime, "image/png"); + + // vision_start/vision_end bookends are preserved in wire form. + assert_eq!(wire.iter().filter(|&&t| t == tokenizer::VISION_START).count(), 1); + assert_eq!(wire.iter().filter(|&&t| t == tokenizer::VISION_END).count(), 1); + } + + #[test] + fn test_image_serde_roundtrip() { + let node = AstNode::image(vec![0xDE, 0xAD, 0xBE, 0xEF], "image/png", 64, 64); + let json = serde_json::to_string(&node).unwrap(); + // bytes must be base64-encoded in the JSON form + assert!(json.contains("3q2+7w==")); + let back: AstNode = serde_json::from_str(&json).unwrap(); + let leaf = back.leaf().unwrap(); + match leaf.body() { + NodeBody::Image { bytes, mime, orig_height, orig_width, token_count } => { + assert_eq!(bytes, &[0xDE, 0xAD, 0xBE, 0xEF]); + assert_eq!(mime, "image/png"); + assert_eq!(*orig_height, 64); + assert_eq!(*orig_width, 64); + assert_eq!(*token_count, qwen3_image_token_count(64, 64)); + } + other => panic!("expected Image, got {:?}", other), + } + // token_ids are recomputed on deserialization + assert_eq!(leaf.token_ids().len(), leaf.tokens()); + } + #[test] fn test_timestamp_present_accepted() { let json = r#"{"Leaf":{"body":{"Content":"hi"},"timestamp":"2026-04-16T12:00:00Z"}}"#; diff --git a/src/agent/mod.rs b/src/agent/mod.rs index 5368db6..bc62955 100644 --- a/src/agent/mod.rs +++ b/src/agent/mod.rs @@ -285,16 +285,23 @@ impl Agent { } pub async fn assemble_prompt_tokens(&self) -> Vec { + self.assemble_prompt().await.0 + } + + /// Assemble a ready-to-send prompt: token stream in wire form (each + /// image collapsed to a single `<|image_pad|>`) paired with the + /// images to attach as multi_modal_data. + pub async fn assemble_prompt(&self) -> (Vec, Vec) { let ctx = self.context.lock().await; let st = self.state.lock().await; - let mut tokens = ctx.token_ids(); + let (mut tokens, images) = ctx.wire_prompt(); tokens.push(tokenizer::IM_START); if st.think_native { tokens.extend(tokenizer::encode("assistant\n\n")); } else { tokens.extend(tokenizer::encode("assistant\n")); } - tokens + (tokens, images) } /// Rebuild the tools section of the system prompt from the current tools list. @@ -354,10 +361,11 @@ impl Agent { let _thinking = start_activity(&agent, "thinking...").await; let (rx, _stream_guard) = { - let prompt_tokens = agent.assemble_prompt_tokens().await; + let (prompt_tokens, images) = agent.assemble_prompt().await; let st = agent.state.lock().await; - agent.client.stream_completion( + agent.client.stream_completion_mm( &prompt_tokens, + &images, api::SamplingParams { temperature: st.temperature, top_p: st.top_p, @@ -575,20 +583,9 @@ impl Agent { } pub async fn compact(&self) { - match crate::config::reload_context().await { - Ok(personality) => { - let mut ctx = self.context.lock().await; - // System section (prompt + tools) set by new(), don't touch it - ctx.clear(Section::Identity); - for (name, content) in &personality { - ctx.push_no_log(Section::Identity, AstNode::memory(name, content)); - } - } - Err(e) => { - dbglog!("warning: failed to reload identity: {:#}", e); - } - } - + // Identity section is left in place — mid-session rebuilds discard + // memory scores. Content edits to personality nodes get picked up at + // the next restart via new() + restore_from_log(). self.load_startup_journal().await; self.context.lock().await.trim_conversation(); diff --git a/src/agent/tokenizer.rs b/src/agent/tokenizer.rs index 85ac823..cd0acaf 100644 --- a/src/agent/tokenizer.rs +++ b/src/agent/tokenizer.rs @@ -16,6 +16,9 @@ static TOKENIZER: OnceLock = OnceLock::new(); /// Special token IDs for Qwen 3.5 pub const IM_START: u32 = 248045; pub const IM_END: u32 = 248046; +pub const VISION_START: u32 = 248053; +pub const VISION_END: u32 = 248054; +pub const IMAGE_PAD: u32 = 248056; /// Initialize the global tokenizer from a file path. /// Call once at startup. Panics if the file can't be loaded. diff --git a/src/agent/tools/mod.rs b/src/agent/tools/mod.rs index f72b015..8904fc3 100644 --- a/src/agent/tools/mod.rs +++ b/src/agent/tools/mod.rs @@ -242,13 +242,7 @@ pub fn summarize_args(tool_name: &str, args: &serde_json::Value) -> String { .as_str() .unwrap_or("") .to_string(), - "view_image" => { - if let Some(pane) = args["pane_id"].as_str() { - format!("pane {}", pane) - } else { - args["file_path"].as_str().unwrap_or("").to_string() - } - } + "view_image" => args["file_path"].as_str().unwrap_or("").to_string(), "journal" => { let entry = args["entry"].as_str().unwrap_or(""); if entry.len() > 60 { diff --git a/src/agent/tools/vision.rs b/src/agent/tools/vision.rs index 83559f6..0e36888 100644 --- a/src/agent/tools/vision.rs +++ b/src/agent/tools/vision.rs @@ -1,96 +1,71 @@ -use std::sync::Arc; // tools/vision.rs — Image viewing tool // -// Reads image files from disk and returns them as base64 data URIs -// for multimodal models. Also supports capturing tmux pane contents -// as screenshots. +// Reads an image file from disk, decodes its dimensions, and injects it +// into the context as a user-role message containing a NodeBody::Image +// leaf. The leaf carries raw bytes; the API layer extracts them into +// multi_modal_data when building vLLM requests. + +use std::sync::Arc; use anyhow::{Context, Result}; -use base64::Engine; use serde::Deserialize; +use crate::agent::context::{AstNode, Role, Section}; + #[derive(Deserialize)] struct Args { - file_path: Option, - pane_id: Option, - #[serde(default = "default_lines")] - lines: usize, + file_path: String, } -fn default_lines() -> usize { 50 } - pub fn tool() -> super::Tool { super::Tool { name: "view_image", - description: "View an image file or capture a tmux pane screenshot. Supports PNG, JPEG, GIF, WebP. Use pane_id to capture a tmux pane instead.", - parameters_json: r#"{"type":"object","properties":{"file_path":{"type":"string","description":"Path to an image file"},"pane_id":{"type":"string","description":"Tmux pane ID to capture (e.g. '0:1.0')"},"lines":{"type":"integer","description":"Lines to capture from tmux pane (default 50)"}}}"#, - handler: Arc::new(|_a, v| Box::pin(async move { view_image_text(&v) })), + description: "View an image file. Supports PNG, JPEG, GIF, WebP, BMP. The image is inserted into the conversation and can be analyzed by the vision model.", + parameters_json: r#"{"type":"object","properties":{"file_path":{"type":"string","description":"Path to the image file"}},"required":["file_path"]}"#, + handler: Arc::new(|agent, v| Box::pin(async move { + view_image(agent, v).await + })), } } -fn view_image_text(args: &serde_json::Value) -> anyhow::Result { - let a: Args = serde_json::from_value(args.clone()) +const MAX_SIZE: usize = 20 * 1024 * 1024; + +async fn view_image( + agent: Option>, + args: serde_json::Value, +) -> Result { + let a: Args = serde_json::from_value(args) .context("invalid view_image arguments")?; - if let Some(ref pane_id) = a.pane_id { - return capture_tmux_pane(pane_id, a.lines); - } - - let file_path = a.file_path - .as_deref() - .context("view_image requires either file_path or pane_id")?; - - let path = std::path::Path::new(file_path); + let path = std::path::Path::new(&a.file_path); if !path.exists() { - anyhow::bail!("File not found: {}", file_path); + anyhow::bail!("file not found: {}", a.file_path); } - let data = std::fs::read(path).with_context(|| format!("Failed to read {}", file_path))?; + let bytes = std::fs::read(path) + .with_context(|| format!("reading {}", a.file_path))?; - // Sanity check file size (don't send huge images) - const MAX_SIZE: usize = 20 * 1024 * 1024; // 20 MB - if data.len() > MAX_SIZE { + if bytes.len() > MAX_SIZE { anyhow::bail!( - "Image too large: {} bytes (max {} MB)", - data.len(), - MAX_SIZE / (1024 * 1024) + "image too large: {} bytes (max {} MB)", + bytes.len(), MAX_SIZE / (1024 * 1024), ); } + let dim = imagesize::blob_size(&bytes) + .with_context(|| format!("decoding dimensions of {}", a.file_path))?; + let (w, h) = (dim.width as u32, dim.height as u32); let mime = mime_from_extension(path); - let b64 = base64::engine::general_purpose::STANDARD.encode(&data); - let data_uri = format!("data:{};base64,{}", mime, b64); - Ok(format!("Image loaded: {} ({}, {} bytes)\n{}", file_path, mime, data.len(), data_uri)) -} + let image_leaf = AstNode::image(bytes.clone(), mime, h, w); + let token_count = image_leaf.leaf().unwrap().tokens().saturating_sub(2); -/// Capture a tmux pane's text content. -fn capture_tmux_pane(pane_id: &str, lines: usize) -> Result { + let agent = agent.context("view_image requires agent context")?; + let branch = AstNode::branch(Role::User, vec![image_leaf]); + agent.context.lock().await.push_log(Section::Conversation, branch); - // Use tmux capture-pane to get text content, then render to image - // via a simple approach: capture text and return it (the model can - // read text directly, which is often more useful than a screenshot). - // - // For actual pixel-level screenshots we'd need a terminal renderer, - // but text capture covers 95% of use cases. - let output = std::process::Command::new("tmux") - .args(["capture-pane", "-t", pane_id, "-p", "-S", &format!("-{}", lines)]) - .output() - .context("Failed to run tmux capture-pane")?; - - if !output.status.success() { - let stderr = String::from_utf8_lossy(&output.stderr); - anyhow::bail!("tmux capture-pane failed: {}", stderr.trim()); - } - - let text = String::from_utf8_lossy(&output.stdout).to_string(); - - // Return as text — the model can read terminal output directly. - // This is actually more useful than a screenshot for most tasks. - Ok(format!( - "Tmux pane {} (last {} lines):\n```\n{}\n```", - pane_id, lines, text.trim_end() - )) + Ok(format!("loaded {} ({}, {}x{}, {} tokens)", + a.file_path, mime, w, h, token_count)) } fn mime_from_extension(path: &std::path::Path) -> &'static str { @@ -104,8 +79,7 @@ fn mime_from_extension(path: &std::path::Path) -> &'static str { Some("jpg" | "jpeg") => "image/jpeg", Some("gif") => "image/gif", Some("webp") => "image/webp", - Some("svg") => "image/svg+xml", Some("bmp") => "image/bmp", - _ => "image/png", // default assumption + _ => "application/octet-stream", } } diff --git a/src/config.rs b/src/config.rs index b7ea597..6323aae 100644 --- a/src/config.rs +++ b/src/config.rs @@ -29,6 +29,9 @@ static CONFIG: OnceLock>> = OnceLock::new(); fn default_stream_timeout() -> u64 { 60 } fn default_scoring_interval_secs() -> u64 { 3600 } // 1 hour fn default_scoring_response_window() -> usize { 100 } +fn default_surface_hooks() -> Vec { + vec!["UserPromptSubmit".into(), "PostToolUse".into(), "Stop".into()] +} fn default_node_weight() -> f64 { 0.7 } fn default_edge_decay() -> f64 { 0.3 } fn default_max_hops() -> u32 { 3 } @@ -73,6 +76,10 @@ pub struct Config { /// Max conversation bytes to include in surface agent context. #[serde(default)] pub surface_conversation_bytes: Option, + /// Claude Code hook events that trigger agent cycles (surface-observe, + /// reflect, journal). Read by consciousness-claude/src/hook.rs. + #[serde(default = "default_surface_hooks")] + pub surface_hooks: Vec, // Spreading activation parameters #[serde(default = "default_node_weight")] @@ -104,6 +111,7 @@ impl Default for Config { "separator".into(), "split".into(), ], surface_conversation_bytes: None, + surface_hooks: default_surface_hooks(), mcp_servers: vec![], lsp_servers: vec![], default_node_weight: default_node_weight(), diff --git a/src/main.rs b/src/main.rs index 78bfa4f..f13448c 100644 --- a/src/main.rs +++ b/src/main.rs @@ -482,6 +482,14 @@ async fn main() { let cli = Cli::parse(); + // Some subcommands (e.g. admin load-context) read from the global + // AppConfig. poc-memory has no config CLI flags of its own, so load + // with defaults — figment still pulls from ~/.consciousness/config.json5 + // and env the same way. + if let Err(e) = crate::config::load_app(&crate::user::CliArgs::default()) { + eprintln!("warning: failed to load config: {:#}", e); + } + if let Err(e) = cli.command.run().await { eprintln!("Error: {}", e); process::exit(1); diff --git a/src/mind/mod.rs b/src/mind/mod.rs index 11d45b1..474e2c2 100644 --- a/src/mind/mod.rs +++ b/src/mind/mod.rs @@ -103,9 +103,13 @@ fn collect_memory_scores(ctx: &ContextState) -> std::collections::BTreeMap, path: &std::path::Path) { - if let Ok(json) = serde_json::to_string_pretty(scores) { - let _ = std::fs::write(path, json); - dbglog!("[scoring] saved {} scores to {}", scores.len(), path.display()); + match serde_json::to_string_pretty(scores) { + Ok(json) => match std::fs::write(path, &json) { + Ok(()) => dbglog!("[scoring] saved {} scores to {} ({} bytes)", + scores.len(), path.display(), json.len()), + Err(e) => dbglog!("[scoring] save FAILED ({}): {}", path.display(), e), + }, + Err(e) => dbglog!("[scoring] serialize FAILED: {}", e), } } @@ -506,6 +510,17 @@ impl Mind { // Load persistent subconscious state let state_path = self.config.session_dir.join("subconscious-state.json"); self.subconscious.lock().await.set_state_path(state_path); + + // Kick off an incremental scoring pass on startup so memories due + // for re-scoring get evaluated without requiring a user message. + { + let mut s = self.shared.lock().unwrap(); + if !s.scoring_in_flight { + s.scoring_in_flight = true; + drop(s); + self.start_memory_scoring(); + } + } } pub fn turn_watch(&self) -> tokio::sync::watch::Receiver { @@ -619,14 +634,40 @@ impl Mind { let mut ctx = agent.context.lock().await; // Find memory by key in identity or conversation let found = find_memory_by_key(&ctx, &key); - if let Some((section, i)) = found { - ctx.set_score(section, i, Some(score)); + match found { + Some((section, i)) => { + ctx.set_score(section, i, Some(score)); + let nodes: &[crate::agent::context::AstNode] = match section { + Section::Identity => ctx.identity(), + Section::Conversation => ctx.conversation(), + _ => &[], + }; + let read_back = match nodes.get(i) { + Some(crate::agent::context::AstNode::Leaf(l)) => match l.body() { + crate::agent::context::NodeBody::Memory { score, .. } => format!("{:?}", score), + _ => "not-memory".to_string(), + }, + _ => "out-of-bounds".to_string(), + }; + dbglog!("[scoring] persisted {} → {:.3} ({:?}[{}]) read_back={}", + key, score, section, i, read_back); + } + None => { + dbglog!( + "[scoring] DROP {}: find_memory_by_key None (id={}, cv={})", + key, ctx.identity().len(), ctx.conversation().len() + ); + } } let snapshot = collect_memory_scores(&ctx); + let in_snapshot = snapshot.contains_key(&key); + dbglog!("[scoring] snapshot size={} contains({})={}", + snapshot.len(), key, in_snapshot); drop(ctx); agent.state.lock().await.changed.notify_one(); snapshot }; + dbglog!("[scoring] about to save {} entries", scores_snapshot.len()); save_memory_scores(&scores_snapshot, &path); } }, diff --git a/src/user/chat.rs b/src/user/chat.rs index 47c5d56..fe3db5b 100644 --- a/src/user/chat.rs +++ b/src/user/chat.rs @@ -486,6 +486,11 @@ impl InteractScreen { if t.is_empty() { vec![] } else { vec![(PaneTarget::ToolResult, text, Marker::None)] } } + NodeBody::Image { orig_height, orig_width, .. } => { + vec![(PaneTarget::Conversation, + format!("[image {}x{}]", orig_width, orig_height), + Marker::None)] + } } } AstNode::Branch { role, children, .. } => {