// agent.rs — Core agent loop // // The simplest possible implementation of the agent pattern: // send messages + tool definitions to the model, if it responds // with tool calls then dispatch them and loop, if it responds // with text then display it and wait for the next prompt. // // Uses streaming by default so text tokens appear as they're // generated. Tool calls are accumulated from stream deltas and // dispatched after the stream completes. // // The DMN (dmn.rs) is the outer loop that decides what prompts // to send here. This module just handles single turns: prompt // in, response out, tool calls dispatched. use anyhow::Result; use chrono::{DateTime, Utc}; use tiktoken_rs::CoreBPE; use std::io::Write; use std::process::{Command, Stdio}; use crate::api::ApiClient; use crate::journal; use crate::log::ConversationLog; use crate::tools; use crate::tools::ProcessTracker; use crate::types::*; use crate::ui_channel::{ContextSection, SharedContextState, StatusInfo, StreamTarget, UiMessage, UiSender}; /// Result of a single agent turn. pub struct TurnResult { /// The text response (already sent through UI channel). #[allow(dead_code)] pub text: String, /// Whether the model called yield_to_user during this turn. pub yield_requested: bool, /// Whether any tools (other than yield_to_user) were called. pub had_tool_calls: bool, /// Number of tool calls that returned errors this turn. pub tool_errors: u32, /// Model name to switch to after this turn completes. pub model_switch: Option, /// Agent requested DMN pause (full stop on autonomous behavior). pub dmn_pause: bool, } /// Accumulated state across tool dispatches within a single turn. struct DispatchState { yield_requested: bool, had_tool_calls: bool, tool_errors: u32, model_switch: Option, dmn_pause: bool, } /// Mutable context state — the structured regions of the context window. /// /// Each field is a different dimension of awareness. 
The struct renders /// itself to text for inclusion in the context message sent to the model. /// Tools can update individual fields mid-session. #[derive(Debug, Clone)] pub struct ContextState { /// System prompt (identity, instructions, loaded from prompt file). pub system_prompt: String, /// Identity files: (filename, contents). Transparent structure for /// debug inspection and per-file budget control. pub personality: Vec<(String, String)>, /// Journal entries rendered as text — bridges old conversation. pub journal: String, /// Working stack — what the agent is currently doing. /// Top of stack (last element) is the current focus. pub working_stack: Vec, } /// Path to working stack instructions, included in context before the stack state. const WORKING_STACK_INSTRUCTIONS: &str = "/home/kent/.config/poc-agent/working-stack.md"; /// Path to persisted working stack state. const WORKING_STACK_FILE: &str = "/home/kent/.claude/memory/working-stack.json"; impl ContextState { /// Render the context message for the model. Personality + working stack. /// Journal is rendered separately as its own message in the conversation. pub fn render_context_message(&self) -> String { let mut parts: Vec = self.personality.iter() .map(|(name, content)| format!("## {}\n\n{}", name, content)) .collect(); // Always include working stack section — instructions + current state let instructions = std::fs::read_to_string(WORKING_STACK_INSTRUCTIONS) .unwrap_or_default(); let mut stack_section = instructions; if self.working_stack.is_empty() { stack_section.push_str("\n## Current stack\n\n(empty)\n"); } else { stack_section.push_str("\n## Current stack\n\n"); for (i, item) in self.working_stack.iter().enumerate() { if i == self.working_stack.len() - 1 { stack_section.push_str(&format!("→ {}\n", item)); } else { stack_section.push_str(&format!(" [{}] {}\n", i, item)); } } } parts.push(stack_section); parts.join("\n\n---\n\n") } } /// Breakdown of context window usage by category, in tokens. 
/// /// Categories: /// id — static identity context (system prompt + CLAUDE.md + memory files) /// mem — dynamically recalled content from poc-memory (future) /// jnl — journal entries bridging old conversation /// conv — raw recent conversation messages /// free — unused context window (headroom before compaction) /// /// Token estimates are derived from char proportions scaled by the /// API-reported prompt_tokens count. Before the first API call, uses /// chars/4 as a rough approximation. #[derive(Debug, Clone, Default)] pub struct ContextBudget { pub identity_tokens: usize, pub memory_tokens: usize, pub journal_tokens: usize, pub conversation_tokens: usize, /// Model's context window size in tokens. pub window_tokens: usize, } impl ContextBudget { pub fn used(&self) -> usize { self.identity_tokens + self.memory_tokens + self.journal_tokens + self.conversation_tokens } pub fn free(&self) -> usize { self.window_tokens.saturating_sub(self.used()) } /// Format as a compact status string with percentages of the token window. /// Non-zero values always show at least 1%. pub fn status_string(&self) -> String { let total = self.window_tokens; if total == 0 { return String::new(); } let pct = |n: usize| { if n == 0 { return 0; } ((n * 100) / total).max(1) }; format!( "id:{}% mem:{}% jnl:{}% conv:{}% free:{}%", pct(self.identity_tokens), pct(self.memory_tokens), pct(self.journal_tokens), pct(self.conversation_tokens), pct(self.free()), ) } } pub struct Agent { client: ApiClient, messages: Vec, tool_defs: Vec, /// Last known prompt token count from the API (tracks context size). last_prompt_tokens: u32, /// Shared process tracker for bash tool — lets TUI show/kill running commands. pub process_tracker: ProcessTracker, /// Current reasoning effort level ("none", "low", "high"). pub reasoning_effort: String, /// Persistent conversation log — append-only record of all messages. conversation_log: Option, /// Current context window budget breakdown. 
pub context_budget: ContextBudget, /// BPE tokenizer for token counting (cl100k_base — close enough /// for Claude and Qwen budget allocation, ~85-90% count accuracy). tokenizer: CoreBPE, /// Mutable context state — personality, working stack, etc. pub context: ContextState, /// Shared live context summary — TUI reads this directly for debug screen. pub shared_context: SharedContextState, /// Stable session ID for memory-search dedup across turns. session_id: String, } impl Agent { pub fn new( client: ApiClient, system_prompt: String, personality: Vec<(String, String)>, conversation_log: Option, shared_context: SharedContextState, ) -> Self { let tool_defs = tools::definitions(); let tokenizer = tiktoken_rs::cl100k_base() .expect("failed to load cl100k_base tokenizer"); let context = ContextState { system_prompt: system_prompt.clone(), personality, journal: String::new(), working_stack: Vec::new(), }; let session_id = format!("poc-agent-{}", chrono::Utc::now().format("%Y%m%d-%H%M%S")); let mut agent = Self { client, messages: Vec::new(), tool_defs, last_prompt_tokens: 0, process_tracker: ProcessTracker::new(), reasoning_effort: "none".to_string(), conversation_log, context_budget: ContextBudget::default(), tokenizer, context, shared_context, session_id, }; // Load recent journal entries at startup for orientation agent.load_startup_journal(); agent.load_working_stack(); agent.push_context(Message::system(system_prompt)); let rendered = agent.context.render_context_message(); if !rendered.is_empty() { agent.push_context(Message::user(rendered)); } if !agent.context.journal.is_empty() { agent.push_context(Message::user(agent.context.journal.clone())); } agent.measure_budget(); agent.publish_context_state(); agent } /// Run poc-hook for a given event, returning any output to inject. 
fn run_hook(&self, event: &str, prompt: &str) -> Option { let transcript_path = self.conversation_log.as_ref() .map(|l| l.path().to_string_lossy().to_string()) .unwrap_or_default(); let hook_input = serde_json::json!({ "hook_event_name": event, "session_id": self.session_id, "transcript_path": transcript_path, "prompt": prompt, }); let mut child = Command::new("poc-hook") .stdin(Stdio::piped()) .stdout(Stdio::piped()) .stderr(Stdio::null()) .spawn() .ok()?; if let Some(ref mut stdin) = child.stdin { let _ = stdin.write_all(hook_input.to_string().as_bytes()); } drop(child.stdin.take()); let output = child.wait_with_output().ok()?; let text = String::from_utf8_lossy(&output.stdout).to_string(); if text.trim().is_empty() { None } else { Some(text) } } /// Push a conversation message — stamped and logged. fn push_message(&mut self, mut msg: Message) { msg.stamp(); if let Some(ref log) = self.conversation_log { if let Err(e) = log.append(&msg) { eprintln!("warning: failed to log message: {:#}", e); } } self.messages.push(msg); } /// Push a context-only message (system prompt, identity context, /// journal summaries). Not logged — these are reconstructed on /// every startup/compaction. fn push_context(&mut self, msg: Message) { self.messages.push(msg); } /// Measure context window usage by category. Uses the BPE tokenizer /// for direct token counting (no chars/4 approximation). 
fn measure_budget(&mut self) { let mut id_tokens: usize = 0; let mem_tokens: usize = 0; let mut jnl_tokens: usize = 0; let mut conv_tokens: usize = 0; let mut in_conversation = false; for msg in &self.messages { let tokens = msg_token_count(&self.tokenizer, msg); if in_conversation { conv_tokens += tokens; continue; } match msg.role { Role::System => id_tokens += tokens, Role::User => { let text = msg.content_text(); if text.starts_with("[Earlier in this conversation") { jnl_tokens += tokens; } else if text.starts_with("Your context was just rebuilt") { jnl_tokens += tokens; } else if jnl_tokens == 0 && conv_tokens == 0 { // Static identity context (before any journal/conversation) id_tokens += tokens; } else { in_conversation = true; conv_tokens += tokens; } } _ => { in_conversation = true; conv_tokens += tokens; } } } self.context_budget = ContextBudget { identity_tokens: id_tokens, memory_tokens: mem_tokens, journal_tokens: jnl_tokens, conversation_tokens: conv_tokens, window_tokens: model_context_window(&self.client.model), }; } /// Send a user message and run the agent loop until the model /// produces a text response (no more tool calls). Streams text /// and tool activity through the UI channel. 
pub async fn turn( &mut self, user_input: &str, ui_tx: &UiSender, target: StreamTarget, ) -> Result { // Run poc-hook (memory search, notifications, context check) if let Some(hook_output) = self.run_hook("UserPromptSubmit", user_input) { let enriched = format!("{}\n\n\n{}\n", user_input, hook_output); self.push_message(Message::user(enriched)); } else { self.push_message(Message::user(user_input)); } let mut overflow_retries: u32 = 0; let mut empty_retries: u32 = 0; let mut ds = DispatchState { yield_requested: false, had_tool_calls: false, tool_errors: 0, model_switch: None, dmn_pause: false, }; loop { let _ = ui_tx.send(UiMessage::Activity("thinking...".into())); let api_result = self .client .chat_completion_stream( &self.messages, Some(&self.tool_defs), ui_tx, target, &self.reasoning_effort, ) .await; // Context overflow → compact and retry (max 2 attempts) // Stream error → retry with backoff (max 2 attempts) let (msg, usage) = match api_result { Err(e) if is_context_overflow(&e) && overflow_retries < 2 => { overflow_retries += 1; let _ = ui_tx.send(UiMessage::Info(format!( "[context overflow — compacting and retrying ({}/2)]", overflow_retries, ))); self.emergency_compact(); continue; } Err(e) if is_stream_error(&e) && empty_retries < 2 => { empty_retries += 1; let _ = ui_tx.send(UiMessage::Info(format!( "[stream error: {} — retrying ({}/2)]", e, empty_retries, ))); tokio::time::sleep(std::time::Duration::from_secs(2)).await; continue; } other => other?, }; // Strip ephemeral tool calls (journal) that the API has // now processed. They're persisted to disk; no need to keep // them in the conversation history burning tokens. 
self.strip_ephemeral_tool_calls(); if let Some(usage) = &usage { self.last_prompt_tokens = usage.prompt_tokens; self.measure_budget(); self.publish_context_state(); let _ = ui_tx.send(UiMessage::StatusUpdate(StatusInfo { dmn_state: String::new(), // filled by main loop dmn_turns: 0, dmn_max_turns: 0, prompt_tokens: usage.prompt_tokens, completion_tokens: usage.completion_tokens, model: self.client.model.clone(), turn_tools: 0, // tracked by TUI from ToolCall messages context_budget: self.context_budget.status_string(), })); } // Empty response — model returned finish=stop with no content // or tool calls. Inject a nudge so the retry has different input. let has_content = msg.content.is_some(); let has_tools = msg.tool_calls.as_ref().map_or(false, |tc| !tc.is_empty()); if !has_content && !has_tools { if empty_retries < 2 { empty_retries += 1; let _ = ui_tx.send(UiMessage::Debug(format!( "empty response, injecting nudge and retrying ({}/2)", empty_retries, ))); self.push_message(Message::user( "[system] Your previous response was empty. \ Please respond with text or use a tool." )); continue; } // After max retries, fall through — return the empty response } else { empty_retries = 0; } // Structured tool calls from the API if let Some(ref tool_calls) = msg.tool_calls { if !tool_calls.is_empty() { self.push_message(msg.clone()); for call in tool_calls { self.dispatch_tool_call(call, None, ui_tx, &mut ds) .await; } continue; } } // No structured tool calls — check for leaked tool calls // (Qwen sometimes outputs XML as text). let text = msg.content_text().to_string(); let leaked = parse_leaked_tool_calls(&text); if !leaked.is_empty() { let _ = ui_tx.send(UiMessage::Debug(format!( "recovered {} leaked tool call(s) from text", leaked.len() ))); // Strip tool call XML and thinking tokens from the message // so they don't clutter the conversation history. 
let cleaned = strip_leaked_artifacts(&text); let mut clean_msg = msg.clone(); clean_msg.content = if cleaned.trim().is_empty() { None } else { Some(MessageContent::Text(cleaned)) }; self.push_message(clean_msg); for call in &leaked { self.dispatch_tool_call(call, Some("recovered"), ui_tx, &mut ds) .await; } continue; } // Genuinely text-only response let _ = ui_tx.send(UiMessage::Activity(String::new())); self.push_message(msg); return Ok(TurnResult { text, yield_requested: ds.yield_requested, had_tool_calls: ds.had_tool_calls, tool_errors: ds.tool_errors, model_switch: ds.model_switch, dmn_pause: ds.dmn_pause, }); } } /// Dispatch a single tool call: send UI annotations, run the tool, /// push results into the conversation, handle images. async fn dispatch_tool_call( &mut self, call: &ToolCall, tag: Option<&str>, ui_tx: &UiSender, ds: &mut DispatchState, ) { let args: serde_json::Value = serde_json::from_str(&call.function.arguments).unwrap_or_default(); let args_summary = summarize_args(&call.function.name, &args); let label = match tag { Some(t) => format!("calling: {} ({})", call.function.name, t), None => format!("calling: {}", call.function.name), }; let _ = ui_tx.send(UiMessage::Activity(label)); let _ = ui_tx.send(UiMessage::ToolCall { name: call.function.name.clone(), args_summary: args_summary.clone(), }); let _ = ui_tx.send(UiMessage::ToolStarted { id: call.id.clone(), name: call.function.name.clone(), detail: args_summary, }); // Handle working_stack tool — needs &mut self for context state if call.function.name == "working_stack" { let result = tools::working_stack::handle(&args, &mut self.context.working_stack); let output = tools::ToolOutput { text: result.clone(), is_yield: false, images: Vec::new(), model_switch: None, dmn_pause: false, }; let _ = ui_tx.send(UiMessage::ToolResult { name: call.function.name.clone(), result: output.text.clone(), }); let _ = ui_tx.send(UiMessage::ToolFinished { id: call.id.clone() }); 
self.push_message(Message::tool_result(&call.id, &output.text)); ds.had_tool_calls = true; // Re-render the context message so the model sees the updated stack if !result.starts_with("Error:") { self.refresh_context_message(); } return; } let output = tools::dispatch(&call.function.name, &args, &self.process_tracker).await; if output.is_yield { ds.yield_requested = true; } else { ds.had_tool_calls = true; } if output.model_switch.is_some() { ds.model_switch = output.model_switch; } if output.dmn_pause { ds.dmn_pause = true; } if output.text.starts_with("Error:") { ds.tool_errors += 1; } let _ = ui_tx.send(UiMessage::ToolResult { name: call.function.name.clone(), result: output.text.clone(), }); let _ = ui_tx.send(UiMessage::ToolFinished { id: call.id.clone() }); self.push_message(Message::tool_result(&call.id, &output.text)); if !output.images.is_empty() { // Only one live image in context at a time — age out any // previous ones to avoid accumulating ~90KB+ per image. self.age_out_images(); self.push_message(Message::user_with_images( "Here is the image you requested:", &output.images, )); } } /// Build context state summary for the debug screen. 
pub fn context_state_summary(&self) -> Vec { let count = |s: &str| self.tokenizer.encode_with_special_tokens(s).len(); let mut sections = Vec::new(); // System prompt sections.push(ContextSection { name: "System prompt".into(), tokens: count(&self.context.system_prompt), content: self.context.system_prompt.clone(), children: Vec::new(), }); // Personality — parent with file children let personality_children: Vec = self.context.personality.iter() .map(|(name, content)| ContextSection { name: name.clone(), tokens: count(content), content: content.clone(), children: Vec::new(), }) .collect(); let personality_tokens: usize = personality_children.iter().map(|c| c.tokens).sum(); sections.push(ContextSection { name: format!("Personality ({} files)", personality_children.len()), tokens: personality_tokens, content: String::new(), children: personality_children, }); // Journal — split into per-entry children { let mut journal_children = Vec::new(); let mut current_header = String::new(); let mut current_body = String::new(); for line in self.context.journal.lines() { if line.starts_with("## ") { if !current_header.is_empty() { let body = std::mem::take(&mut current_body); let preview: String = body.lines().next().unwrap_or("").chars().take(60).collect(); journal_children.push(ContextSection { name: format!("{}: {}", current_header, preview), tokens: count(&body), content: body, children: Vec::new(), }); } current_header = line.trim_start_matches("## ").to_string(); current_body.clear(); } else { if !current_body.is_empty() || !line.is_empty() { current_body.push_str(line); current_body.push('\n'); } } } if !current_header.is_empty() { let preview: String = current_body.lines().next().unwrap_or("").chars().take(60).collect(); journal_children.push(ContextSection { name: format!("{}: {}", current_header, preview), tokens: count(¤t_body), content: current_body, children: Vec::new(), }); } let journal_tokens: usize = journal_children.iter().map(|c| c.tokens).sum(); 
sections.push(ContextSection { name: format!("Journal ({} entries)", journal_children.len()), tokens: journal_tokens, content: String::new(), children: journal_children, }); } // Working stack — instructions + items as children let instructions = std::fs::read_to_string(WORKING_STACK_INSTRUCTIONS) .unwrap_or_default(); let mut stack_children = vec![ContextSection { name: "Instructions".into(), tokens: count(&instructions), content: instructions, children: Vec::new(), }]; for (i, item) in self.context.working_stack.iter().enumerate() { let marker = if i == self.context.working_stack.len() - 1 { "→" } else { " " }; stack_children.push(ContextSection { name: format!("{} [{}] {}", marker, i, item), tokens: count(item), content: String::new(), children: Vec::new(), }); } let stack_tokens: usize = stack_children.iter().map(|c| c.tokens).sum(); sections.push(ContextSection { name: format!("Working stack ({} items)", self.context.working_stack.len()), tokens: stack_tokens, content: String::new(), children: stack_children, }); // Conversation — each message as a child let conv_start = self.messages.iter() .position(|m| m.role == Role::Assistant || m.role == Role::Tool) .unwrap_or(self.messages.len()); let conv_messages = &self.messages[conv_start..]; let conv_children: Vec = conv_messages.iter().enumerate() .map(|(i, msg)| { let text = msg.content.as_ref() .map(|c| c.as_text().to_string()) .unwrap_or_default(); let tool_info = msg.tool_calls.as_ref().map(|tc| { tc.iter() .map(|c| c.function.name.clone()) .collect::>() .join(", ") }); let label = match (&msg.role, &tool_info) { (_, Some(tools)) => format!("[tool_call: {}]", tools), _ => { let preview: String = text.chars().take(60).collect(); let preview = preview.replace('\n', " "); if text.len() > 60 { format!("{}...", preview) } else { preview } } }; let tokens = count(&text); let role_name = match msg.role { Role::Assistant => "PoC", Role::User => "Kent", Role::Tool => "tool", Role::System => "system", }; ContextSection 
{ name: format!("[{}] {}: {}", conv_start + i, role_name, label), tokens, content: text, children: Vec::new(), } }) .collect(); let conv_tokens: usize = conv_children.iter().map(|c| c.tokens).sum(); sections.push(ContextSection { name: format!("Conversation ({} messages)", conv_children.len()), tokens: conv_tokens, content: String::new(), children: conv_children, }); sections } /// Load recent journal entries at startup for orientation. /// Uses the same budget logic as compaction but with empty conversation. /// Only parses the tail of the journal file (last 64KB) for speed. fn load_startup_journal(&mut self) { let journal_path = journal::default_journal_path(); let entries = journal::parse_journal_tail(&journal_path, 64 * 1024); if entries.is_empty() { return; } let count = |s: &str| self.tokenizer.encode_with_special_tokens(s).len(); let context_message = self.context.render_context_message(); let plan = plan_context( &self.context.system_prompt, &context_message, &[], // no conversation yet &entries, &self.client.model, &count, ); self.context.journal = render_journal_text(&entries, &plan); } /// Re-render the context message in self.messages from live ContextState. /// Called after any change to context state (working stack, etc). fn refresh_context_message(&mut self) { let rendered = self.context.render_context_message(); // The context message is the first user message (index 1, after system prompt) if self.messages.len() >= 2 && self.messages[1].role == Role::User { self.messages[1] = Message::user(rendered); } self.publish_context_state(); self.save_working_stack(); } /// Persist working stack to disk. fn save_working_stack(&self) { if let Ok(json) = serde_json::to_string(&self.context.working_stack) { let _ = std::fs::write(WORKING_STACK_FILE, json); } } /// Load working stack from disk. 
fn load_working_stack(&mut self) { if let Ok(data) = std::fs::read_to_string(WORKING_STACK_FILE) { if let Ok(stack) = serde_json::from_str::>(&data) { self.context.working_stack = stack; } } } /// Push the current context summary to the shared state for the TUI to read. fn publish_context_state(&self) { if let Ok(mut state) = self.shared_context.write() { *state = self.context_state_summary(); } } /// Replace base64 image data in older messages with text placeholders. /// Only the most recent image stays live — each new image ages out /// all previous ones. The tool result message (right before each image /// message) already records what was loaded, so no info is lost. fn age_out_images(&mut self) { for msg in &mut self.messages { if let Some(MessageContent::Parts(parts)) = &msg.content { let has_images = parts.iter().any(|p| matches!(p, ContentPart::ImageUrl { .. })); if !has_images { continue; } let mut replacement = String::new(); for part in parts { match part { ContentPart::Text { text } => { if !replacement.is_empty() { replacement.push('\n'); } replacement.push_str(text); } ContentPart::ImageUrl { .. } => { if !replacement.is_empty() { replacement.push('\n'); } replacement.push_str( "[image aged out — see tool result above for details]", ); } } } msg.content = Some(MessageContent::Text(replacement)); } } } /// Strip ephemeral tool calls from the conversation history. /// /// Ephemeral tools (like journal) persist their output to disk, /// so the tool call + result don't need to stay in the context /// window. We keep them for exactly one API round-trip (the model /// needs to see the result was acknowledged), then strip them. /// /// If an assistant message contains ONLY ephemeral tool calls, /// the entire message and its tool results are removed. If mixed /// with non-ephemeral calls, we leave it (rare case, small cost). 
fn strip_ephemeral_tool_calls(&mut self) { // Collect IDs of tool calls to strip let mut strip_ids: Vec = Vec::new(); let mut strip_msg_indices: Vec = Vec::new(); for (i, msg) in self.messages.iter().enumerate() { if msg.role != Role::Assistant { continue; } let calls = match &msg.tool_calls { Some(c) if !c.is_empty() => c, _ => continue, }; let all_ephemeral = calls.iter().all(|c| { c.function.name == tools::journal::TOOL_NAME }); if all_ephemeral { strip_msg_indices.push(i); for call in calls { strip_ids.push(call.id.clone()); } } } if strip_ids.is_empty() { return; } // Remove in reverse order to preserve indices self.messages.retain(|msg| { // Strip the assistant messages we identified if msg.role == Role::Assistant { if let Some(calls) = &msg.tool_calls { if calls.iter().all(|c| strip_ids.contains(&c.id)) { return false; } } } // Strip matching tool results if msg.role == Role::Tool { if let Some(ref id) = msg.tool_call_id { if strip_ids.contains(id) { return false; } } } true }); } /// Last prompt token count reported by the API. pub fn last_prompt_tokens(&self) -> u32 { self.last_prompt_tokens } /// Build context window from conversation messages + journal. /// Used by both compact() (in-memory messages) and restore_from_log() /// (conversation log). The context window is always: /// identity + journal summaries + raw recent messages pub fn compact(&mut self, new_system_prompt: String, new_personality: Vec<(String, String)>) { self.context.system_prompt = new_system_prompt; self.context.personality = new_personality; self.do_compact(); } /// Internal compaction — rebuilds context window from current messages. 
fn do_compact(&mut self) { // Find where actual conversation starts (after system + context) let conv_start = self .messages .iter() .position(|m| m.role == Role::Assistant || m.role == Role::Tool) .unwrap_or(self.messages.len()); let conversation: Vec = self.messages[conv_start..].to_vec(); let (messages, journal) = build_context_window( &self.context, &conversation, &self.client.model, &self.tokenizer, ); self.context.journal = journal; self.messages = messages; self.last_prompt_tokens = 0; self.measure_budget(); self.publish_context_state(); } /// Emergency compaction using stored config — called on context overflow. fn emergency_compact(&mut self) { self.do_compact(); } /// Restore from the conversation log. Builds the context window /// the same way compact() does — journal summaries for old messages, /// raw recent messages. This is the unified startup path. /// Returns true if the log had content to restore. pub fn restore_from_log( &mut self, system_prompt: String, personality: Vec<(String, String)>, ) -> bool { self.context.system_prompt = system_prompt; self.context.personality = personality; let all_messages = match &self.conversation_log { Some(log) => match log.read_tail(512 * 1024) { Ok(msgs) if !msgs.is_empty() => { dbglog!("[restore] read {} messages from log tail", msgs.len()); msgs } Ok(_) => { dbglog!("[restore] log exists but is empty"); return false; } Err(e) => { dbglog!("[restore] failed to read log: {}", e); return false; } }, None => { dbglog!("[restore] no conversation log configured"); return false; } }; // Filter out system/context messages — we only want the // actual conversation (user prompts, assistant responses, // tool calls/results) let conversation: Vec = all_messages .into_iter() .filter(|m| m.role != Role::System) .collect(); dbglog!("[restore] {} messages after filtering system", conversation.len()); let (messages, journal) = build_context_window( &self.context, &conversation, &self.client.model, &self.tokenizer, ); 
dbglog!("[restore] journal text: {} chars, {} lines", journal.len(), journal.lines().count()); self.context.journal = journal; self.messages = messages; dbglog!("[restore] built context window: {} messages", self.messages.len()); self.last_prompt_tokens = 0; self.measure_budget(); self.publish_context_state(); true } /// Replace the API client (for model switching). pub fn swap_client(&mut self, new_client: ApiClient) { self.client = new_client; } /// Get the model identifier. pub fn model(&self) -> &str { &self.client.model } /// Get the conversation history for persistence. pub fn messages(&self) -> &[Message] { &self.messages } /// Mutable access to conversation history (for /retry). pub fn messages_mut(&mut self) -> &mut Vec { &mut self.messages } /// Restore from a saved conversation. pub fn restore(&mut self, messages: Vec) { self.messages = messages; } } /// Look up a model's context window size in tokens. pub fn model_context_window(model: &str) -> usize { let m = model.to_lowercase(); if m.contains("opus") || m.contains("sonnet") { 200_000 } else if m.contains("qwen") { 131_072 } else { 128_000 } } /// Context budget in tokens: 60% of the model's context window. /// Leaves headroom for conversation to grow before compaction triggers. /// /// Future direction: make this dynamic based on what the agent is /// doing — deep coding work might allocate more to conversation, /// consolidation might allocate more to journal/memory, idle might /// shrink everything to save cost. fn context_budget_tokens(model: &str) -> usize { model_context_window(model) * 60 / 100 } /// Allocation plan for the context window. Separates the budget math /// (which entries and messages to include) from the message assembly /// (building the actual Vec). This makes the core algorithm /// testable and inspectable — log the plan on compaction to see exactly /// what allocation decisions were made. 
struct ContextPlan { /// Index into all_entries: header-only entries start here header_start: usize, /// Index into all_entries: full entries start here (headers end here) full_start: usize, /// Total journal entries (header-only + full go up to this) entry_count: usize, /// Index into recent conversation: skip messages before this conv_trim: usize, /// Total recent conversation messages conv_count: usize, /// Tokens used by full journal entries full_tokens: usize, /// Tokens used by header-only journal entries header_tokens: usize, /// Tokens used by conversation (after trimming) conv_tokens: usize, /// Total budget available (after identity, memory, reserve) available: usize, } /// Build a context window from conversation messages + journal entries. /// This is the core algorithm shared by compact() and restore_from_log(). /// /// Allocation strategy: identity and memory are fixed costs. The /// remaining budget (minus 25% reserve for model output) is split /// between journal and conversation. Conversation gets priority — /// it's what's happening now. Journal fills the rest, newest first. /// /// When the budget is tight, journal entries are dropped first /// (oldest entries go first). If conversation alone exceeds the /// budget, oldest messages are trimmed to fit. /// Returns (messages, journal_text) — caller stores journal_text in ContextState. fn build_context_window( context: &ContextState, conversation: &[Message], model: &str, tokenizer: &CoreBPE, ) -> (Vec, String) { let journal_path = journal::default_journal_path(); let all_entries = journal::parse_journal(&journal_path); dbglog!("[ctx] {} journal entries from {}", all_entries.len(), journal_path.display()); let count = |s: &str| tokenizer.encode_with_special_tokens(s).len(); let system_prompt = context.system_prompt.clone(); let context_message = context.render_context_message(); // Cap memory to 50% of the context budget so conversation always // gets space. 
Truncate at the last complete section boundary. let max_tokens = context_budget_tokens(model); let memory_cap = max_tokens / 2; let memory_tokens = count(&context_message); let context_message = if memory_tokens > memory_cap { dbglog!("[ctx] memory too large: {} tokens > {} cap, truncating", memory_tokens, memory_cap); truncate_at_section(&context_message, memory_cap, &count) } else { context_message }; let recent_start = find_journal_cutoff(conversation, all_entries.last()); dbglog!("[ctx] journal cutoff: {} of {} conversation messages are 'recent'", conversation.len() - recent_start, conversation.len()); let recent = &conversation[recent_start..]; let plan = plan_context( &system_prompt, &context_message, recent, &all_entries, model, &count, ); // Render journal text from the plan let journal_text = render_journal_text(&all_entries, &plan); dbglog!("[ctx] plan: header_start={} full_start={} entry_count={} conv_trim={} journal_text={} chars", plan.header_start, plan.full_start, plan.entry_count, plan.conv_trim, journal_text.len()); let messages = assemble_context( system_prompt, context_message, &journal_text, recent, &plan, ); (messages, journal_text) } /// Compute the allocation plan: how much budget goes to journal vs /// conversation, which entries and messages to include. 
fn plan_context( system_prompt: &str, context_message: &str, recent: &[Message], entries: &[journal::JournalEntry], model: &str, count: &dyn Fn(&str) -> usize, ) -> ContextPlan { let max_tokens = context_budget_tokens(model); // Fixed costs — always included let identity_cost = count(system_prompt); let memory_cost = count(context_message); let reserve = max_tokens / 4; let available = max_tokens .saturating_sub(identity_cost) .saturating_sub(memory_cost) .saturating_sub(reserve); // Measure conversation let conv_costs: Vec = recent.iter().map(|m| msg_token_count_fn(m, count)).collect(); let total_conv: usize = conv_costs.iter().sum(); // Journal always gets at least 15% of available budget so it doesn't // get squeezed out by large conversations. let journal_min = available * 15 / 100; let journal_budget = available.saturating_sub(total_conv).max(journal_min); // Fill journal entries newest-first within budget. // Tiered: recent entries get full content, older entries get just // a header line (timestamp + first line) for timeline awareness. 
let full_budget = journal_budget * 70 / 100; let header_budget = journal_budget.saturating_sub(full_budget); // Phase 1: Full entries (newest first) let mut full_used = 0; let mut n_full = 0; for entry in entries.iter().rev() { let cost = count(&entry.content) + 10; if full_used + cost > full_budget { break; } full_used += cost; n_full += 1; } let full_start = entries.len().saturating_sub(n_full); // Phase 2: Header-only entries (continuing backward from where full stopped) let mut header_used = 0; let mut n_headers = 0; for entry in entries[..full_start].iter().rev() { let first_line = entry .content .lines() .find(|l| !l.trim().is_empty()) .unwrap_or("(empty)"); let cost = count(first_line) + 10; if header_used + cost > header_budget { break; } header_used += cost; n_headers += 1; } let header_start = full_start.saturating_sub(n_headers); // If conversation exceeds available budget, trim oldest messages let journal_used = full_used + header_used; let mut conv_trim = 0; let mut trimmed_conv = total_conv; while trimmed_conv + journal_used > available && conv_trim < recent.len() { trimmed_conv -= conv_costs[conv_trim]; conv_trim += 1; } // Walk forward to user message boundary while conv_trim < recent.len() && recent[conv_trim].role != Role::User { conv_trim += 1; } dbglog!("[plan] model={} max_tokens={} available={} (identity={} memory={} reserve={})", model, max_tokens, available, identity_cost, memory_cost, reserve); dbglog!("[plan] conv: {} msgs, {} tokens total, trimming {} msgs → {} tokens", recent.len(), total_conv, conv_trim, trimmed_conv); dbglog!("[plan] journal: {} full entries ({}t) + {} headers ({}t)", n_full, full_used, n_headers, header_used); ContextPlan { header_start, full_start, entry_count: entries.len(), conv_trim, conv_count: recent.len(), full_tokens: full_used, header_tokens: header_used, conv_tokens: trimmed_conv, available, } } /// Render journal entries into text from a context plan. 
fn render_journal_text( entries: &[journal::JournalEntry], plan: &ContextPlan, ) -> String { let has_journal = plan.header_start < plan.entry_count; if !has_journal { return String::new(); } let mut text = String::from("[Earlier in this conversation — from your journal]\n\n"); // Header-only entries (older) — just timestamp + first line for entry in &entries[plan.header_start..plan.full_start] { let first_line = entry .content .lines() .find(|l| !l.trim().is_empty()) .unwrap_or("(empty)"); text.push_str(&format!( "## {} — {}\n", entry.timestamp.format("%Y-%m-%dT%H:%M"), first_line, )); } // Separator between headers and full entries let n_headers = plan.full_start - plan.header_start; let n_full = plan.entry_count - plan.full_start; if n_headers > 0 && n_full > 0 { text.push_str("\n---\n\n"); } // Full entries (recent) for entry in &entries[plan.full_start..] { text.push_str(&format!( "## {}\n\n{}\n\n", entry.timestamp.format("%Y-%m-%dT%H:%M"), entry.content )); } text } /// Assemble the context window from a plan. No allocation decisions /// happen here — just follow the plan to build messages. fn assemble_context( system_prompt: String, context_message: String, journal_text: &str, recent: &[Message], plan: &ContextPlan, ) -> Vec { let mut messages = vec![Message::system(system_prompt)]; if !context_message.is_empty() { messages.push(Message::user(context_message)); } let final_recent = &recent[plan.conv_trim..]; if !journal_text.is_empty() { messages.push(Message::user(journal_text.to_string())); } else if !final_recent.is_empty() { messages.push(Message::user( "Your context was just rebuilt. Memory files have been \ reloaded. Your recent conversation continues below. \ Earlier context is in your journal and memory files." .to_string(), )); } messages.extend(final_recent.iter().cloned()); messages } /// Find the conversation index where messages are no longer covered /// Truncate a context message to fit within a token budget. 
Cuts at /// section boundaries (lines starting with `---` or `## `) to avoid /// splitting mid-section. Drops sections from the end first since /// earlier sections (identity, instructions) matter more. fn truncate_at_section(text: &str, max_tokens: usize, count: &dyn Fn(&str) -> usize) -> String { // Find section boundaries (--- separators between assembled parts) let mut boundaries = vec![0usize]; for (i, line) in text.lines().enumerate() { if line.trim() == "---" || line.starts_with("## ") { // Find byte offset of this line let offset = text.lines().take(i).map(|l| l.len() + 1).sum::(); boundaries.push(offset); } } boundaries.push(text.len()); // Binary search: find the largest prefix of sections that fits let mut best = 0; for &end in &boundaries[1..] { let slice = &text[..end]; if count(slice) <= max_tokens { best = end; } else { break; } } if best == 0 { // Even the first section doesn't fit — hard truncate best = text.len().min(max_tokens * 3); // ~3 chars/token rough estimate } let truncated = &text[..best]; dbglog!("[ctx] truncated memory from {} to {} chars ({} tokens)", text.len(), truncated.len(), count(truncated)); truncated.to_string() } /// by journal entries. Messages before this index are summarized by /// the journal; messages from this index onward stay as raw conversation. /// Walks back to a user message boundary to avoid splitting tool /// call/result sequences. 
fn find_journal_cutoff( conversation: &[Message], newest_entry: Option<&journal::JournalEntry>, ) -> usize { let cutoff = match newest_entry { Some(entry) => entry.timestamp, None => return 0, }; let mut split = conversation.len(); for (i, msg) in conversation.iter().enumerate() { if let Some(ts) = parse_msg_timestamp(msg) { if ts > cutoff { split = i; break; } } } // Walk back to user message boundary while split > 0 && split < conversation.len() && conversation[split].role != Role::User { split -= 1; } split } /// Count the token footprint of a message using a token counting function. fn msg_token_count_fn(msg: &Message, count: &dyn Fn(&str) -> usize) -> usize { let content = msg.content.as_ref().map_or(0, |c| match c { MessageContent::Text(s) => count(s), MessageContent::Parts(parts) => parts .iter() .map(|p| match p { ContentPart::Text { text } => count(text), ContentPart::ImageUrl { .. } => 85, }) .sum(), }); let tools = msg.tool_calls.as_ref().map_or(0, |calls| { calls .iter() .map(|c| count(&c.function.arguments) + count(&c.function.name)) .sum() }); content + tools } /// Count the token footprint of a message using BPE tokenization. fn msg_token_count(tokenizer: &CoreBPE, msg: &Message) -> usize { msg_token_count_fn(msg, &|s| tokenizer.encode_with_special_tokens(s).len()) } /// Detect context window overflow errors from the API. /// Different providers phrase this differently; we check for common patterns. /// OpenRouter wraps upstream errors, so we check both the wrapper and the raw message. 
fn is_context_overflow(err: &anyhow::Error) -> bool {
    let msg = err.to_string().to_lowercase();
    msg.contains("context length")
        || msg.contains("token limit")
        || msg.contains("too many tokens")
        || msg.contains("maximum context")
        || msg.contains("prompt is too long")
        || msg.contains("request too large")
        || msg.contains("input validation error")
        || msg.contains("content length limit")
        // Generic fallback: an HTTP 400 that mentions tokens.
        || (msg.contains("400") && msg.contains("tokens"))
}

/// Detect model/provider errors delivered inside the SSE stream.
/// OpenRouter returns HTTP 200 but finish_reason="error" with
/// partial content (e.g. "system") — we surface this as an error
/// so the turn loop can retry.
fn is_stream_error(err: &anyhow::Error) -> bool {
    err.to_string().contains("model stream error")
}

/// Parse a message's timestamp field into a DateTime.
/// Returns None when the message has no timestamp or it isn't RFC 3339.
fn parse_msg_timestamp(msg: &Message) -> Option<DateTime<Utc>> {
    msg.timestamp
        .as_ref()
        .and_then(|ts| DateTime::parse_from_rfc3339(ts).ok())
        .map(|dt| dt.with_timezone(&Utc))
}

/// Create a short summary of tool args for the tools pane header.
fn summarize_args(tool_name: &str, args: &serde_json::Value) -> String { match tool_name { "read_file" | "write_file" | "edit_file" => args["file_path"] .as_str() .unwrap_or("") .to_string(), "bash" => { let cmd = args["command"].as_str().unwrap_or(""); if cmd.len() > 60 { let end = cmd.char_indices() .map(|(i, _)| i) .take_while(|&i| i <= 60) .last() .unwrap_or(0); format!("{}...", &cmd[..end]) } else { cmd.to_string() } } "grep" => { let pattern = args["pattern"].as_str().unwrap_or(""); let path = args["path"].as_str().unwrap_or("."); format!("{} in {}", pattern, path) } "glob" => args["pattern"] .as_str() .unwrap_or("") .to_string(), "view_image" => { if let Some(pane) = args["pane_id"].as_str() { format!("pane {}", pane) } else { args["file_path"].as_str().unwrap_or("").to_string() } } "journal" => { let entry = args["entry"].as_str().unwrap_or(""); if entry.len() > 60 { format!("{}...", &entry[..60]) } else { entry.to_string() } } "yield_to_user" => args["message"] .as_str() .unwrap_or("") .to_string(), "switch_model" => args["model"] .as_str() .unwrap_or("") .to_string(), "pause" => String::new(), _ => String::new(), } } /// Parse tool calls leaked as text by models that don't always use the /// structured function calling API (notably Qwen). /// /// Handles the XML format: /// /// /// echo hello /// /// /// /// Also handles JSON-in-text format: /// /// {"name": "bash", "arguments": {"command": "echo hello"}} /// fn parse_leaked_tool_calls(text: &str) -> Vec { // Normalize whitespace inside XML tags: "<\nfunction\n=\nbash\n>" → "" // This handles streaming tokenizers that split tags across tokens. 
let normalized = normalize_xml_tags(text); let text = &normalized; let mut calls = Vec::new(); let mut search_from = 0; let mut call_counter: u32 = 0; while let Some(start) = text[search_from..].find("") { let abs_start = search_from + start; let after_tag = abs_start + "".len(); let end = match text[after_tag..].find("") { Some(pos) => after_tag + pos, None => break, }; let body = text[after_tag..end].trim(); search_from = end + "".len(); // Try XML format first, then JSON if let Some(call) = parse_xml_tool_call(body, &mut call_counter) { calls.push(call); } else if let Some(call) = parse_json_tool_call(body, &mut call_counter) { calls.push(call); } } calls } /// Normalize whitespace inside XML-like tags for streaming tokenizers. /// Collapses whitespace between `<` and `>` so that `<\nfunction\n=\nbash\n>` /// becomes ``, and `` becomes ``. /// Leaves content between tags untouched. fn normalize_xml_tags(text: &str) -> String { let mut result = String::with_capacity(text.len()); let mut chars = text.chars().peekable(); while let Some(ch) = chars.next() { if ch == '<' { let mut tag = String::from('<'); for inner in chars.by_ref() { if inner == '>' { tag.push('>'); break; } else if inner.is_whitespace() { // Skip whitespace inside tags } else { tag.push(inner); } } result.push_str(&tag); } else { result.push(ch); } } result } /// Parse a Qwen-style `body` pseudo-XML element. /// Returns `(value, body, rest)` on success. fn parse_qwen_tag<'a>(s: &'a str, tag: &str) -> Option<(&'a str, &'a str, &'a str)> { let open = format!("<{}=", tag); let close = format!("", tag); let start = s.find(&open)? + open.len(); let name_end = start + s[start..].find('>')?; let body_start = name_end + 1; let body_end = body_start + s[body_start..].find(&close)?; Some(( s[start..name_end].trim(), s[body_start..body_end].trim(), &s[body_end + close.len()..], )) } /// Parse Qwen's XML tool call format. 
fn parse_xml_tool_call(body: &str, counter: &mut u32) -> Option { let (func_name, func_body, _) = parse_qwen_tag(body, "function")?; let func_name = func_name.to_string(); let mut args = serde_json::Map::new(); let mut rest = func_body; while let Some((key, val, remainder)) = parse_qwen_tag(rest, "parameter") { args.insert(key.to_string(), serde_json::Value::String(val.to_string())); rest = remainder; } *counter += 1; Some(ToolCall { id: format!("leaked_{}", counter), call_type: "function".to_string(), function: FunctionCall { name: func_name, arguments: serde_json::to_string(&args).unwrap_or_default(), }, }) } /// Parse JSON tool call format (some models emit this). fn parse_json_tool_call(body: &str, counter: &mut u32) -> Option { let v: serde_json::Value = serde_json::from_str(body).ok()?; let name = v["name"].as_str()?; let arguments = &v["arguments"]; *counter += 1; Some(ToolCall { id: format!("leaked_{}", counter), call_type: "function".to_string(), function: FunctionCall { name: name.to_string(), arguments: serde_json::to_string(arguments).unwrap_or_default(), }, }) } /// Strip tool call XML and thinking tokens from text so the conversation /// history stays clean. Removes `...` blocks and /// `` tags (thinking content before them is kept — it's useful context). fn strip_leaked_artifacts(text: &str) -> String { let normalized = normalize_xml_tags(text); let mut result = normalized.clone(); // Remove ... 
blocks while let Some(start) = result.find("") { if let Some(end_pos) = result[start..].find("") { let end = start + end_pos + "".len(); result = format!("{}{}", &result[..start], &result[end..]); } else { break; } } // Remove tags (but keep the thinking text before them) result = result.replace("", ""); result.trim().to_string() } #[cfg(test)] mod tests { use super::*; #[test] fn test_leaked_tool_call_clean() { let text = "thinking\n\n\n\npoc-memory used core-personality\n\n"; let calls = parse_leaked_tool_calls(text); assert_eq!(calls.len(), 1); assert_eq!(calls[0].function.name, "bash"); let args: serde_json::Value = serde_json::from_str(&calls[0].function.arguments).unwrap(); assert_eq!(args["command"], "poc-memory used core-personality"); } #[test] fn test_leaked_tool_call_streamed_whitespace() { // Streaming tokenizer splits XML tags across tokens with newlines let text = "\n<\nfunction\n=\nbash\n>\n<\nparameter\n=\ncommand\n>pwd\n\n"; let calls = parse_leaked_tool_calls(text); assert_eq!(calls.len(), 1, "should parse streamed format"); assert_eq!(calls[0].function.name, "bash"); let args: serde_json::Value = serde_json::from_str(&calls[0].function.arguments).unwrap(); assert_eq!(args["command"], "pwd"); } #[test] fn test_normalize_preserves_content() { let text = "\necho hello world\n"; let normalized = normalize_xml_tags(text); // Newlines between tags are not inside tags, so preserved assert_eq!(normalized, "\necho hello world\n"); } #[test] fn test_normalize_strips_tag_internal_whitespace() { let text = "<\nfunction\n=\nbash\n>"; let normalized = normalize_xml_tags(text); assert_eq!(normalized, ""); } }