The working_stack tool was defined in tools/mod.rs but implemented in agent.rs as Agent::handle_working_stack(). This orphaned the tool from the rest of the tool infrastructure. Move the implementation to tools/working_stack.rs so it follows the same pattern as other tools. The tool still needs special handling in agent.rs because it requires mutable access to context state, but the implementation is now in the right place. Changes: - Created tools/working_stack.rs with handle() and format_stack() - Updated tools/mod.rs to use working_stack::definition() - Removed handle_working_stack() and format_stack() from Agent - Agent now calls tools::working_stack::handle() directly
1698 lines
62 KiB
Rust
// agent.rs — Core agent loop
|
|
//
|
|
// The simplest possible implementation of the agent pattern:
|
|
// send messages + tool definitions to the model, if it responds
|
|
// with tool calls then dispatch them and loop, if it responds
|
|
// with text then display it and wait for the next prompt.
|
|
//
|
|
// Uses streaming by default so text tokens appear as they're
|
|
// generated. Tool calls are accumulated from stream deltas and
|
|
// dispatched after the stream completes.
|
|
//
|
|
// The DMN (dmn.rs) is the outer loop that decides what prompts
|
|
// to send here. This module just handles single turns: prompt
|
|
// in, response out, tool calls dispatched.
|
|
|
|
use anyhow::Result;
|
|
use chrono::{DateTime, Utc};
|
|
use tiktoken_rs::CoreBPE;
|
|
|
|
use std::io::Write;
|
|
use std::process::{Command, Stdio};
|
|
|
|
use crate::api::ApiClient;
|
|
use crate::journal;
|
|
use crate::log::ConversationLog;
|
|
use crate::tools;
|
|
use crate::tools::ProcessTracker;
|
|
use crate::types::*;
|
|
use crate::ui_channel::{ContextSection, SharedContextState, StatusInfo, StreamTarget, UiMessage, UiSender};
|
|
|
|
/// Result of a single agent turn.
///
/// Produced by `Agent::turn` once the model settles on a final text
/// response (no further tool calls to dispatch).
pub struct TurnResult {
    /// The text response (already sent through UI channel).
    #[allow(dead_code)]
    pub text: String,
    /// Whether the model called yield_to_user during this turn.
    pub yield_requested: bool,
    /// Whether any tools (other than yield_to_user) were called.
    pub had_tool_calls: bool,
    /// Number of tool calls that returned errors this turn.
    pub tool_errors: u32,
    /// Model name to switch to after this turn completes.
    pub model_switch: Option<String>,
    /// Agent requested DMN pause (full stop on autonomous behavior).
    pub dmn_pause: bool,
}
|
|
|
|
/// Accumulated state across tool dispatches within a single turn.
///
/// `dispatch_tool_call` folds each tool's outcome into this struct;
/// `turn` copies the final values into the returned `TurnResult`.
struct DispatchState {
    /// Set when a tool reports `is_yield` (yield_to_user).
    yield_requested: bool,
    /// Set when any non-yield tool runs.
    had_tool_calls: bool,
    /// Count of tool results whose text starts with "Error:".
    tool_errors: u32,
    /// Most recent model-switch request from a tool, if any.
    model_switch: Option<String>,
    /// Set when a tool requests a DMN pause.
    dmn_pause: bool,
}
|
|
|
|
/// Mutable context state — the structured regions of the context window.
///
/// Each field is a different dimension of awareness. The struct renders
/// itself to text for inclusion in the context message sent to the model.
/// Tools can update individual fields mid-session.
#[derive(Debug, Clone)]
pub struct ContextState {
    /// System prompt (identity, instructions, loaded from prompt file).
    pub system_prompt: String,
    /// Identity files: (filename, contents). Transparent structure for
    /// debug inspection and per-file budget control.
    pub personality: Vec<(String, String)>,
    /// Journal entries rendered as text — bridges old conversation.
    pub journal: String,
    /// Working stack — what the agent is currently doing.
    /// Top of stack (last element) is the current focus.
    pub working_stack: Vec<String>,
}

/// Path to working stack instructions, included in context before the stack state.
const WORKING_STACK_INSTRUCTIONS: &str = "/home/kent/.config/poc-agent/working-stack.md";

/// Path to persisted working stack state.
const WORKING_STACK_FILE: &str = "/home/kent/.claude/memory/working-stack.json";

impl ContextState {
    /// Render the context message for the model. Personality + working stack.
    /// Journal is rendered separately as its own message in the conversation.
    pub fn render_context_message(&self) -> String {
        // Each identity file becomes its own "## <name>" section.
        let mut sections: Vec<String> = Vec::with_capacity(self.personality.len() + 1);
        for (name, content) in &self.personality {
            sections.push(format!("## {}\n\n{}", name, content));
        }

        // The working-stack section is always present: instructions
        // (best-effort file read — empty if missing) followed by the
        // current stack state.
        let mut stack = std::fs::read_to_string(WORKING_STACK_INSTRUCTIONS).unwrap_or_default();
        stack.push_str("\n## Current stack\n\n");
        if self.working_stack.is_empty() {
            stack.push_str("(empty)\n");
        } else {
            let top = self.working_stack.len() - 1;
            for (i, item) in self.working_stack.iter().enumerate() {
                let line = if i == top {
                    // Top of stack (last element) is the current focus.
                    format!("→ {}\n", item)
                } else {
                    format!("  [{}] {}\n", i, item)
                };
                stack.push_str(&line);
            }
        }
        sections.push(stack);

        sections.join("\n\n---\n\n")
    }
}
|
|
|
|
/// Breakdown of context window usage by category, in tokens.
///
/// Categories:
///   id   — static identity context (system prompt + CLAUDE.md + memory files)
///   mem  — dynamically recalled content from poc-memory (future)
///   jnl  — journal entries bridging old conversation
///   conv — raw recent conversation messages
///   free — unused context window (headroom before compaction)
///
/// Token estimates are derived from char proportions scaled by the
/// API-reported prompt_tokens count. Before the first API call, uses
/// chars/4 as a rough approximation.
#[derive(Debug, Clone, Default)]
pub struct ContextBudget {
    pub identity_tokens: usize,
    pub memory_tokens: usize,
    pub journal_tokens: usize,
    pub conversation_tokens: usize,
    /// Model's context window size in tokens.
    pub window_tokens: usize,
}

impl ContextBudget {
    /// Total tokens consumed across all tracked categories.
    pub fn used(&self) -> usize {
        [
            self.identity_tokens,
            self.memory_tokens,
            self.journal_tokens,
            self.conversation_tokens,
        ]
        .iter()
        .sum()
    }

    /// Remaining headroom before the window is full. Saturates at
    /// zero rather than underflowing when usage exceeds the window.
    pub fn free(&self) -> usize {
        self.window_tokens.saturating_sub(self.used())
    }

    /// Format as a compact status string with percentages of the token window.
    /// Non-zero values always show at least 1%.
    pub fn status_string(&self) -> String {
        if self.window_tokens == 0 {
            // No window configured yet — nothing meaningful to show.
            return String::new();
        }
        let pct = |tokens: usize| -> usize {
            match tokens {
                0 => 0,
                // Round down, but never hide a non-zero category as 0%.
                n => ((n * 100) / self.window_tokens).max(1),
            }
        };
        format!(
            "id:{}% mem:{}% jnl:{}% conv:{}% free:{}%",
            pct(self.identity_tokens),
            pct(self.memory_tokens),
            pct(self.journal_tokens),
            pct(self.conversation_tokens),
            pct(self.free()),
        )
    }
}
|
|
|
|
/// The agent: owns the API client, the in-context message history,
/// and all mutable context state for one long-running session.
pub struct Agent {
    /// API client for the current model endpoint.
    client: ApiClient,
    /// Full in-context history: system + context messages + conversation.
    messages: Vec<Message>,
    /// Tool definitions advertised to the model on every request.
    tool_defs: Vec<ToolDef>,
    /// Last known prompt token count from the API (tracks context size).
    last_prompt_tokens: u32,
    /// Shared process tracker for bash tool — lets TUI show/kill running commands.
    pub process_tracker: ProcessTracker,
    /// Current reasoning effort level ("none", "low", "high").
    pub reasoning_effort: String,
    /// Persistent conversation log — append-only record of all messages.
    conversation_log: Option<ConversationLog>,
    /// Current context window budget breakdown.
    pub context_budget: ContextBudget,
    /// BPE tokenizer for token counting (cl100k_base — close enough
    /// for Claude and Qwen budget allocation, ~85-90% count accuracy).
    tokenizer: CoreBPE,
    /// Mutable context state — personality, working stack, etc.
    pub context: ContextState,
    /// Shared live context summary — TUI reads this directly for debug screen.
    pub shared_context: SharedContextState,
    /// Stable session ID for memory-search dedup across turns.
    session_id: String,
}
|
|
|
|
impl Agent {
|
|
    /// Construct a new agent and assemble its initial context window.
    ///
    /// Order matters: journal and working stack are loaded from disk
    /// first so the rendered context message reflects them, then the
    /// window is built as system prompt → context message → journal.
    pub fn new(
        client: ApiClient,
        system_prompt: String,
        personality: Vec<(String, String)>,
        conversation_log: Option<ConversationLog>,
        shared_context: SharedContextState,
    ) -> Self {
        let tool_defs = tools::definitions();
        // Failure here is a build/packaging defect, not a runtime
        // condition — panicking is appropriate.
        let tokenizer = tiktoken_rs::cl100k_base()
            .expect("failed to load cl100k_base tokenizer");

        let context = ContextState {
            system_prompt: system_prompt.clone(),
            personality,
            journal: String::new(),
            working_stack: Vec::new(),
        };
        // Timestamp-based ID, stable for the life of this process.
        let session_id = format!("poc-agent-{}", chrono::Utc::now().format("%Y%m%d-%H%M%S"));
        let mut agent = Self {
            client,
            messages: Vec::new(),
            tool_defs,
            last_prompt_tokens: 0,
            process_tracker: ProcessTracker::new(),
            reasoning_effort: "none".to_string(),
            conversation_log,
            context_budget: ContextBudget::default(),
            tokenizer,
            context,
            shared_context,
            session_id,
        };

        // Load recent journal entries at startup for orientation
        agent.load_startup_journal();
        agent.load_working_stack();

        // Context-only messages — not logged; rebuilt on every
        // startup/compaction (see push_context).
        agent.push_context(Message::system(system_prompt));
        let rendered = agent.context.render_context_message();
        if !rendered.is_empty() {
            agent.push_context(Message::user(rendered));
        }
        if !agent.context.journal.is_empty() {
            agent.push_context(Message::user(agent.context.journal.clone()));
        }
        agent.measure_budget();
        agent.publish_context_state();
        agent
    }
|
|
|
|
/// Run poc-hook for a given event, returning any output to inject.
|
|
fn run_hook(&self, event: &str, prompt: &str) -> Option<String> {
|
|
let transcript_path = self.conversation_log.as_ref()
|
|
.map(|l| l.path().to_string_lossy().to_string())
|
|
.unwrap_or_default();
|
|
|
|
let hook_input = serde_json::json!({
|
|
"hook_event_name": event,
|
|
"session_id": self.session_id,
|
|
"transcript_path": transcript_path,
|
|
"prompt": prompt,
|
|
});
|
|
|
|
let mut child = Command::new("poc-hook")
|
|
.stdin(Stdio::piped())
|
|
.stdout(Stdio::piped())
|
|
.stderr(Stdio::null())
|
|
.spawn()
|
|
.ok()?;
|
|
|
|
if let Some(ref mut stdin) = child.stdin {
|
|
let _ = stdin.write_all(hook_input.to_string().as_bytes());
|
|
}
|
|
drop(child.stdin.take());
|
|
|
|
let output = child.wait_with_output().ok()?;
|
|
let text = String::from_utf8_lossy(&output.stdout).to_string();
|
|
if text.trim().is_empty() {
|
|
None
|
|
} else {
|
|
Some(text)
|
|
}
|
|
}
|
|
|
|
/// Push a conversation message — stamped and logged.
|
|
fn push_message(&mut self, mut msg: Message) {
|
|
msg.stamp();
|
|
if let Some(ref log) = self.conversation_log {
|
|
if let Err(e) = log.append(&msg) {
|
|
eprintln!("warning: failed to log message: {:#}", e);
|
|
}
|
|
}
|
|
self.messages.push(msg);
|
|
}
|
|
|
|
    /// Push a context-only message (system prompt, identity context,
    /// journal summaries). Not logged — these are reconstructed on
    /// every startup/compaction.
    fn push_context(&mut self, msg: Message) {
        self.messages.push(msg);
    }
|
|
|
|
    /// Measure context window usage by category. Uses the BPE tokenizer
    /// for direct token counting (no chars/4 approximation).
    ///
    /// Classification is positional: messages before the first
    /// assistant/tool (or non-identity user) message count as identity
    /// or journal; once the conversation starts, everything after
    /// counts as conversation.
    fn measure_budget(&mut self) {
        let mut id_tokens: usize = 0;
        // Dynamically recalled memory — not populated yet (future).
        let mem_tokens: usize = 0;
        let mut jnl_tokens: usize = 0;
        let mut conv_tokens: usize = 0;
        let mut in_conversation = false;

        for msg in &self.messages {
            let tokens = msg_token_count(&self.tokenizer, msg);

            // Once conversation has started, classification is settled.
            if in_conversation {
                conv_tokens += tokens;
                continue;
            }

            match msg.role {
                Role::System => id_tokens += tokens,
                Role::User => {
                    let text = msg.content_text();
                    // Journal bridge messages carry known prefixes.
                    if text.starts_with("[Earlier in this conversation") {
                        jnl_tokens += tokens;
                    } else if text.starts_with("Your context was just rebuilt") {
                        jnl_tokens += tokens;
                    } else if jnl_tokens == 0 && conv_tokens == 0 {
                        // Static identity context (before any journal/conversation)
                        id_tokens += tokens;
                    } else {
                        // First real user prompt — conversation begins.
                        in_conversation = true;
                        conv_tokens += tokens;
                    }
                }
                _ => {
                    // Assistant/tool messages always mark the start of
                    // the conversation region.
                    in_conversation = true;
                    conv_tokens += tokens;
                }
            }
        }

        self.context_budget = ContextBudget {
            identity_tokens: id_tokens,
            memory_tokens: mem_tokens,
            journal_tokens: jnl_tokens,
            conversation_tokens: conv_tokens,
            window_tokens: model_context_window(&self.client.model),
        };
    }
|
|
|
|
    /// Send a user message and run the agent loop until the model
    /// produces a text response (no more tool calls). Streams text
    /// and tool activity through the UI channel.
    ///
    /// Recovery behavior (each capped at 2 attempts): context overflow
    /// → compact and retry; stream error → backoff and retry; empty
    /// response → inject a nudge message and retry.
    pub async fn turn(
        &mut self,
        user_input: &str,
        ui_tx: &UiSender,
        target: StreamTarget,
    ) -> Result<TurnResult> {
        // Run poc-hook (memory search, notifications, context check)
        if let Some(hook_output) = self.run_hook("UserPromptSubmit", user_input) {
            // Hook output rides along inside a <system-reminder> block.
            let enriched = format!("{}\n\n<system-reminder>\n{}\n</system-reminder>",
                user_input, hook_output);
            self.push_message(Message::user(enriched));
        } else {
            self.push_message(Message::user(user_input));
        }

        let mut overflow_retries: u32 = 0;
        let mut empty_retries: u32 = 0;
        let mut ds = DispatchState {
            yield_requested: false,
            had_tool_calls: false,
            tool_errors: 0,
            model_switch: None,
            dmn_pause: false,
        };

        loop {
            let _ = ui_tx.send(UiMessage::Activity("thinking...".into()));
            let api_result = self
                .client
                .chat_completion_stream(
                    &self.messages,
                    Some(&self.tool_defs),
                    ui_tx,
                    target,
                    &self.reasoning_effort,
                )
                .await;

            // Context overflow → compact and retry (max 2 attempts)
            // Stream error → retry with backoff (max 2 attempts)
            let (msg, usage) = match api_result {
                Err(e) if is_context_overflow(&e) && overflow_retries < 2 => {
                    overflow_retries += 1;
                    let _ = ui_tx.send(UiMessage::Info(format!(
                        "[context overflow — compacting and retrying ({}/2)]",
                        overflow_retries,
                    )));
                    self.emergency_compact();
                    continue;
                }
                Err(e) if is_stream_error(&e) && empty_retries < 2 => {
                    empty_retries += 1;
                    let _ = ui_tx.send(UiMessage::Info(format!(
                        "[stream error: {} — retrying ({}/2)]",
                        e, empty_retries,
                    )));
                    tokio::time::sleep(std::time::Duration::from_secs(2)).await;
                    continue;
                }
                // Unrecoverable error or success — propagate/unpack.
                other => other?,
            };

            // Strip ephemeral tool calls (journal) that the API has
            // now processed. They're persisted to disk; no need to keep
            // them in the conversation history burning tokens.
            self.strip_ephemeral_tool_calls();

            // Refresh budget + status line from API-reported usage.
            if let Some(usage) = &usage {
                self.last_prompt_tokens = usage.prompt_tokens;
                self.measure_budget();
                self.publish_context_state();
                let _ = ui_tx.send(UiMessage::StatusUpdate(StatusInfo {
                    dmn_state: String::new(), // filled by main loop
                    dmn_turns: 0,
                    dmn_max_turns: 0,
                    prompt_tokens: usage.prompt_tokens,
                    completion_tokens: usage.completion_tokens,
                    model: self.client.model.clone(),
                    turn_tools: 0, // tracked by TUI from ToolCall messages
                    context_budget: self.context_budget.status_string(),
                }));
            }

            // Empty response — model returned finish=stop with no content
            // or tool calls. Inject a nudge so the retry has different input.
            let has_content = msg.content.is_some();
            let has_tools = msg.tool_calls.as_ref().map_or(false, |tc| !tc.is_empty());
            if !has_content && !has_tools {
                if empty_retries < 2 {
                    empty_retries += 1;
                    let _ = ui_tx.send(UiMessage::Debug(format!(
                        "empty response, injecting nudge and retrying ({}/2)",
                        empty_retries,
                    )));
                    self.push_message(Message::user(
                        "[system] Your previous response was empty. \
                         Please respond with text or use a tool."
                    ));
                    continue;
                }
                // After max retries, fall through — return the empty response
            } else {
                empty_retries = 0;
            }

            // Structured tool calls from the API
            if let Some(ref tool_calls) = msg.tool_calls {
                if !tool_calls.is_empty() {
                    self.push_message(msg.clone());
                    for call in tool_calls {
                        self.dispatch_tool_call(call, None, ui_tx, &mut ds)
                            .await;
                    }
                    continue;
                }
            }

            // No structured tool calls — check for leaked tool calls
            // (Qwen sometimes outputs <tool_call> XML as text).
            let text = msg.content_text().to_string();
            let leaked = parse_leaked_tool_calls(&text);

            if !leaked.is_empty() {
                let _ = ui_tx.send(UiMessage::Debug(format!(
                    "recovered {} leaked tool call(s) from text",
                    leaked.len()
                )));
                // Strip tool call XML and thinking tokens from the message
                // so they don't clutter the conversation history.
                let cleaned = strip_leaked_artifacts(&text);
                let mut clean_msg = msg.clone();
                clean_msg.content = if cleaned.trim().is_empty() {
                    None
                } else {
                    Some(MessageContent::Text(cleaned))
                };
                self.push_message(clean_msg);
                for call in &leaked {
                    self.dispatch_tool_call(call, Some("recovered"), ui_tx, &mut ds)
                        .await;
                }
                continue;
            }

            // Genuinely text-only response — clear the activity line,
            // record the message, and hand the result back.
            let _ = ui_tx.send(UiMessage::Activity(String::new()));
            self.push_message(msg);

            return Ok(TurnResult {
                text,
                yield_requested: ds.yield_requested,
                had_tool_calls: ds.had_tool_calls,
                tool_errors: ds.tool_errors,
                model_switch: ds.model_switch,
                dmn_pause: ds.dmn_pause,
            });
        }
    }
|
|
|
|
    /// Dispatch a single tool call: send UI annotations, run the tool,
    /// push results into the conversation, handle images.
    ///
    /// `tag` is an optional UI label (e.g. "recovered" for tool calls
    /// parsed out of leaked XML). Outcomes accumulate into `ds` for
    /// the enclosing turn.
    async fn dispatch_tool_call(
        &mut self,
        call: &ToolCall,
        tag: Option<&str>,
        ui_tx: &UiSender,
        ds: &mut DispatchState,
    ) {
        // Malformed JSON arguments degrade to an empty value rather
        // than aborting the dispatch.
        let args: serde_json::Value =
            serde_json::from_str(&call.function.arguments).unwrap_or_default();

        let args_summary = summarize_args(&call.function.name, &args);
        let label = match tag {
            Some(t) => format!("calling: {} ({})", call.function.name, t),
            None => format!("calling: {}", call.function.name),
        };
        let _ = ui_tx.send(UiMessage::Activity(label));
        let _ = ui_tx.send(UiMessage::ToolCall {
            name: call.function.name.clone(),
            args_summary: args_summary.clone(),
        });
        let _ = ui_tx.send(UiMessage::ToolStarted {
            id: call.id.clone(),
            name: call.function.name.clone(),
            detail: args_summary,
        });

        // Handle working_stack tool — needs &mut self for context state
        if call.function.name == "working_stack" {
            let result = tools::working_stack::handle(&args, &mut self.context.working_stack);
            // Synthesize a ToolOutput so the UI/result path below
            // mirrors the normal dispatch path.
            let output = tools::ToolOutput {
                text: result.clone(),
                is_yield: false,
                images: Vec::new(),
                model_switch: None,
                dmn_pause: false,
            };
            let _ = ui_tx.send(UiMessage::ToolResult {
                name: call.function.name.clone(),
                result: output.text.clone(),
            });
            let _ = ui_tx.send(UiMessage::ToolFinished { id: call.id.clone() });
            self.push_message(Message::tool_result(&call.id, &output.text));
            ds.had_tool_calls = true;

            // Re-render the context message so the model sees the updated stack
            if !result.starts_with("Error:") {
                self.refresh_context_message();
            }
            return;
        }

        // All other tools go through the shared dispatcher.
        let output =
            tools::dispatch(&call.function.name, &args, &self.process_tracker).await;

        // Fold the tool's outcome into this turn's dispatch state.
        if output.is_yield {
            ds.yield_requested = true;
        } else {
            ds.had_tool_calls = true;
        }
        if output.model_switch.is_some() {
            ds.model_switch = output.model_switch;
        }
        if output.dmn_pause {
            ds.dmn_pause = true;
        }
        if output.text.starts_with("Error:") {
            ds.tool_errors += 1;
        }

        let _ = ui_tx.send(UiMessage::ToolResult {
            name: call.function.name.clone(),
            result: output.text.clone(),
        });
        let _ = ui_tx.send(UiMessage::ToolFinished { id: call.id.clone() });

        self.push_message(Message::tool_result(&call.id, &output.text));

        if !output.images.is_empty() {
            // Only one live image in context at a time — age out any
            // previous ones to avoid accumulating ~90KB+ per image.
            self.age_out_images();
            self.push_message(Message::user_with_images(
                "Here is the image you requested:",
                &output.images,
            ));
        }
    }
|
|
|
|
/// Build context state summary for the debug screen.
|
|
pub fn context_state_summary(&self) -> Vec<ContextSection> {
|
|
let count = |s: &str| self.tokenizer.encode_with_special_tokens(s).len();
|
|
|
|
let mut sections = Vec::new();
|
|
|
|
// System prompt
|
|
sections.push(ContextSection {
|
|
name: "System prompt".into(),
|
|
tokens: count(&self.context.system_prompt),
|
|
content: self.context.system_prompt.clone(),
|
|
children: Vec::new(),
|
|
});
|
|
|
|
// Personality — parent with file children
|
|
let personality_children: Vec<ContextSection> = self.context.personality.iter()
|
|
.map(|(name, content)| ContextSection {
|
|
name: name.clone(),
|
|
tokens: count(content),
|
|
content: content.clone(),
|
|
children: Vec::new(),
|
|
})
|
|
.collect();
|
|
let personality_tokens: usize = personality_children.iter().map(|c| c.tokens).sum();
|
|
sections.push(ContextSection {
|
|
name: format!("Personality ({} files)", personality_children.len()),
|
|
tokens: personality_tokens,
|
|
content: String::new(),
|
|
children: personality_children,
|
|
});
|
|
|
|
// Journal — split into per-entry children
|
|
{
|
|
let mut journal_children = Vec::new();
|
|
let mut current_header = String::new();
|
|
let mut current_body = String::new();
|
|
for line in self.context.journal.lines() {
|
|
if line.starts_with("## ") {
|
|
if !current_header.is_empty() {
|
|
let body = std::mem::take(&mut current_body);
|
|
let preview: String = body.lines().next().unwrap_or("").chars().take(60).collect();
|
|
journal_children.push(ContextSection {
|
|
name: format!("{}: {}", current_header, preview),
|
|
tokens: count(&body),
|
|
content: body,
|
|
children: Vec::new(),
|
|
});
|
|
}
|
|
current_header = line.trim_start_matches("## ").to_string();
|
|
current_body.clear();
|
|
} else {
|
|
if !current_body.is_empty() || !line.is_empty() {
|
|
current_body.push_str(line);
|
|
current_body.push('\n');
|
|
}
|
|
}
|
|
}
|
|
if !current_header.is_empty() {
|
|
let preview: String = current_body.lines().next().unwrap_or("").chars().take(60).collect();
|
|
journal_children.push(ContextSection {
|
|
name: format!("{}: {}", current_header, preview),
|
|
tokens: count(¤t_body),
|
|
content: current_body,
|
|
children: Vec::new(),
|
|
});
|
|
}
|
|
let journal_tokens: usize = journal_children.iter().map(|c| c.tokens).sum();
|
|
sections.push(ContextSection {
|
|
name: format!("Journal ({} entries)", journal_children.len()),
|
|
tokens: journal_tokens,
|
|
content: String::new(),
|
|
children: journal_children,
|
|
});
|
|
}
|
|
|
|
// Working stack — instructions + items as children
|
|
let instructions = std::fs::read_to_string(WORKING_STACK_INSTRUCTIONS)
|
|
.unwrap_or_default();
|
|
let mut stack_children = vec![ContextSection {
|
|
name: "Instructions".into(),
|
|
tokens: count(&instructions),
|
|
content: instructions,
|
|
children: Vec::new(),
|
|
}];
|
|
for (i, item) in self.context.working_stack.iter().enumerate() {
|
|
let marker = if i == self.context.working_stack.len() - 1 { "→" } else { " " };
|
|
stack_children.push(ContextSection {
|
|
name: format!("{} [{}] {}", marker, i, item),
|
|
tokens: count(item),
|
|
content: String::new(),
|
|
children: Vec::new(),
|
|
});
|
|
}
|
|
let stack_tokens: usize = stack_children.iter().map(|c| c.tokens).sum();
|
|
sections.push(ContextSection {
|
|
name: format!("Working stack ({} items)", self.context.working_stack.len()),
|
|
tokens: stack_tokens,
|
|
content: String::new(),
|
|
children: stack_children,
|
|
});
|
|
|
|
// Conversation — each message as a child
|
|
let conv_start = self.messages.iter()
|
|
.position(|m| m.role == Role::Assistant || m.role == Role::Tool)
|
|
.unwrap_or(self.messages.len());
|
|
let conv_messages = &self.messages[conv_start..];
|
|
let conv_children: Vec<ContextSection> = conv_messages.iter().enumerate()
|
|
.map(|(i, msg)| {
|
|
let text = msg.content.as_ref()
|
|
.map(|c| c.as_text().to_string())
|
|
.unwrap_or_default();
|
|
let tool_info = msg.tool_calls.as_ref().map(|tc| {
|
|
tc.iter()
|
|
.map(|c| c.function.name.clone())
|
|
.collect::<Vec<_>>()
|
|
.join(", ")
|
|
});
|
|
let label = match (&msg.role, &tool_info) {
|
|
(_, Some(tools)) => format!("[tool_call: {}]", tools),
|
|
_ => {
|
|
let preview: String = text.chars().take(60).collect();
|
|
let preview = preview.replace('\n', " ");
|
|
if text.len() > 60 { format!("{}...", preview) } else { preview }
|
|
}
|
|
};
|
|
let tokens = count(&text);
|
|
let role_name = match msg.role {
|
|
Role::Assistant => "PoC",
|
|
Role::User => "Kent",
|
|
Role::Tool => "tool",
|
|
Role::System => "system",
|
|
};
|
|
ContextSection {
|
|
name: format!("[{}] {}: {}", conv_start + i, role_name, label),
|
|
tokens,
|
|
content: text,
|
|
children: Vec::new(),
|
|
}
|
|
})
|
|
.collect();
|
|
let conv_tokens: usize = conv_children.iter().map(|c| c.tokens).sum();
|
|
sections.push(ContextSection {
|
|
name: format!("Conversation ({} messages)", conv_children.len()),
|
|
tokens: conv_tokens,
|
|
content: String::new(),
|
|
children: conv_children,
|
|
});
|
|
|
|
sections
|
|
}
|
|
|
|
    /// Load recent journal entries at startup for orientation.
    /// Uses the same budget logic as compaction but with empty conversation.
    /// Only parses the tail of the journal file (last 64KB) for speed.
    fn load_startup_journal(&mut self) {
        let journal_path = journal::default_journal_path();
        let entries = journal::parse_journal_tail(&journal_path, 64 * 1024);
        if entries.is_empty() {
            // No journal yet — leave self.context.journal empty.
            return;
        }

        // Token counter for the budget planner.
        let count = |s: &str| self.tokenizer.encode_with_special_tokens(s).len();
        let context_message = self.context.render_context_message();

        // Plan how many entries fit alongside the static context.
        let plan = plan_context(
            &self.context.system_prompt,
            &context_message,
            &[], // no conversation yet
            &entries,
            &self.client.model,
            &count,
        );

        self.context.journal = render_journal_text(&entries, &plan);
    }
|
|
|
|
/// Re-render the context message in self.messages from live ContextState.
|
|
/// Called after any change to context state (working stack, etc).
|
|
fn refresh_context_message(&mut self) {
|
|
let rendered = self.context.render_context_message();
|
|
// The context message is the first user message (index 1, after system prompt)
|
|
if self.messages.len() >= 2 && self.messages[1].role == Role::User {
|
|
self.messages[1] = Message::user(rendered);
|
|
}
|
|
self.publish_context_state();
|
|
self.save_working_stack();
|
|
}
|
|
|
|
/// Persist working stack to disk.
|
|
fn save_working_stack(&self) {
|
|
if let Ok(json) = serde_json::to_string(&self.context.working_stack) {
|
|
let _ = std::fs::write(WORKING_STACK_FILE, json);
|
|
}
|
|
}
|
|
|
|
/// Load working stack from disk.
|
|
fn load_working_stack(&mut self) {
|
|
if let Ok(data) = std::fs::read_to_string(WORKING_STACK_FILE) {
|
|
if let Ok(stack) = serde_json::from_str::<Vec<String>>(&data) {
|
|
self.context.working_stack = stack;
|
|
}
|
|
}
|
|
}
|
|
|
|
    /// Push the current context summary to the shared state for the TUI to read.
    ///
    /// A poisoned lock is silently skipped — the TUI just keeps showing
    /// the previous snapshot.
    fn publish_context_state(&self) {
        if let Ok(mut state) = self.shared_context.write() {
            *state = self.context_state_summary();
        }
    }
|
|
|
|
    /// Replace base64 image data in older messages with text placeholders.
    /// Only the most recent image stays live — each new image ages out
    /// all previous ones. The tool result message (right before each image
    /// message) already records what was loaded, so no info is lost.
    fn age_out_images(&mut self) {
        for msg in &mut self.messages {
            if let Some(MessageContent::Parts(parts)) = &msg.content {
                let has_images = parts.iter().any(|p| matches!(p, ContentPart::ImageUrl { .. }));
                if !has_images {
                    // Text-only multipart message — nothing to age out.
                    continue;
                }
                // Rebuild the message as plain text: keep text parts,
                // replace each image with a placeholder. Newlines are
                // inserted only between pieces (not before the first),
                // preserving the original concatenation exactly.
                let mut replacement = String::new();
                for part in parts {
                    match part {
                        ContentPart::Text { text } => {
                            if !replacement.is_empty() {
                                replacement.push('\n');
                            }
                            replacement.push_str(text);
                        }
                        ContentPart::ImageUrl { .. } => {
                            if !replacement.is_empty() {
                                replacement.push('\n');
                            }
                            replacement.push_str(
                                "[image aged out — see tool result above for details]",
                            );
                        }
                    }
                }
                msg.content = Some(MessageContent::Text(replacement));
            }
        }
    }
|
|
|
|
/// Strip ephemeral tool calls from the conversation history.
|
|
///
|
|
/// Ephemeral tools (like journal) persist their output to disk,
|
|
/// so the tool call + result don't need to stay in the context
|
|
/// window. We keep them for exactly one API round-trip (the model
|
|
/// needs to see the result was acknowledged), then strip them.
|
|
///
|
|
/// If an assistant message contains ONLY ephemeral tool calls,
|
|
/// the entire message and its tool results are removed. If mixed
|
|
/// with non-ephemeral calls, we leave it (rare case, small cost).
|
|
fn strip_ephemeral_tool_calls(&mut self) {
|
|
// Collect IDs of tool calls to strip
|
|
let mut strip_ids: Vec<String> = Vec::new();
|
|
let mut strip_msg_indices: Vec<usize> = Vec::new();
|
|
|
|
for (i, msg) in self.messages.iter().enumerate() {
|
|
if msg.role != Role::Assistant {
|
|
continue;
|
|
}
|
|
let calls = match &msg.tool_calls {
|
|
Some(c) if !c.is_empty() => c,
|
|
_ => continue,
|
|
};
|
|
|
|
let all_ephemeral = calls.iter().all(|c| {
|
|
c.function.name == tools::journal::TOOL_NAME
|
|
});
|
|
|
|
if all_ephemeral {
|
|
strip_msg_indices.push(i);
|
|
for call in calls {
|
|
strip_ids.push(call.id.clone());
|
|
}
|
|
}
|
|
}
|
|
|
|
if strip_ids.is_empty() {
|
|
return;
|
|
}
|
|
|
|
// Remove in reverse order to preserve indices
|
|
self.messages.retain(|msg| {
|
|
// Strip the assistant messages we identified
|
|
if msg.role == Role::Assistant {
|
|
if let Some(calls) = &msg.tool_calls {
|
|
if calls.iter().all(|c| strip_ids.contains(&c.id)) {
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
// Strip matching tool results
|
|
if msg.role == Role::Tool {
|
|
if let Some(ref id) = msg.tool_call_id {
|
|
if strip_ids.contains(id) {
|
|
return false;
|
|
}
|
|
}
|
|
}
|
|
true
|
|
});
|
|
}
|
|
|
|
    /// Last prompt token count reported by the API.
    ///
    /// Zero until the first API call completes, and reset to zero
    /// after compaction/restore until fresh usage arrives.
    pub fn last_prompt_tokens(&self) -> u32 {
        self.last_prompt_tokens
    }
|
|
|
|
    /// Build context window from conversation messages + journal.
    /// Used by both compact() (in-memory messages) and restore_from_log()
    /// (conversation log). The context window is always:
    ///   identity + journal summaries + raw recent messages
    ///
    /// Takes refreshed identity inputs so a compaction can also pick up
    /// edits to the prompt/personality files, then delegates to
    /// do_compact() for the rebuild.
    pub fn compact(&mut self, new_system_prompt: String, new_personality: Vec<(String, String)>) {
        self.context.system_prompt = new_system_prompt;
        self.context.personality = new_personality;
        self.do_compact();
    }
|
|
|
|
/// Internal compaction — rebuilds context window from current messages.
|
|
fn do_compact(&mut self) {
|
|
// Find where actual conversation starts (after system + context)
|
|
let conv_start = self
|
|
.messages
|
|
.iter()
|
|
.position(|m| m.role == Role::Assistant || m.role == Role::Tool)
|
|
.unwrap_or(self.messages.len());
|
|
|
|
let conversation: Vec<Message> = self.messages[conv_start..].to_vec();
|
|
let (messages, journal) = build_context_window(
|
|
&self.context,
|
|
&conversation,
|
|
&self.client.model,
|
|
&self.tokenizer,
|
|
);
|
|
self.context.journal = journal;
|
|
self.messages = messages;
|
|
self.last_prompt_tokens = 0;
|
|
self.measure_budget();
|
|
self.publish_context_state();
|
|
}
|
|
|
|
    /// Emergency compaction using stored config — called on context overflow.
    ///
    /// Unlike compact(), this reuses the identity already held in
    /// self.context rather than re-reading prompt/personality inputs.
    fn emergency_compact(&mut self) {
        self.do_compact();
    }
|
|
|
|
    /// Restore from the conversation log. Builds the context window
    /// the same way compact() does — journal summaries for old messages,
    /// raw recent messages. This is the unified startup path.
    /// Returns true if the log had content to restore.
    pub fn restore_from_log(
        &mut self,
        system_prompt: String,
        personality: Vec<(String, String)>,
    ) -> bool {
        self.context.system_prompt = system_prompt;
        self.context.personality = personality;

        // Read the tail of the log (last 512KB). Any failure or an
        // empty log means there is nothing to restore.
        let all_messages = match &self.conversation_log {
            Some(log) => match log.read_tail(512 * 1024) {
                Ok(msgs) if !msgs.is_empty() => {
                    dbglog!("[restore] read {} messages from log tail", msgs.len());
                    msgs
                }
                Ok(_) => {
                    dbglog!("[restore] log exists but is empty");
                    return false;
                }
                Err(e) => {
                    dbglog!("[restore] failed to read log: {}", e);
                    return false;
                }
            },
            None => {
                dbglog!("[restore] no conversation log configured");
                return false;
            }
        };

        // Filter out system/context messages — we only want the
        // actual conversation (user prompts, assistant responses,
        // tool calls/results)
        let conversation: Vec<Message> = all_messages
            .into_iter()
            .filter(|m| m.role != Role::System)
            .collect();
        dbglog!("[restore] {} messages after filtering system", conversation.len());

        // Rebuild the window exactly as do_compact() does.
        let (messages, journal) = build_context_window(
            &self.context,
            &conversation,
            &self.client.model,
            &self.tokenizer,
        );
        dbglog!("[restore] journal text: {} chars, {} lines",
            journal.len(), journal.lines().count());
        self.context.journal = journal;
        self.messages = messages;
        dbglog!("[restore] built context window: {} messages", self.messages.len());
        // Token count is stale until the next API call reports usage.
        self.last_prompt_tokens = 0;
        self.measure_budget();
        self.publish_context_state();
        true
    }
|
|
|
|
    /// Replace the API client (for model switching).
    /// Conversation history and context state are left untouched —
    /// only the client (and therefore the model) changes.
    pub fn swap_client(&mut self, new_client: ApiClient) {
        self.client = new_client;
    }
|
|
|
|
    /// Get the model identifier.
    /// Borrowed from the current API client, so this reflects any
    /// prior `swap_client` call.
    pub fn model(&self) -> &str {
        &self.client.model
    }
|
|
|
|
    /// Get the conversation history for persistence.
    /// Read-only view; use `messages_mut` for in-place edits.
    pub fn messages(&self) -> &[Message] {
        &self.messages
    }
|
|
|
|
    /// Mutable access to conversation history (for /retry).
    /// Callers may truncate or rewrite entries directly; no budget
    /// recomputation is triggered here.
    pub fn messages_mut(&mut self) -> &mut Vec<Message> {
        &mut self.messages
    }
|
|
|
|
    /// Restore from a saved conversation.
    /// Replaces the in-memory history wholesale; unlike
    /// `restore_from_log`, no journal/budget rebuild happens here.
    pub fn restore(&mut self, messages: Vec<Message>) {
        self.messages = messages;
    }
|
|
}
|
|
|
|
/// Look up a model's context window size in tokens.
///
/// Matching is case-insensitive and substring-based: anything that
/// mentions "opus" or "sonnet" is treated as a 200k Claude window,
/// "qwen" as 131k, and everything else falls back to 128k.
pub fn model_context_window(model: &str) -> usize {
    let name = model.to_lowercase();
    if ["opus", "sonnet"].iter().any(|kw| name.contains(kw)) {
        200_000
    } else if name.contains("qwen") {
        131_072
    } else {
        128_000
    }
}
|
|
|
|
/// Context budget in tokens: 60% of the model's context window.
/// Leaves headroom for conversation to grow before compaction triggers.
///
/// Future direction: make this dynamic based on what the agent is
/// doing — deep coding work might allocate more to conversation,
/// consolidation might allocate more to journal/memory, idle might
/// shrink everything to save cost.
fn context_budget_tokens(model: &str) -> usize {
    // Multiply before dividing: integer math, and window sizes are far
    // below usize::MAX so the product cannot overflow in practice.
    model_context_window(model) * 60 / 100
}
|
|
|
|
/// Allocation plan for the context window. Separates the budget math
/// (which entries and messages to include) from the message assembly
/// (building the actual Vec<Message>). This makes the core algorithm
/// testable and inspectable — log the plan on compaction to see exactly
/// what allocation decisions were made.
///
/// Invariant (by construction in `plan_context`):
/// `header_start <= full_start <= entry_count`, all indexing the same
/// journal-entries slice.
struct ContextPlan {
    /// Index into all_entries: header-only entries start here
    header_start: usize,
    /// Index into all_entries: full entries start here (headers end here)
    full_start: usize,
    /// Total journal entries (header-only + full go up to this)
    entry_count: usize,
    /// Index into recent conversation: skip messages before this
    conv_trim: usize,
    /// Total recent conversation messages
    conv_count: usize,
    /// Tokens used by full journal entries
    full_tokens: usize,
    /// Tokens used by header-only journal entries
    header_tokens: usize,
    /// Tokens used by conversation (after trimming)
    conv_tokens: usize,
    /// Total budget available (after identity, memory, reserve)
    available: usize,
}
|
|
|
|
/// Build a context window from conversation messages + journal entries.
/// This is the core algorithm shared by compact() and restore_from_log().
///
/// Allocation strategy: identity and memory are fixed costs. The
/// remaining budget (minus 25% reserve for model output) is split
/// between journal and conversation. Conversation gets priority —
/// it's what's happening now. Journal fills the rest, newest first.
///
/// When the budget is tight, journal entries are dropped first
/// (oldest entries go first). If conversation alone exceeds the
/// budget, oldest messages are trimmed to fit.
/// Returns (messages, journal_text) — caller stores journal_text in ContextState.
fn build_context_window(
    context: &ContextState,
    conversation: &[Message],
    model: &str,
    tokenizer: &CoreBPE,
) -> (Vec<Message>, String) {
    // Journal entries are re-read from disk on every rebuild, so the
    // window always reflects the latest journal state.
    let journal_path = journal::default_journal_path();
    let all_entries = journal::parse_journal(&journal_path);
    dbglog!("[ctx] {} journal entries from {}", all_entries.len(), journal_path.display());
    // Single token-counting closure shared by every measurement below,
    // so all costs are computed with the same tokenizer settings.
    let count = |s: &str| tokenizer.encode_with_special_tokens(s).len();

    let system_prompt = context.system_prompt.clone();
    let context_message = context.render_context_message();

    // Cap memory to 50% of the context budget so conversation always
    // gets space. Truncate at the last complete section boundary.
    // NOTE: this truncation must happen BEFORE plan_context, which
    // measures the (possibly truncated) memory as a fixed cost.
    let max_tokens = context_budget_tokens(model);
    let memory_cap = max_tokens / 2;
    let memory_tokens = count(&context_message);
    let context_message = if memory_tokens > memory_cap {
        dbglog!("[ctx] memory too large: {} tokens > {} cap, truncating", memory_tokens, memory_cap);
        truncate_at_section(&context_message, memory_cap, &count)
    } else {
        context_message
    };

    // Messages older than the newest journal entry are represented by
    // journal summaries; only the rest stay as raw conversation.
    let recent_start = find_journal_cutoff(conversation, all_entries.last());
    dbglog!("[ctx] journal cutoff: {} of {} conversation messages are 'recent'",
        conversation.len() - recent_start, conversation.len());
    let recent = &conversation[recent_start..];

    let plan = plan_context(
        &system_prompt,
        &context_message,
        recent,
        &all_entries,
        model,
        &count,
    );

    // Render journal text from the plan
    let journal_text = render_journal_text(&all_entries, &plan);
    dbglog!("[ctx] plan: header_start={} full_start={} entry_count={} conv_trim={} journal_text={} chars",
        plan.header_start, plan.full_start, plan.entry_count, plan.conv_trim, journal_text.len());

    let messages = assemble_context(
        system_prompt, context_message, &journal_text,
        recent, &plan,
    );
    (messages, journal_text)
}
|
|
|
|
/// Compute the allocation plan: how much budget goes to journal vs
/// conversation, which entries and messages to include.
///
/// Pure planning — nothing is rendered or mutated here; the result is
/// consumed by `render_journal_text` and `assemble_context`.
fn plan_context(
    system_prompt: &str,
    context_message: &str,
    recent: &[Message],
    entries: &[journal::JournalEntry],
    model: &str,
    count: &dyn Fn(&str) -> usize,
) -> ContextPlan {
    let max_tokens = context_budget_tokens(model);

    // Fixed costs — always included
    let identity_cost = count(system_prompt);
    let memory_cost = count(context_message);
    // 25% of the budget is reserved for model output.
    let reserve = max_tokens / 4;
    let available = max_tokens
        .saturating_sub(identity_cost)
        .saturating_sub(memory_cost)
        .saturating_sub(reserve);

    // Measure conversation
    let conv_costs: Vec<usize> = recent.iter().map(|m| msg_token_count_fn(m, count)).collect();
    let total_conv: usize = conv_costs.iter().sum();

    // Journal always gets at least 15% of available budget so it doesn't
    // get squeezed out by large conversations.
    let journal_min = available * 15 / 100;
    let journal_budget = available.saturating_sub(total_conv).max(journal_min);

    // Fill journal entries newest-first within budget.
    // Tiered: recent entries get full content, older entries get just
    // a header line (timestamp + first line) for timeline awareness.
    let full_budget = journal_budget * 70 / 100;
    let header_budget = journal_budget.saturating_sub(full_budget);

    // Phase 1: Full entries (newest first)
    let mut full_used = 0;
    let mut n_full = 0;
    for entry in entries.iter().rev() {
        // +10: flat per-entry overhead allowance — presumably for the
        // rendered "## <timestamp>" header; TODO confirm against
        // render_journal_text.
        let cost = count(&entry.content) + 10;
        if full_used + cost > full_budget {
            break;
        }
        full_used += cost;
        n_full += 1;
    }
    let full_start = entries.len().saturating_sub(n_full);

    // Phase 2: Header-only entries (continuing backward from where full stopped)
    let mut header_used = 0;
    let mut n_headers = 0;
    for entry in entries[..full_start].iter().rev() {
        // A header entry is summarized by its first non-blank line.
        let first_line = entry
            .content
            .lines()
            .find(|l| !l.trim().is_empty())
            .unwrap_or("(empty)");
        let cost = count(first_line) + 10;
        if header_used + cost > header_budget {
            break;
        }
        header_used += cost;
        n_headers += 1;
    }
    let header_start = full_start.saturating_sub(n_headers);

    // If conversation exceeds available budget, trim oldest messages
    let journal_used = full_used + header_used;
    let mut conv_trim = 0;
    let mut trimmed_conv = total_conv;
    while trimmed_conv + journal_used > available && conv_trim < recent.len() {
        trimmed_conv -= conv_costs[conv_trim];
        conv_trim += 1;
    }
    // Walk forward to user message boundary
    // (avoids starting the window mid tool-call/result sequence; note
    // the trimmed-token figure logged below is not adjusted for this
    // extra walk).
    while conv_trim < recent.len() && recent[conv_trim].role != Role::User {
        conv_trim += 1;
    }

    dbglog!("[plan] model={} max_tokens={} available={} (identity={} memory={} reserve={})",
        model, max_tokens, available, identity_cost, memory_cost, reserve);
    dbglog!("[plan] conv: {} msgs, {} tokens total, trimming {} msgs → {} tokens",
        recent.len(), total_conv, conv_trim, trimmed_conv);
    dbglog!("[plan] journal: {} full entries ({}t) + {} headers ({}t)",
        n_full, full_used, n_headers, header_used);

    ContextPlan {
        header_start,
        full_start,
        entry_count: entries.len(),
        conv_trim,
        conv_count: recent.len(),
        full_tokens: full_used,
        header_tokens: header_used,
        conv_tokens: trimmed_conv,
        available,
    }
}
|
|
|
|
/// Render journal entries into text from a context plan.
|
|
fn render_journal_text(
|
|
entries: &[journal::JournalEntry],
|
|
plan: &ContextPlan,
|
|
) -> String {
|
|
let has_journal = plan.header_start < plan.entry_count;
|
|
if !has_journal {
|
|
return String::new();
|
|
}
|
|
|
|
let mut text = String::from("[Earlier in this conversation — from your journal]\n\n");
|
|
|
|
// Header-only entries (older) — just timestamp + first line
|
|
for entry in &entries[plan.header_start..plan.full_start] {
|
|
let first_line = entry
|
|
.content
|
|
.lines()
|
|
.find(|l| !l.trim().is_empty())
|
|
.unwrap_or("(empty)");
|
|
text.push_str(&format!(
|
|
"## {} — {}\n",
|
|
entry.timestamp.format("%Y-%m-%dT%H:%M"),
|
|
first_line,
|
|
));
|
|
}
|
|
|
|
// Separator between headers and full entries
|
|
let n_headers = plan.full_start - plan.header_start;
|
|
let n_full = plan.entry_count - plan.full_start;
|
|
if n_headers > 0 && n_full > 0 {
|
|
text.push_str("\n---\n\n");
|
|
}
|
|
|
|
// Full entries (recent)
|
|
for entry in &entries[plan.full_start..] {
|
|
text.push_str(&format!(
|
|
"## {}\n\n{}\n\n",
|
|
entry.timestamp.format("%Y-%m-%dT%H:%M"),
|
|
entry.content
|
|
));
|
|
}
|
|
|
|
text
|
|
}
|
|
|
|
/// Assemble the context window from a plan. No allocation decisions
|
|
/// happen here — just follow the plan to build messages.
|
|
fn assemble_context(
|
|
system_prompt: String,
|
|
context_message: String,
|
|
journal_text: &str,
|
|
recent: &[Message],
|
|
plan: &ContextPlan,
|
|
) -> Vec<Message> {
|
|
let mut messages = vec![Message::system(system_prompt)];
|
|
if !context_message.is_empty() {
|
|
messages.push(Message::user(context_message));
|
|
}
|
|
|
|
let final_recent = &recent[plan.conv_trim..];
|
|
|
|
if !journal_text.is_empty() {
|
|
messages.push(Message::user(journal_text.to_string()));
|
|
} else if !final_recent.is_empty() {
|
|
messages.push(Message::user(
|
|
"Your context was just rebuilt. Memory files have been \
|
|
reloaded. Your recent conversation continues below. \
|
|
Earlier context is in your journal and memory files."
|
|
.to_string(),
|
|
));
|
|
}
|
|
|
|
messages.extend(final_recent.iter().cloned());
|
|
messages
|
|
}
|
|
|
|
/// Find the conversation index where messages are no longer covered
|
|
/// Truncate a context message to fit within a token budget. Cuts at
|
|
/// section boundaries (lines starting with `---` or `## `) to avoid
|
|
/// splitting mid-section. Drops sections from the end first since
|
|
/// earlier sections (identity, instructions) matter more.
|
|
fn truncate_at_section(text: &str, max_tokens: usize, count: &dyn Fn(&str) -> usize) -> String {
|
|
// Find section boundaries (--- separators between assembled parts)
|
|
let mut boundaries = vec![0usize];
|
|
for (i, line) in text.lines().enumerate() {
|
|
if line.trim() == "---" || line.starts_with("## ") {
|
|
// Find byte offset of this line
|
|
let offset = text.lines().take(i).map(|l| l.len() + 1).sum::<usize>();
|
|
boundaries.push(offset);
|
|
}
|
|
}
|
|
boundaries.push(text.len());
|
|
|
|
// Binary search: find the largest prefix of sections that fits
|
|
let mut best = 0;
|
|
for &end in &boundaries[1..] {
|
|
let slice = &text[..end];
|
|
if count(slice) <= max_tokens {
|
|
best = end;
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
if best == 0 {
|
|
// Even the first section doesn't fit — hard truncate
|
|
best = text.len().min(max_tokens * 3); // ~3 chars/token rough estimate
|
|
}
|
|
|
|
let truncated = &text[..best];
|
|
dbglog!("[ctx] truncated memory from {} to {} chars ({} tokens)",
|
|
text.len(), truncated.len(), count(truncated));
|
|
truncated.to_string()
|
|
}
|
|
|
|
/// Find the conversation index where messages are no longer covered
/// by journal entries. Messages before this index are summarized by
/// the journal; messages from this index onward stay as raw conversation.
/// Walks back to a user message boundary to avoid splitting tool
/// call/result sequences.
fn find_journal_cutoff(
    conversation: &[Message],
    newest_entry: Option<&journal::JournalEntry>,
) -> usize {
    // No journal at all — nothing is summarized; keep everything raw.
    let cutoff = match newest_entry {
        Some(entry) => entry.timestamp,
        None => return 0,
    };

    // First message strictly newer than the newest journal entry.
    // Messages without a parseable timestamp are skipped here.
    let mut split = conversation.len();
    for (i, msg) in conversation.iter().enumerate() {
        if let Some(ts) = parse_msg_timestamp(msg) {
            if ts > cutoff {
                split = i;
                break;
            }
        }
    }
    // Walk back to user message boundary
    while split > 0 && split < conversation.len() && conversation[split].role != Role::User {
        split -= 1;
    }
    split
}
|
|
|
|
/// Count the token footprint of a message using a token counting function.
|
|
fn msg_token_count_fn(msg: &Message, count: &dyn Fn(&str) -> usize) -> usize {
|
|
let content = msg.content.as_ref().map_or(0, |c| match c {
|
|
MessageContent::Text(s) => count(s),
|
|
MessageContent::Parts(parts) => parts
|
|
.iter()
|
|
.map(|p| match p {
|
|
ContentPart::Text { text } => count(text),
|
|
ContentPart::ImageUrl { .. } => 85,
|
|
})
|
|
.sum(),
|
|
});
|
|
let tools = msg.tool_calls.as_ref().map_or(0, |calls| {
|
|
calls
|
|
.iter()
|
|
.map(|c| count(&c.function.arguments) + count(&c.function.name))
|
|
.sum()
|
|
});
|
|
content + tools
|
|
}
|
|
|
|
/// Count the token footprint of a message using BPE tokenization.
/// Thin wrapper over `msg_token_count_fn` that plugs in the
/// tokenizer's `encode_with_special_tokens` as the counting function.
fn msg_token_count(tokenizer: &CoreBPE, msg: &Message) -> usize {
    msg_token_count_fn(msg, &|s| tokenizer.encode_with_special_tokens(s).len())
}
|
|
|
|
/// Detect context window overflow errors from the API.
|
|
/// Different providers phrase this differently; we check for common patterns.
|
|
/// OpenRouter wraps upstream errors, so we check both the wrapper and the raw message.
|
|
fn is_context_overflow(err: &anyhow::Error) -> bool {
|
|
let msg = err.to_string().to_lowercase();
|
|
msg.contains("context length")
|
|
|| msg.contains("token limit")
|
|
|| msg.contains("too many tokens")
|
|
|| msg.contains("maximum context")
|
|
|| msg.contains("prompt is too long")
|
|
|| msg.contains("request too large")
|
|
|| msg.contains("input validation error")
|
|
|| msg.contains("content length limit")
|
|
|| (msg.contains("400") && msg.contains("tokens"))
|
|
}
|
|
|
|
/// Detect model/provider errors delivered inside the SSE stream.
/// OpenRouter returns HTTP 200 but finish_reason="error" with
/// partial content (e.g. "system") — we surface this as an error
/// so the turn loop can retry.
///
/// Matches the literal "model stream error" marker in the error's
/// Display output — NOTE(review): keep in sync with wherever that
/// error string is constructed.
fn is_stream_error(err: &anyhow::Error) -> bool {
    err.to_string().contains("model stream error")
}
|
|
|
|
/// Parse a message's timestamp field into a DateTime.
|
|
fn parse_msg_timestamp(msg: &Message) -> Option<DateTime<Utc>> {
|
|
msg.timestamp
|
|
.as_ref()
|
|
.and_then(|ts| DateTime::parse_from_rfc3339(ts).ok())
|
|
.map(|dt| dt.with_timezone(&Utc))
|
|
}
|
|
|
|
/// Create a short summary of tool args for the tools pane header.
|
|
fn summarize_args(tool_name: &str, args: &serde_json::Value) -> String {
|
|
match tool_name {
|
|
"read_file" | "write_file" | "edit_file" => args["file_path"]
|
|
.as_str()
|
|
.unwrap_or("")
|
|
.to_string(),
|
|
"bash" => {
|
|
let cmd = args["command"].as_str().unwrap_or("");
|
|
if cmd.len() > 60 {
|
|
let end = cmd.char_indices()
|
|
.map(|(i, _)| i)
|
|
.take_while(|&i| i <= 60)
|
|
.last()
|
|
.unwrap_or(0);
|
|
format!("{}...", &cmd[..end])
|
|
} else {
|
|
cmd.to_string()
|
|
}
|
|
}
|
|
"grep" => {
|
|
let pattern = args["pattern"].as_str().unwrap_or("");
|
|
let path = args["path"].as_str().unwrap_or(".");
|
|
format!("{} in {}", pattern, path)
|
|
}
|
|
"glob" => args["pattern"]
|
|
.as_str()
|
|
.unwrap_or("")
|
|
.to_string(),
|
|
"view_image" => {
|
|
if let Some(pane) = args["pane_id"].as_str() {
|
|
format!("pane {}", pane)
|
|
} else {
|
|
args["file_path"].as_str().unwrap_or("").to_string()
|
|
}
|
|
}
|
|
"journal" => {
|
|
let entry = args["entry"].as_str().unwrap_or("");
|
|
if entry.len() > 60 {
|
|
format!("{}...", &entry[..60])
|
|
} else {
|
|
entry.to_string()
|
|
}
|
|
}
|
|
"yield_to_user" => args["message"]
|
|
.as_str()
|
|
.unwrap_or("")
|
|
.to_string(),
|
|
"switch_model" => args["model"]
|
|
.as_str()
|
|
.unwrap_or("")
|
|
.to_string(),
|
|
"pause" => String::new(),
|
|
_ => String::new(),
|
|
}
|
|
}
|
|
|
|
/// Parse tool calls leaked as text by models that don't always use the
|
|
/// structured function calling API (notably Qwen).
|
|
///
|
|
/// Handles the XML format:
|
|
/// <tool_call>
|
|
/// <function=bash>
|
|
/// <parameter=command>echo hello</parameter>
|
|
/// </function>
|
|
/// </tool_call>
|
|
///
|
|
/// Also handles JSON-in-text format:
|
|
/// <tool_call>
|
|
/// {"name": "bash", "arguments": {"command": "echo hello"}}
|
|
/// </tool_call>
|
|
fn parse_leaked_tool_calls(text: &str) -> Vec<ToolCall> {
|
|
// Normalize whitespace inside XML tags: "<\nfunction\n=\nbash\n>" → "<function=bash>"
|
|
// This handles streaming tokenizers that split tags across tokens.
|
|
let normalized = normalize_xml_tags(text);
|
|
let text = &normalized;
|
|
|
|
let mut calls = Vec::new();
|
|
let mut search_from = 0;
|
|
let mut call_counter: u32 = 0;
|
|
|
|
while let Some(start) = text[search_from..].find("<tool_call>") {
|
|
let abs_start = search_from + start;
|
|
let after_tag = abs_start + "<tool_call>".len();
|
|
|
|
let end = match text[after_tag..].find("</tool_call>") {
|
|
Some(pos) => after_tag + pos,
|
|
None => break,
|
|
};
|
|
|
|
let body = text[after_tag..end].trim();
|
|
search_from = end + "</tool_call>".len();
|
|
|
|
// Try XML format first, then JSON
|
|
if let Some(call) = parse_xml_tool_call(body, &mut call_counter) {
|
|
calls.push(call);
|
|
} else if let Some(call) = parse_json_tool_call(body, &mut call_counter) {
|
|
calls.push(call);
|
|
}
|
|
}
|
|
|
|
calls
|
|
}
|
|
|
|
/// Normalize whitespace inside XML-like tags for streaming tokenizers.
/// Collapses whitespace between `<` and `>` so that `<\nfunction\n=\nbash\n>`
/// becomes `<function=bash>`, and `</\nparameter\n>` becomes `</parameter>`.
/// Leaves content between tags untouched. A `<` with no matching `>`
/// keeps its (whitespace-stripped) tail.
fn normalize_xml_tags(text: &str) -> String {
    let mut out = String::with_capacity(text.len());
    let mut stream = text.chars();

    while let Some(ch) = stream.next() {
        if ch != '<' {
            out.push(ch);
            continue;
        }
        // Inside a tag: copy non-whitespace straight through until the
        // closing '>' (or end of input).
        out.push('<');
        for inner in stream.by_ref() {
            if inner == '>' {
                out.push('>');
                break;
            }
            if !inner.is_whitespace() {
                out.push(inner);
            }
        }
    }

    out
}
|
|
|
|
/// Parse a Qwen-style `<tag=value>body</tag>` pseudo-XML element.
/// Returns `(value, body, rest)` on success, with value and body
/// trimmed and `rest` being everything after the closing tag.
fn parse_qwen_tag<'a>(s: &'a str, tag: &str) -> Option<(&'a str, &'a str, &'a str)> {
    let open_marker = format!("<{}=", tag);
    let close_marker = format!("</{}>", tag);

    let value_start = s.find(&open_marker)? + open_marker.len();
    let value_end = value_start + s[value_start..].find('>')?;
    let body_start = value_end + 1;
    let body_end = body_start + s[body_start..].find(&close_marker)?;

    let value = s[value_start..value_end].trim();
    let body = s[body_start..body_end].trim();
    let rest = &s[body_end + close_marker.len()..];
    Some((value, body, rest))
}
|
|
|
|
/// Parse Qwen's XML tool call format.
|
|
fn parse_xml_tool_call(body: &str, counter: &mut u32) -> Option<ToolCall> {
|
|
let (func_name, func_body, _) = parse_qwen_tag(body, "function")?;
|
|
let func_name = func_name.to_string();
|
|
|
|
let mut args = serde_json::Map::new();
|
|
let mut rest = func_body;
|
|
while let Some((key, val, remainder)) = parse_qwen_tag(rest, "parameter") {
|
|
args.insert(key.to_string(), serde_json::Value::String(val.to_string()));
|
|
rest = remainder;
|
|
}
|
|
|
|
*counter += 1;
|
|
Some(ToolCall {
|
|
id: format!("leaked_{}", counter),
|
|
call_type: "function".to_string(),
|
|
function: FunctionCall {
|
|
name: func_name,
|
|
arguments: serde_json::to_string(&args).unwrap_or_default(),
|
|
},
|
|
})
|
|
}
|
|
|
|
/// Parse JSON tool call format (some models emit this).
|
|
fn parse_json_tool_call(body: &str, counter: &mut u32) -> Option<ToolCall> {
|
|
let v: serde_json::Value = serde_json::from_str(body).ok()?;
|
|
let name = v["name"].as_str()?;
|
|
let arguments = &v["arguments"];
|
|
|
|
*counter += 1;
|
|
Some(ToolCall {
|
|
id: format!("leaked_{}", counter),
|
|
call_type: "function".to_string(),
|
|
function: FunctionCall {
|
|
name: name.to_string(),
|
|
arguments: serde_json::to_string(arguments).unwrap_or_default(),
|
|
},
|
|
})
|
|
}
|
|
|
|
/// Strip tool call XML and thinking tokens from text so the conversation
|
|
/// history stays clean. Removes `<tool_call>...</tool_call>` blocks and
|
|
/// `</think>` tags (thinking content before them is kept — it's useful context).
|
|
fn strip_leaked_artifacts(text: &str) -> String {
|
|
let normalized = normalize_xml_tags(text);
|
|
let mut result = normalized.clone();
|
|
|
|
// Remove <tool_call>...</tool_call> blocks
|
|
while let Some(start) = result.find("<tool_call>") {
|
|
if let Some(end_pos) = result[start..].find("</tool_call>") {
|
|
let end = start + end_pos + "</tool_call>".len();
|
|
result = format!("{}{}", &result[..start], &result[end..]);
|
|
} else {
|
|
break;
|
|
}
|
|
}
|
|
|
|
// Remove </think> tags (but keep the thinking text before them)
|
|
result = result.replace("</think>", "");
|
|
|
|
result.trim().to_string()
|
|
}
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    // A clean (already well-formed) leaked call with a leading think
    // block: the bash call is extracted and the parameter text survives
    // verbatim.
    #[test]
    fn test_leaked_tool_call_clean() {
        let text = "thinking\n</think>\n<tool_call>\n<function=bash>\n<parameter=command>poc-memory used core-personality</parameter>\n</function>\n</tool_call>";
        let calls = parse_leaked_tool_calls(text);
        assert_eq!(calls.len(), 1);
        assert_eq!(calls[0].function.name, "bash");
        let args: serde_json::Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
        assert_eq!(args["command"], "poc-memory used core-personality");
    }

    #[test]
    fn test_leaked_tool_call_streamed_whitespace() {
        // Streaming tokenizer splits XML tags across tokens with newlines
        let text = "<tool_call>\n<\nfunction\n=\nbash\n>\n<\nparameter\n=\ncommand\n>pwd</\nparameter\n>\n</\nfunction\n>\n</tool_call>";
        let calls = parse_leaked_tool_calls(text);
        assert_eq!(calls.len(), 1, "should parse streamed format");
        assert_eq!(calls[0].function.name, "bash");
        let args: serde_json::Value = serde_json::from_str(&calls[0].function.arguments).unwrap();
        assert_eq!(args["command"], "pwd");
    }

    // normalize_xml_tags must only touch whitespace INSIDE tags; text
    // between tags is passed through untouched.
    #[test]
    fn test_normalize_preserves_content() {
        let text = "<function=bash>\n<parameter=command>echo hello world</parameter>\n</function>";
        let normalized = normalize_xml_tags(text);
        // Newlines between tags are not inside tags, so preserved
        assert_eq!(normalized, "<function=bash>\n<parameter=command>echo hello world</parameter>\n</function>");
    }

    #[test]
    fn test_normalize_strips_tag_internal_whitespace() {
        let text = "<\nfunction\n=\nbash\n>";
        let normalized = normalize_xml_tags(text);
        assert_eq!(normalized, "<function=bash>");
    }
}
|