refactor: extract context window building into context.rs

Move context window construction (build_context_window, plan_context, render_journal_text, assemble_context), token counting, error classification, and related helpers from agent.rs into context.rs. All extracted functions are pure — they take inputs and return values with no mutable state access. State mutation stays in agent.rs (compact, restore_from_log, load_startup_journal). agent.rs: 1504 → 987 lines (-517) context.rs: 365 lines (new) Net: -152 lines (duplicate comments removed) Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-21 15:55:30 -04:00 · 2026-03-21 15:55:30 -04:00 · db48d57917
commit db48d57917
parent d04d41e993
3 changed files with 206 additions and 488 deletions
--- a/poc-agent/src/agent.rs
+++ b/poc-agent/src/agent.rs
@ -14,7 +14,6 @@
 // in, response out, tool calls dispatched.

 use anyhow::Result;
-use chrono::{DateTime, Utc};
 use tiktoken_rs::CoreBPE;

 use std::io::Write;
@ -54,10 +53,6 @@ struct DispatchState {
    dmn_pause: bool,
 }

-/// Mutable context state — the structured regions of the context window.
-///
-/// Each field is a different dimension of awareness. The struct renders
-
 pub struct Agent {
    client: ApiClient,
    messages: Vec<Message>,
@ -196,7 +191,7 @@ impl Agent {
        let mut in_conversation = false;

        for msg in &self.messages {
-            let tokens = msg_token_count(&self.tokenizer, msg);
+            let tokens = crate::context::msg_token_count(&self.tokenizer, msg);

            if in_conversation {
                conv_tokens += tokens;
@ -231,7 +226,7 @@ impl Agent {
            memory_tokens: mem_tokens,
            journal_tokens: jnl_tokens,
            conversation_tokens: conv_tokens,
-            window_tokens: model_context_window(&self.client.model),
+            window_tokens: crate::context::model_context_window(&self.client.model),
        };
    }

@ -279,7 +274,7 @@ impl Agent {
            // Context overflow → compact and retry (max 2 attempts)
            // Stream error → retry with backoff (max 2 attempts)
            let (msg, usage) = match api_result {
-                Err(e) if is_context_overflow(&e) && overflow_retries < 2 => {
+                Err(e) if crate::context::is_context_overflow(&e) && overflow_retries < 2 => {
                    overflow_retries += 1;
                    let _ = ui_tx.send(UiMessage::Info(format!(
                        "[context overflow — compacting and retrying ({}/2)]",
@ -288,7 +283,7 @@ impl Agent {
                    self.emergency_compact();
                    continue;
                }
-                Err(e) if is_stream_error(&e) && empty_retries < 2 => {
+                Err(e) if crate::context::is_stream_error(&e) && empty_retries < 2 => {
                    empty_retries += 1;
                    let _ = ui_tx.send(UiMessage::Info(format!(
                        "[stream error: {} — retrying ({}/2)]",
@ -651,7 +646,7 @@ impl Agent {
        let count = |s: &str| self.tokenizer.encode_with_special_tokens(s).len();
        let context_message = self.context.render_context_message();

-        let plan = plan_context(
+        let plan = crate::context::plan_context(
            &self.context.system_prompt,
            &context_message,
            &[],  // no conversation yet
@ -660,7 +655,7 @@ impl Agent {
            &count,
        );

-        self.context.journal = render_journal_text(&entries, &plan);
+        self.context.journal = crate::context::render_journal_text(&entries, &plan);
    }

    /// Re-render the context message in self.messages from live ContextState.
@ -820,7 +815,7 @@ impl Agent {
            .unwrap_or(self.messages.len());

        let conversation: Vec<Message> = self.messages[conv_start..].to_vec();
-        let (messages, journal) = build_context_window(
+        let (messages, journal) = crate::context::build_context_window(
            &self.context,
            &conversation,
            &self.client.model,
@ -880,7 +875,7 @@ impl Agent {
            .collect();
        dbglog!("[restore] {} messages after filtering system", conversation.len());

-        let (messages, journal) = build_context_window(
+        let (messages, journal) = crate::context::build_context_window(
            &self.context,
            &conversation,
            &self.client.model,
@ -923,420 +918,9 @@ impl Agent {
    }
 }

-/// Look up a model's context window size in tokens.
-pub fn model_context_window(model: &str) -> usize {
-    let m = model.to_lowercase();
-    if m.contains("opus") || m.contains("sonnet") {
-        200_000
-    } else if m.contains("qwen") {
-        131_072
-    } else {
-        128_000
-    }
-}
+// Context window building, token counting, and error classification
+// live in context.rs

-/// Context budget in tokens: 60% of the model's context window.
-/// Leaves headroom for conversation to grow before compaction triggers.
-///
-/// Future direction: make this dynamic based on what the agent is
-/// doing — deep coding work might allocate more to conversation,
-/// consolidation might allocate more to journal/memory, idle might
-/// shrink everything to save cost.
-fn context_budget_tokens(model: &str) -> usize {
-    model_context_window(model) * 60 / 100
-}
-
-/// Allocation plan for the context window. Separates the budget math
-/// (which entries and messages to include) from the message assembly
-/// (building the actual Vec<Message>). This makes the core algorithm
-/// testable and inspectable — log the plan on compaction to see exactly
-/// what allocation decisions were made.
-struct ContextPlan {
-    /// Index into all_entries: header-only entries start here
-    header_start: usize,
-    /// Index into all_entries: full entries start here (headers end here)
-    full_start: usize,
-    /// Total journal entries (header-only + full go up to this)
-    entry_count: usize,
-    /// Index into recent conversation: skip messages before this
-    conv_trim: usize,
-    /// Total recent conversation messages
-    _conv_count: usize,
-    /// Tokens used by full journal entries
-    _full_tokens: usize,
-    /// Tokens used by header-only journal entries
-    _header_tokens: usize,
-    /// Tokens used by conversation (after trimming)
-    _conv_tokens: usize,
-    /// Total budget available (after identity, memory, reserve)
-    _available: usize,
-}
-
-/// Build a context window from conversation messages + journal entries.
-/// This is the core algorithm shared by compact() and restore_from_log().
-///
-/// Allocation strategy: identity and memory are fixed costs. The
-/// remaining budget (minus 25% reserve for model output) is split
-/// between journal and conversation. Conversation gets priority —
-/// it's what's happening now. Journal fills the rest, newest first.
-///
-/// When the budget is tight, journal entries are dropped first
-/// (oldest entries go first). If conversation alone exceeds the
-/// budget, oldest messages are trimmed to fit.
-/// Returns (messages, journal_text) — caller stores journal_text in ContextState.
-fn build_context_window(
-    context: &ContextState,
-    conversation: &[Message],
-    model: &str,
-    tokenizer: &CoreBPE,
-) -> (Vec<Message>, String) {
-    let journal_path = journal::default_journal_path();
-    let all_entries = journal::parse_journal(&journal_path);
-    dbglog!("[ctx] {} journal entries from {}", all_entries.len(), journal_path.display());
-    let count = |s: &str| tokenizer.encode_with_special_tokens(s).len();
-
-    let system_prompt = context.system_prompt.clone();
-    let context_message = context.render_context_message();
-
-    // Cap memory to 50% of the context budget so conversation always
-    // gets space. Truncate at the last complete section boundary.
-    let max_tokens = context_budget_tokens(model);
-    let memory_cap = max_tokens / 2;
-    let memory_tokens = count(&context_message);
-    let context_message = if memory_tokens > memory_cap {
-        dbglog!("[ctx] memory too large: {} tokens > {} cap, truncating", memory_tokens, memory_cap);
-        truncate_at_section(&context_message, memory_cap, &count)
-    } else {
-        context_message
-    };
-
-    let recent_start = find_journal_cutoff(conversation, all_entries.last());
-    dbglog!("[ctx] journal cutoff: {} of {} conversation messages are 'recent'",
-        conversation.len() - recent_start, conversation.len());
-    let recent = &conversation[recent_start..];
-
-    let plan = plan_context(
-        &system_prompt,
-        &context_message,
-        recent,
-        &all_entries,
-        model,
-        &count,
-    );
-
-    // Render journal text from the plan
-    let journal_text = render_journal_text(&all_entries, &plan);
-    dbglog!("[ctx] plan: header_start={} full_start={} entry_count={} conv_trim={} journal_text={} chars",
-        plan.header_start, plan.full_start, plan.entry_count, plan.conv_trim, journal_text.len());
-
-    let messages = assemble_context(
-        system_prompt, context_message, &journal_text,
-        recent, &plan,
-    );
-    (messages, journal_text)
-}
-
-/// Compute the allocation plan: how much budget goes to journal vs
-/// conversation, which entries and messages to include.
-fn plan_context(
-    system_prompt: &str,
-    context_message: &str,
-    recent: &[Message],
-    entries: &[journal::JournalEntry],
-    model: &str,
-    count: &dyn Fn(&str) -> usize,
-) -> ContextPlan {
-    let max_tokens = context_budget_tokens(model);
-
-    // Fixed costs — always included
-    let identity_cost = count(system_prompt);
-    let memory_cost = count(context_message);
-    let reserve = max_tokens / 4;
-    let available = max_tokens
-        .saturating_sub(identity_cost)
-        .saturating_sub(memory_cost)
-        .saturating_sub(reserve);
-
-    // Measure conversation
-    let conv_costs: Vec<usize> = recent.iter().map(|m| msg_token_count_fn(m, count)).collect();
-    let total_conv: usize = conv_costs.iter().sum();
-
-    // Journal always gets at least 15% of available budget so it doesn't
-    // get squeezed out by large conversations.
-    let journal_min = available * 15 / 100;
-    let journal_budget = available.saturating_sub(total_conv).max(journal_min);
-
-    // Fill journal entries newest-first within budget.
-    // Tiered: recent entries get full content, older entries get just
-    // a header line (timestamp + first line) for timeline awareness.
-    let full_budget = journal_budget * 70 / 100;
-    let header_budget = journal_budget.saturating_sub(full_budget);
-
-    // Phase 1: Full entries (newest first)
-    let mut full_used = 0;
-    let mut n_full = 0;
-    for entry in entries.iter().rev() {
-        let cost = count(&entry.content) + 10;
-        if full_used + cost > full_budget {
-            break;
-        }
-        full_used += cost;
-        n_full += 1;
-    }
-    let full_start = entries.len().saturating_sub(n_full);
-
-    // Phase 2: Header-only entries (continuing backward from where full stopped)
-    let mut header_used = 0;
-    let mut n_headers = 0;
-    for entry in entries[..full_start].iter().rev() {
-        let first_line = entry
-            .content
-            .lines()
-            .find(|l| !l.trim().is_empty())
-            .unwrap_or("(empty)");
-        let cost = count(first_line) + 10;
-        if header_used + cost > header_budget {
-            break;
-        }
-        header_used += cost;
-        n_headers += 1;
-    }
-    let header_start = full_start.saturating_sub(n_headers);
-
-    // If conversation exceeds available budget, trim oldest messages
-    let journal_used = full_used + header_used;
-    let mut conv_trim = 0;
-    let mut trimmed_conv = total_conv;
-    while trimmed_conv + journal_used > available && conv_trim < recent.len() {
-        trimmed_conv -= conv_costs[conv_trim];
-        conv_trim += 1;
-    }
-    // Walk forward to user message boundary
-    while conv_trim < recent.len() && recent[conv_trim].role != Role::User {
-        conv_trim += 1;
-    }
-
-    dbglog!("[plan] model={} max_tokens={} available={} (identity={} memory={} reserve={})",
-        model, max_tokens, available, identity_cost, memory_cost, reserve);
-    dbglog!("[plan] conv: {} msgs, {} tokens total, trimming {} msgs → {} tokens",
-        recent.len(), total_conv, conv_trim, trimmed_conv);
-    dbglog!("[plan] journal: {} full entries ({}t) + {} headers ({}t)",
-        n_full, full_used, n_headers, header_used);
-
-    ContextPlan {
-        header_start,
-        full_start,
-        entry_count: entries.len(),
-        conv_trim,
-        _conv_count: recent.len(),
-        _full_tokens: full_used,
-        _header_tokens: header_used,
-        _conv_tokens: trimmed_conv,
-        _available: available,
-    }
-}
-
-/// Render journal entries into text from a context plan.
-fn render_journal_text(
-    entries: &[journal::JournalEntry],
-    plan: &ContextPlan,
-) -> String {
-    let has_journal = plan.header_start < plan.entry_count;
-    if !has_journal {
-        return String::new();
-    }
-
-    let mut text = String::from("[Earlier in this conversation — from your journal]\n\n");
-
-    // Header-only entries (older) — just timestamp + first line
-    for entry in &entries[plan.header_start..plan.full_start] {
-        let first_line = entry
-            .content
-            .lines()
-            .find(|l| !l.trim().is_empty())
-            .unwrap_or("(empty)");
-        text.push_str(&format!(
-            "## {} — {}\n",
-            entry.timestamp.format("%Y-%m-%dT%H:%M"),
-            first_line,
-        ));
-    }
-
-    // Separator between headers and full entries
-    let n_headers = plan.full_start - plan.header_start;
-    let n_full = plan.entry_count - plan.full_start;
-    if n_headers > 0 && n_full > 0 {
-        text.push_str("\n---\n\n");
-    }
-
-    // Full entries (recent)
-    for entry in &entries[plan.full_start..] {
-        text.push_str(&format!(
-            "## {}\n\n{}\n\n",
-            entry.timestamp.format("%Y-%m-%dT%H:%M"),
-            entry.content
-        ));
-    }
-
-    text
-}
-
-/// Assemble the context window from a plan. No allocation decisions
-/// happen here — just follow the plan to build messages.
-fn assemble_context(
-    system_prompt: String,
-    context_message: String,
-    journal_text: &str,
-    recent: &[Message],
-    plan: &ContextPlan,
-) -> Vec<Message> {
-    let mut messages = vec![Message::system(system_prompt)];
-    if !context_message.is_empty() {
-        messages.push(Message::user(context_message));
-    }
-
-    let final_recent = &recent[plan.conv_trim..];
-
-    if !journal_text.is_empty() {
-        messages.push(Message::user(journal_text.to_string()));
-    } else if !final_recent.is_empty() {
-        messages.push(Message::user(
-            "Your context was just rebuilt. Memory files have been \
-             reloaded. Your recent conversation continues below. \
-             Earlier context is in your journal and memory files."
-                .to_string(),
-        ));
-    }
-
-    messages.extend(final_recent.iter().cloned());
-    messages
-}
-
-/// Find the conversation index where messages are no longer covered
-/// Truncate a context message to fit within a token budget. Cuts at
-/// section boundaries (lines starting with `---` or `## `) to avoid
-/// splitting mid-section. Drops sections from the end first since
-/// earlier sections (identity, instructions) matter more.
-fn truncate_at_section(text: &str, max_tokens: usize, count: &dyn Fn(&str) -> usize) -> String {
-    // Find section boundaries (--- separators between assembled parts)
-    let mut boundaries = vec![0usize];
-    for (i, line) in text.lines().enumerate() {
-        if line.trim() == "---" || line.starts_with("## ") {
-            // Find byte offset of this line
-            let offset = text.lines().take(i).map(|l| l.len() + 1).sum::<usize>();
-            boundaries.push(offset);
-        }
-    }
-    boundaries.push(text.len());
-
-    // Binary search: find the largest prefix of sections that fits
-    let mut best = 0;
-    for &end in &boundaries[1..] {
-        let slice = &text[..end];
-        if count(slice) <= max_tokens {
-            best = end;
-        } else {
-            break;
-        }
-    }
-
-    if best == 0 {
-        // Even the first section doesn't fit — hard truncate
-        best = text.len().min(max_tokens * 3); // ~3 chars/token rough estimate
-    }
-
-    let truncated = &text[..best];
-    dbglog!("[ctx] truncated memory from {} to {} chars ({} tokens)",
-        text.len(), truncated.len(), count(truncated));
-    truncated.to_string()
-}
-
-/// by journal entries. Messages before this index are summarized by
-/// the journal; messages from this index onward stay as raw conversation.
-/// Walks back to a user message boundary to avoid splitting tool
-/// call/result sequences.
-fn find_journal_cutoff(
-    conversation: &[Message],
-    newest_entry: Option<&journal::JournalEntry>,
-) -> usize {
-    let cutoff = match newest_entry {
-        Some(entry) => entry.timestamp,
-        None => return 0,
-    };
-
-    let mut split = conversation.len();
-    for (i, msg) in conversation.iter().enumerate() {
-        if let Some(ts) = parse_msg_timestamp(msg) {
-            if ts > cutoff {
-                split = i;
-                break;
-            }
-        }
-    }
-    // Walk back to user message boundary
-    while split > 0 && split < conversation.len() && conversation[split].role != Role::User {
-        split -= 1;
-    }
-    split
-}
-
-/// Count the token footprint of a message using a token counting function.
-fn msg_token_count_fn(msg: &Message, count: &dyn Fn(&str) -> usize) -> usize {
-    let content = msg.content.as_ref().map_or(0, |c| match c {
-        MessageContent::Text(s) => count(s),
-        MessageContent::Parts(parts) => parts
-            .iter()
-            .map(|p| match p {
-                ContentPart::Text { text } => count(text),
-                ContentPart::ImageUrl { .. } => 85,
-            })
-            .sum(),
-    });
-    let tools = msg.tool_calls.as_ref().map_or(0, |calls| {
-        calls
-            .iter()
-            .map(|c| count(&c.function.arguments) + count(&c.function.name))
-            .sum()
-    });
-    content + tools
-}
-
-/// Count the token footprint of a message using BPE tokenization.
-fn msg_token_count(tokenizer: &CoreBPE, msg: &Message) -> usize {
-    msg_token_count_fn(msg, &|s| tokenizer.encode_with_special_tokens(s).len())
-}
-
-/// Detect context window overflow errors from the API.
-/// Different providers phrase this differently; we check for common patterns.
-/// OpenRouter wraps upstream errors, so we check both the wrapper and the raw message.
-fn is_context_overflow(err: &anyhow::Error) -> bool {
-    let msg = err.to_string().to_lowercase();
-    msg.contains("context length")
-        || msg.contains("token limit")
-        || msg.contains("too many tokens")
-        || msg.contains("maximum context")
-        || msg.contains("prompt is too long")
-        || msg.contains("request too large")
-        || msg.contains("input validation error")
-        || msg.contains("content length limit")
-        || (msg.contains("400") && msg.contains("tokens"))
-}
-
-/// Detect model/provider errors delivered inside the SSE stream.
-/// OpenRouter returns HTTP 200 but finish_reason="error" with
-/// partial content (e.g. "system") — we surface this as an error
-/// so the turn loop can retry.
-fn is_stream_error(err: &anyhow::Error) -> bool {
-    err.to_string().contains("model stream error")
-}
-
-/// Parse a message's timestamp field into a DateTime.
-fn parse_msg_timestamp(msg: &Message) -> Option<DateTime<Utc>> {
-    msg.timestamp
-        .as_ref()
-        .and_then(|ts| DateTime::parse_from_rfc3339(ts).ok())
-        .map(|dt| dt.with_timezone(&Utc))
-}

 /// Create a short summary of tool args for the tools pane header.
 fn summarize_args(tool_name: &str, args: &serde_json::Value) -> String {
--- a/poc-agent/src/context.rs
+++ b/poc-agent/src/context.rs
@ -1,14 +1,52 @@
 // context.rs — Context window building and management
 //
 // Pure functions for building the agent's context window from journal
-// entries and conversation messages. No mutable state.
+// entries and conversation messages. No mutable state — all functions
+// take inputs and return new values. State mutation happens in agent.rs.

 use crate::journal;
-use crate::types::{ContextPlan, ContextState, Message};
+use crate::types::*;
 use chrono::{DateTime, Utc};
 use tiktoken_rs::CoreBPE;

+/// Look up a model's context window size in tokens.
+pub fn model_context_window(model: &str) -> usize {
+    let m = model.to_lowercase();
+    if m.contains("opus") || m.contains("sonnet") {
+        200_000
+    } else if m.contains("qwen") {
+        131_072
+    } else {
+        128_000
+    }
+}
+
+/// Context budget in tokens: 60% of the model's context window.
+fn context_budget_tokens(model: &str) -> usize {
+    model_context_window(model) * 60 / 100
+}
+
+/// Allocation plan for the context window.
+pub struct ContextPlan {
+    header_start: usize,
+    full_start: usize,
+    entry_count: usize,
+    conv_trim: usize,
+    _conv_count: usize,
+    _full_tokens: usize,
+    _header_tokens: usize,
+    _conv_tokens: usize,
+    _available: usize,
+}
+
 /// Build a context window from conversation messages + journal entries.
+///
+/// Allocation strategy: identity and memory are fixed costs. The
+/// remaining budget (minus 25% reserve for model output) is split
+/// between journal and conversation. Conversation gets priority —
+/// it's what's happening now. Journal fills the rest, newest first.
+///
+/// Returns (messages, journal_text) — caller stores journal_text in ContextState.
 pub fn build_context_window(
    context: &ContextState,
    conversation: &[Message],
@ -17,44 +55,50 @@ pub fn build_context_window(
 ) -> (Vec<Message>, String) {
    let journal_path = journal::default_journal_path();
    let all_entries = journal::parse_journal(&journal_path);
-    crate::dbglog!("[ctx] {} journal entries from {}", all_entries.len(), journal_path.display());
+    dbglog!("[ctx] {} journal entries from {}", all_entries.len(), journal_path.display());
    let count = |s: &str| tokenizer.encode_with_special_tokens(s).len();

    let system_prompt = context.system_prompt.clone();
    let context_message = context.render_context_message();

+    // Cap memory to 50% of the context budget so conversation always
+    // gets space. Truncate at the last complete section boundary.
    let max_tokens = context_budget_tokens(model);
    let memory_cap = max_tokens / 2;
    let memory_tokens = count(&context_message);
    let context_message = if memory_tokens > memory_cap {
-        crate::dbglog!("[ctx] memory too large: {} tokens > {} cap, truncating", memory_tokens, memory_cap);
+        dbglog!("[ctx] memory too large: {} tokens > {} cap, truncating", memory_tokens, memory_cap);
        truncate_at_section(&context_message, memory_cap, &count)
    } else {
        context_message
    };

    let recent_start = find_journal_cutoff(conversation, all_entries.last());
+    dbglog!("[ctx] journal cutoff: {} of {} conversation messages are 'recent'",
+        conversation.len() - recent_start, conversation.len());
    let recent = &conversation[recent_start..];

-    let plan = plan_context(&system_prompt, &context_message, recent, &all_entries, model, &count);
-    let journal_text = render_journal_text(&all_entries, &plan);
+    let plan = plan_context(
+        &system_prompt,
+        &context_message,
+        recent,
+        &all_entries,
+        model,
+        &count,
+    );

-    let messages = assemble_context(system_prompt, context_message, &journal_text, recent, &plan);
+    let journal_text = render_journal_text(&all_entries, &plan);
+    dbglog!("[ctx] plan: header_start={} full_start={} entry_count={} conv_trim={} journal_text={} chars",
+        plan.header_start, plan.full_start, plan.entry_count, plan.conv_trim, journal_text.len());
+
+    let messages = assemble_context(
+        system_prompt, context_message, &journal_text,
+        recent, &plan,
+    );
    (messages, journal_text)
 }

-pub fn model_context_window(model: &str) -> usize {
-    let m = model.to_lowercase();
-    if m.contains("opus") || m.contains("sonnet") { 200_000 }
-    else if m.contains("qwen") { 131_072 }
-    else { 128_000 }
-}
-
-fn context_budget_tokens(model: &str) -> usize {
-    model_context_window(model) * 60 / 100
-}
-
-fn plan_context(
+pub fn plan_context(
    system_prompt: &str,
    context_message: &str,
    recent: &[Message],
@ -63,40 +107,56 @@ fn plan_context(
    count: &dyn Fn(&str) -> usize,
 ) -> ContextPlan {
    let max_tokens = context_budget_tokens(model);
+
    let identity_cost = count(system_prompt);
    let memory_cost = count(context_message);
    let reserve = max_tokens / 4;
-    let available = max_tokens.saturating_sub(identity_cost).saturating_sub(memory_cost).saturating_sub(reserve);
+    let available = max_tokens
+        .saturating_sub(identity_cost)
+        .saturating_sub(memory_cost)
+        .saturating_sub(reserve);

    let conv_costs: Vec<usize> = recent.iter().map(|m| msg_token_count_fn(m, count)).collect();
    let total_conv: usize = conv_costs.iter().sum();

    let journal_min = available * 15 / 100;
    let journal_budget = available.saturating_sub(total_conv).max(journal_min);
+
    let full_budget = journal_budget * 70 / 100;
    let header_budget = journal_budget.saturating_sub(full_budget);

+    // Phase 1: Full entries (newest first)
    let mut full_used = 0;
    let mut n_full = 0;
    for entry in entries.iter().rev() {
        let cost = count(&entry.content) + 10;
-        if full_used + cost > full_budget { break; }
+        if full_used + cost > full_budget {
+            break;
+        }
        full_used += cost;
        n_full += 1;
    }
    let full_start = entries.len().saturating_sub(n_full);

+    // Phase 2: Header-only entries (continuing backward)
    let mut header_used = 0;
    let mut n_headers = 0;
    for entry in entries[..full_start].iter().rev() {
-        let first_line = entry.content.lines().find(|l| !l.trim().is_empty()).unwrap_or("(empty)");
+        let first_line = entry
+            .content
+            .lines()
+            .find(|l| !l.trim().is_empty())
+            .unwrap_or("(empty)");
        let cost = count(first_line) + 10;
-        if header_used + cost > header_budget { break; }
+        if header_used + cost > header_budget {
+            break;
+        }
        header_used += cost;
        n_headers += 1;
    }
    let header_start = full_start.saturating_sub(n_headers);

+    // Trim oldest conversation if it exceeds budget
    let journal_used = full_used + header_used;
    let mut conv_trim = 0;
    let mut trimmed_conv = total_conv;
@ -104,34 +164,69 @@ fn plan_context(
        trimmed_conv -= conv_costs[conv_trim];
        conv_trim += 1;
    }
-    while conv_trim < recent.len() && recent[conv_trim].role != crate::types::Role::User {
+    // Walk forward to user message boundary
+    while conv_trim < recent.len() && recent[conv_trim].role != Role::User {
        conv_trim += 1;
    }

+    dbglog!("[plan] model={} max_tokens={} available={} (identity={} memory={} reserve={})",
+        model, max_tokens, available, identity_cost, memory_cost, reserve);
+    dbglog!("[plan] conv: {} msgs, {} tokens total, trimming {} msgs → {} tokens",
+        recent.len(), total_conv, conv_trim, trimmed_conv);
+    dbglog!("[plan] journal: {} full entries ({}t) + {} headers ({}t)",
+        n_full, full_used, n_headers, header_used);
+
    ContextPlan {
-        header_start, full_start, entry_count: entries.len(), conv_trim,
-        _conv_count: recent.len(), _full_tokens: full_used, _header_tokens: header_used,
-        _conv_tokens: trimmed_conv, _available: available,
+        header_start,
+        full_start,
+        entry_count: entries.len(),
+        conv_trim,
+        _conv_count: recent.len(),
+        _full_tokens: full_used,
+        _header_tokens: header_used,
+        _conv_tokens: trimmed_conv,
+        _available: available,
    }
 }

-fn render_journal_text(entries: &[journal::JournalEntry], plan: &ContextPlan) -> String {
-    if plan.header_start >= plan.entry_count { return String::new(); }
+pub fn render_journal_text(
+    entries: &[journal::JournalEntry],
+    plan: &ContextPlan,
+) -> String {
+    let has_journal = plan.header_start < plan.entry_count;
+    if !has_journal {
+        return String::new();
+    }

    let mut text = String::from("[Earlier in this conversation — from your journal]\n\n");

    for entry in &entries[plan.header_start..plan.full_start] {
-        let first_line = entry.content.lines().find(|l| !l.trim().is_empty()).unwrap_or("(empty)");
-        text.push_str(&format!("## {} — {}\n", entry.timestamp.format("%Y-%m-%dT%H:%M"), first_line));
+        let first_line = entry
+            .content
+            .lines()
+            .find(|l| !l.trim().is_empty())
+            .unwrap_or("(empty)");
+        text.push_str(&format!(
+            "## {} — {}\n",
+            entry.timestamp.format("%Y-%m-%dT%H:%M"),
+            first_line,
+        ));
    }

    let n_headers = plan.full_start - plan.header_start;
    let n_full = plan.entry_count - plan.full_start;
-    if n_headers > 0 && n_full > 0 { text.push_str("\n---\n\n"); }
+    if n_headers > 0 && n_full > 0 {
+        text.push_str("\n---\n\n");
+    }

    for entry in &entries[plan.full_start..] {
-        text.push_str(&format!("## {}\n\n{}\n\n", entry.timestamp.format("%Y-%m-%dT%H:%M"), entry.content));
+        text.push_str(&format!(
+            "## {}\n\n{}\n\n",
+            entry.timestamp.format("%Y-%m-%dT%H:%M"),
+            entry.content
+        ));
    }
+
    text
 }

@ -143,7 +238,9 @@ fn assemble_context(
    plan: &ContextPlan,
 ) -> Vec<Message> {
    let mut messages = vec![Message::system(system_prompt)];
-    if !context_message.is_empty() { messages.push(Message::user(context_message)); }
+    if !context_message.is_empty() {
+        messages.push(Message::user(context_message));
+    }

    let final_recent = &recent[plan.conv_trim..];

@ -157,6 +254,7 @@ fn assemble_context(
                .to_string(),
        ));
    }
+
    messages.extend(final_recent.iter().cloned());
    messages
 }
@ -174,26 +272,42 @@ fn truncate_at_section(text: &str, max_tokens: usize, count: &dyn Fn(&str) -> us
    let mut best = 0;
    for &end in &boundaries[1..] {
        let slice = &text[..end];
-        if count(slice) <= max_tokens { best = end; }
-        else { break; }
+        if count(slice) <= max_tokens {
+            best = end;
+        } else {
+            break;
+        }
+    }
+
+    if best == 0 {
+        best = text.len().min(max_tokens * 3);
    }
-    if best == 0 { best = text.len().min(max_tokens * 3); }

    let truncated = &text[..best];
-    crate::dbglog!("[ctx] truncated memory from {} to {} chars ({} tokens)", text.len(), truncated.len(), count(truncated));
+    dbglog!("[ctx] truncated memory from {} to {} chars ({} tokens)",
+        text.len(), truncated.len(), count(truncated));
    truncated.to_string()
 }

-fn find_journal_cutoff(conversation: &[Message], newest_entry: Option<&journal::JournalEntry>) -> usize {
-    let cutoff = match newest_entry { Some(entry) => entry.timestamp, None => return 0 };
+fn find_journal_cutoff(
+    conversation: &[Message],
+    newest_entry: Option<&journal::JournalEntry>,
+) -> usize {
+    let cutoff = match newest_entry {
+        Some(entry) => entry.timestamp,
+        None => return 0,
+    };

    let mut split = conversation.len();
    for (i, msg) in conversation.iter().enumerate() {
        if let Some(ts) = parse_msg_timestamp(msg) {
-            if ts > cutoff { split = i; break; }
+            if ts > cutoff {
+                split = i;
+                break;
+            }
        }
    }
-    while split > 0 && split < conversation.len() && conversation[split].role != crate::types::Role::User {
+    while split > 0 && split < conversation.len() && conversation[split].role != Role::User {
        split -= 1;
    }
    split
@ -201,32 +315,51 @@ fn find_journal_cutoff(conversation: &[Message], newest_entry: Option<&journal::

 fn msg_token_count_fn(msg: &Message, count: &dyn Fn(&str) -> usize) -> usize {
    let content = msg.content.as_ref().map_or(0, |c| match c {
-        crate::types::MessageContent::Text(s) => count(s),
-        crate::types::MessageContent::Parts(parts) => parts.iter().map(|p| match p {
-            crate::types::ContentPart::Text { text } => count(text),
-            crate::types::ContentPart::ImageUrl { .. } => 85,
-        }).sum(),
+        MessageContent::Text(s) => count(s),
+        MessageContent::Parts(parts) => parts
+            .iter()
+            .map(|p| match p {
+                ContentPart::Text { text } => count(text),
+                ContentPart::ImageUrl { .. } => 85,
+            })
+            .sum(),
+    });
+    let tools = msg.tool_calls.as_ref().map_or(0, |calls| {
+        calls
+            .iter()
+            .map(|c| count(&c.function.arguments) + count(&c.function.name))
+            .sum()
    });
-    let tools = msg.tool_calls.as_ref().map_or(0, |calls| calls.iter().map(|c| count(&c.function.arguments) + count(&c.function.name)).sum());
    content + tools
 }

-fn parse_msg_timestamp(msg: &Message) -> Option<DateTime<Utc>> {
-    msg.timestamp.as_ref().and_then(|ts| DateTime::parse_from_rfc3339(ts).ok()).map(|dt| dt.with_timezone(&Utc))
+/// Count the token footprint of a message using BPE tokenization.
+pub fn msg_token_count(tokenizer: &CoreBPE, msg: &Message) -> usize {
+    msg_token_count_fn(msg, &|s| tokenizer.encode_with_special_tokens(s).len())
 }

+/// Detect context window overflow errors from the API.
 pub fn is_context_overflow(err: &anyhow::Error) -> bool {
    let msg = err.to_string().to_lowercase();
-    msg.contains("context length") || msg.contains("token limit") || msg.contains("too many tokens")
-        || msg.contains("maximum context") || msg.contains("prompt is too long") || msg.contains("request too large")
-        || msg.contains("input validation error") || msg.contains("content length limit")
+    msg.contains("context length")
+        || msg.contains("token limit")
+        || msg.contains("too many tokens")
+        || msg.contains("maximum context")
+        || msg.contains("prompt is too long")
+        || msg.contains("request too large")
+        || msg.contains("input validation error")
+        || msg.contains("content length limit")
        || (msg.contains("400") && msg.contains("tokens"))
 }

+/// Detect model/provider errors delivered inside the SSE stream.
 pub fn is_stream_error(err: &anyhow::Error) -> bool {
    err.to_string().contains("model stream error")
 }

-pub fn msg_token_count(tokenizer: &CoreBPE, msg: &Message) -> usize {
-    msg_token_count_fn(msg, &|s| tokenizer.encode_with_special_tokens(s).len())
+fn parse_msg_timestamp(msg: &Message) -> Option<DateTime<Utc>> {
+    msg.timestamp
+        .as_ref()
+        .and_then(|ts| DateTime::parse_from_rfc3339(ts).ok())
+        .map(|dt| dt.with_timezone(&Utc))
 }
--- a/poc-agent/src/main.rs
+++ b/poc-agent/src/main.rs
@ -37,6 +37,7 @@ mod agent;
 mod api;
 mod cli;
 mod config;
+mod context;
 mod dmn;
 mod journal;
 mod log;
@ -66,13 +67,13 @@ use crate::ui_channel::{ContextInfo, StatusInfo, StreamTarget, UiMessage};
 /// Hard compaction threshold — context is rebuilt immediately.
 /// Uses config percentage of model context window.
 fn compaction_threshold(model: &str, app: &AppConfig) -> u32 {
-    (agent::model_context_window(model) as u32) * app.compaction.hard_threshold_pct / 100
+    (context::model_context_window(model) as u32) * app.compaction.hard_threshold_pct / 100
 }

 /// Soft threshold — nudge the model to journal before compaction.
 /// Fires once; the hard threshold handles the actual rebuild.
 fn pre_compaction_threshold(model: &str, app: &AppConfig) -> u32 {
-    (agent::model_context_window(model) as u32) * app.compaction.soft_threshold_pct / 100
+    (context::model_context_window(model) as u32) * app.compaction.soft_threshold_pct / 100
 }

 #[tokio::main]