From 2b03dbb20006b15f19b96a2f911a8fd0de934b07 Mon Sep 17 00:00:00 2001
From: Kent Overstreet <kent.overstreet@linux.dev>
Date: Fri, 17 Apr 2026 16:01:11 -0400
Subject: [PATCH] user: F7 compare screen
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Side-by-side model comparison against the current conversation context.
Built on the MindTriggered pattern — F7 drops in as one more
CompareScoring flow next to MemoryScoring / FinetuneScoring.

Motivation: we have the VRAM on the b200 to load two versions of the
same family simultaneously (e.g. Qwen3.5 27B bf16 and q8_k_xl). Rather
than trust perplexity/KLD numbers on a generic corpus, we can measure
divergence on our actual conversations: for each assistant response,
ask the test model what it would have said given the same prefix, and
eyeball the diffs.

 - config.compare.test_backend — names an entry in the existing
   backends map to use as the test model. Empty = F7 reports "(unset)"
   and does nothing.

 - subconscious::compare::{CompareCandidate, CompareScoring}. For each
   assistant response in the current context, gen_continuation runs
   with the test client against the same prefix the original response
   saw; pairs stream into shared.compare_candidates as they complete.
   Generation failures for individual responses are logged and skipped.

 - user::compare::CompareScreen — F7 in the screen list. c/Enter
   triggers a run; list/detail layout mirroring F6, detail shows
   prior context / original / test-model alternate.

No persistence yet — each F7 run regenerates. Caching via a context
manifest (so we can re-view without re-burning generation) is the
natural follow-up; for now light usage is fine.

Also reusable later for validating finetune checkpoints: same pattern,
swap the test backend for the new checkpoint, watch where it diverges
from the base.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
---
 src/config.rs               |  13 ++++
 src/mind/mod.rs             |  20 ++++-
 src/subconscious/compare.rs | 109 +++++++++++++++++++++++++++
 src/subconscious/mod.rs     |   1 +
 src/user/compare.rs         | 142 ++++++++++++++++++++++++++++++++++++
 src/user/learn.rs           |  10 +--
 src/user/mod.rs             |  17 ++++-
 7 files changed, 301 insertions(+), 11 deletions(-)
 create mode 100644 src/subconscious/compare.rs
 create mode 100644 src/user/compare.rs
diff --git a/src/config.rs b/src/config.rs
index 6323aae..209bdc1 100644
--- a/src/config.rs
+++ b/src/config.rs
@@ -250,6 +250,8 @@ pub struct AppConfig {
     #[serde(default)]
     pub learn: LearnConfig,
     #[serde(default)]
+    pub compare: CompareConfig,
+    #[serde(default)]
     pub mcp_servers: Vec<McpServerConfig>,
     #[serde(default)]
     pub lsp_servers: Vec<LspServerConfig>,
@@ -323,6 +325,16 @@ impl Default for LearnConfig {
     }
 }
 
+/// Settings for the F7 compare screen — side-by-side generation with a
+/// test model against the current context.
+#[derive(Debug, Clone, Default, Serialize, Deserialize)]
+pub struct CompareConfig {
+    /// Backend name (looked up in `backends`) to use as the test model.
+    /// Empty = F7 shows the test model as unset and a run fails with an error.
+    #[serde(default)]
+    pub test_backend: String,
+}
+
 fn default_user_name() -> String { "User".into() }
 fn default_assistant_name() -> String { "Assistant".into() }
 
@@ -340,6 +352,7 @@ impl Default for AppConfig {
             },
             dmn: DmnConfig { max_turns: 20 },
             learn: LearnConfig::default(),
+            compare: CompareConfig::default(),
             mcp_servers: Vec::new(),
             lsp_servers: Vec::new(),
         }
diff --git a/src/mind/mod.rs b/src/mind/mod.rs
index 4ca97ea..f526b10 100644
--- a/src/mind/mod.rs
+++ b/src/mind/mod.rs
@@ -63,7 +63,7 @@ use tokio::sync::mpsc;
 use crate::agent::{Agent, TurnResult};
 use crate::agent::api::ApiClient;
 use crate::config::{AppConfig, SessionConfig};
-use crate::subconscious::learn;
+use crate::subconscious::{compare, learn};
 use crate::hippocampus::access_local;
 
 pub use subconscious::{SubconsciousSnapshot, Subconscious};
@@ -193,6 +193,11 @@ pub struct MindState {
     pub finetune_candidates: Vec<learn::FinetuneCandidate>,
     /// Last scoring run stats for UI display.
     pub finetune_last_run: Option<learn::FinetuneScoringStats>,
+    /// F7 compare candidates — one per response, showing what the test
+    /// model would say given the same context.
+    pub compare_candidates: Vec<compare::CompareCandidate>,
+    /// F7 compare error from the last run, if any.
+    pub compare_error: Option<String>,
 }
 
 impl Clone for MindState {
@@ -213,6 +218,8 @@ impl Clone for MindState {
             unc_idle_deadline: self.unc_idle_deadline,
             finetune_candidates: self.finetune_candidates.clone(),
             finetune_last_run: self.finetune_last_run.clone(),
+            compare_candidates: self.compare_candidates.clone(),
+            compare_error: self.compare_error.clone(),
         }
     }
 }
@@ -227,6 +234,9 @@ pub enum MindCommand {
     ScoreFull,
     /// Score for finetune candidates
     ScoreFinetune,
+    /// Run F7 compare: generate alternates with the configured test model
+    /// for every assistant response in the context.
+    Compare,
     /// Update the finetune divergence threshold and persist to config.
     SetLearnThreshold(f64),
     /// Toggle alternate-response generation during scoring; persist to config.
@@ -258,6 +268,8 @@ impl MindState {
             unc_idle_deadline: Instant::now() + std::time::Duration::from_secs(60),
             finetune_candidates: Vec::new(),
             finetune_last_run: None,
+            compare_candidates: Vec::new(),
+            compare_error: None,
         }
     }
 
@@ -359,6 +371,7 @@ pub struct Mind {
     conscious_active: tokio::sync::watch::Sender<bool>,
     memory_scoring: learn::MemoryScoring,
     finetune_scoring: learn::FinetuneScoring,
+    compare_scoring: compare::CompareScoring,
     _supervisor: crate::thalamus::supervisor::Supervisor,
 }
 
@@ -486,12 +499,14 @@ impl Mind {
         let memory_scoring = learn::MemoryScoring::new(
             agent.clone(), shared.clone(), scores_path);
         let finetune_scoring = learn::FinetuneScoring::new(agent.clone(), shared.clone());
+        let compare_scoring = compare::CompareScoring::new(agent.clone(), shared.clone());
 
         Self { agent, shared, config,
                subconscious, unconscious,
                turn_tx, turn_watch, conscious_active,
                memory_scoring,
                finetune_scoring,
+               compare_scoring,
                _supervisor: sup }
     }
 
@@ -593,6 +608,9 @@ impl Mind {
                 MindCommand::ScoreFinetune => {
                     self.finetune_scoring.trigger();
                 }
+                MindCommand::Compare => {
+                    self.compare_scoring.trigger();
+                }
                 MindCommand::SetLearnThreshold(value) => {
                     if let Err(e) = crate::config_writer::set_learn_threshold(value) {
                         dbglog!("[learn] failed to persist threshold {}: {:#}", value, e);
diff --git a/src/subconscious/compare.rs b/src/subconscious/compare.rs
new file mode 100644
index 0000000..f2652ce
--- /dev/null
+++ b/src/subconscious/compare.rs
@@ -0,0 +1,109 @@
+// compare.rs — F7 compare: for each assistant response in the current
+// context, regenerate with a configured test model and emit pairs for
+// side-by-side review.
+
+use std::sync::Arc;
+
+use crate::agent::api::ApiClient;
+use crate::agent::context::{
+    AstNode, Role, render_branch_text, render_prior_context,
+};
+use crate::mind::{MindState, MindTriggered, TaskHandle};
+use crate::subconscious::generate::gen_continuation;
+use crate::subconscious::learn::node_timestamp_ns;
+
+#[derive(Clone, Debug)]
+pub struct CompareCandidate {
+    pub entry_idx: usize,       // index of the response in the conversation entries
+    pub original_text: String,  // rendered text of the response as recorded
+    pub alternate_text: String, // test model's continuation for the same entry
+    pub prior_context: String,  // rendered lead-up shown above the pair in the detail pane
+    pub timestamp_ns: i64,      // from `node_timestamp_ns` on the response node
+}
+
+pub struct CompareScoring {
+    agent: Arc<crate::agent::Agent>,          // source of the context being compared
+    shared: Arc<std::sync::Mutex<MindState>>, // receives candidates and the last error
+    task: TaskHandle,                         // given the compare future by `trigger`
+}
+
+impl CompareScoring {
+    /// Wrap the shared agent and mind state together with a fresh
+    /// `TaskHandle` for the compare run.
+    pub fn new(agent: Arc<crate::agent::Agent>, shared: Arc<std::sync::Mutex<MindState>>) -> Self {
+        let task = TaskHandle::new();
+        Self { agent, shared, task }
+    }
+}
+
+impl MindTriggered for CompareScoring {
+    fn trigger(&self) { // entry point used for MindCommand::Compare
+        self.task.trigger(run(self.agent.clone(), self.shared.clone()));
+    }
+}
+
+/// Build an `ApiClient` for the backend named by `compare.test_backend`.
+/// `Err` holds a displayable message when the name is empty or unresolvable.
+fn resolve_test_client() -> Result<ApiClient, String> {
+    let cfg = crate::config::app();
+    let backend = cfg.compare.test_backend.clone();
+    if backend.is_empty() { return Err("compare.test_backend not set in config".to_string()); }
+    let resolved = cfg.resolve_model(&backend).map_err(|err| format!("{:#}", err))?;
+    Ok(ApiClient::new(&resolved.api_base, &resolved.api_key, &resolved.model_id))
+}
+
+/// Regenerate every assistant response in the agent's current context with the
+/// test model, pushing pairs into `shared`; failures land in `compare_error`.
+async fn run(agent: Arc<crate::agent::Agent>, shared: Arc<std::sync::Mutex<MindState>>) {
+    {
+        let mut s = shared.lock().unwrap(); // reset the previous run's results
+        s.compare_candidates.clear();
+        s.compare_error = None;
+    }
+    agent.state.lock().await.changed.notify_one();
+    let activity = crate::agent::start_activity(&agent, "compare: scoring...").await;
+    let test_client = match resolve_test_client() {
+        Ok(c) => c,
+        Err(e) => {
+            shared.lock().unwrap().compare_error = Some(e);
+            agent.state.lock().await.changed.notify_one();
+            return;
+        }
+    };
+    // Clone the context so its lock is not held while generations run.
+    let context = agent.context.lock().await.clone();
+    let entries = context.conversation();
+    let responses: Vec<usize> = entries.iter().enumerate()
+        .filter(|(_, n)| matches!(n, AstNode::Branch { role: Role::Assistant, .. }))
+        .map(|(i, _)| i).collect();
+
+    let mut last_err: Option<String> = None; // most recent generation failure
+    for (i, entry_idx) in responses.iter().copied().enumerate() {
+        activity.update(format!("compare: {}/{}", i + 1, responses.len())).await;
+        let node = &entries[entry_idx];
+        let original_text = match node {
+            AstNode::Branch { children, .. } => render_branch_text(children),
+            _ => continue,
+        };
+        if original_text.trim().is_empty() { continue; }
+        let generated = gen_continuation(&context, entry_idx, |_| false, &test_client).await;
+        let alternate_text = match generated {
+            Ok(t) => t,
+            Err(e) => {
+                dbglog!("[compare] gen failed at {}: {:#}", entry_idx, e);
+                last_err = Some(format!("entry {}: {:#}", entry_idx, e));
+                continue;
+            }
+        };
+        shared.lock().unwrap().compare_candidates.push(CompareCandidate {
+            entry_idx, original_text, alternate_text,
+            prior_context: render_prior_context(entries, entry_idx, 2),
+            timestamp_ns: node_timestamp_ns(node),
+        });
+        if let Ok(st) = agent.state.try_lock() { st.changed.notify_one(); }
+    }
+    if let Some(e) = last_err { // surface failures instead of only logging them
+        shared.lock().unwrap().compare_error = Some(format!("generation failed at {}", e));
+        agent.state.lock().await.changed.notify_one();
+    }
+}
diff --git a/src/subconscious/mod.rs b/src/subconscious/mod.rs
index d50f833..1abf25a 100644
--- a/src/subconscious/mod.rs
+++ b/src/subconscious/mod.rs
@@ -1,5 +1,6 @@
 // Agent layer: LLM-powered operations on the memory graph
 
+pub mod compare;
 pub mod daemon;
 pub mod defs;
 pub mod digest;
diff --git a/src/user/compare.rs b/src/user/compare.rs
new file mode 100644
index 0000000..74fb10d
--- /dev/null
+++ b/src/user/compare.rs
@@ -0,0 +1,142 @@
+// compare.rs — F7 compare screen: side-by-side test-model regen of
+// every assistant response in the current context.
+
+use ratatui::{
+    layout::{Constraint, Layout, Rect},
+    style::{Color, Modifier, Style},
+    text::{Line, Span},
+    widgets::{Block, Borders, List, ListItem, ListState, Paragraph, Wrap},
+    Frame,
+};
+use ratatui::crossterm::event::{Event, KeyCode, KeyEvent};
+
+use super::{App, ScreenView, screen_legend, truncate};
+
+pub use crate::subconscious::compare::CompareCandidate;
+
+pub(crate) struct CompareScreen {
+    list_state: ListState, // selection state for the candidate list
+    mind_tx: tokio::sync::mpsc::UnboundedSender<crate::mind::MindCommand>, // sends MindCommand::Compare
+}
+
+impl CompareScreen {
+    /// Create the screen with a default `ListState`; `mind_tx` carries its commands.
+    pub fn new(mind_tx: tokio::sync::mpsc::UnboundedSender<crate::mind::MindCommand>) -> Self {
+        let list_state = ListState::default();
+        Self { list_state, mind_tx }
+    }
+}
+
+impl ScreenView for CompareScreen {
+    fn label(&self) -> &'static str { "compare" }
+    /// Apply this frame's key events, then draw the compare screen into `area`.
+    fn tick(&mut self, frame: &mut Frame, area: Rect,
+            events: &[Event], app: &mut App) {
+        let n = app.compare_candidates.len();
+        for event in events {
+            if let Event::Key(KeyEvent { code, .. }) = event {
+                match code {
+                    KeyCode::Up | KeyCode::Char('k') => {
+                        let i = self.list_state.selected().unwrap_or(0);
+                        self.list_state.select(Some(i.saturating_sub(1)));
+                    }
+                    KeyCode::Down | KeyCode::Char('j') => {
+                        let i = self.list_state.selected().unwrap_or(0);
+                        self.list_state.select(Some((i + 1).min(n.saturating_sub(1))));
+                    }
+                    KeyCode::Char('c') | KeyCode::Enter => {
+                        let _ = self.mind_tx.send(crate::mind::MindCommand::Compare); // best-effort
+                    }
+                    _ => {}
+                }
+            }
+        }
+        if n > 0 { // clamp the selection to the current list
+            let sel = self.list_state.selected().unwrap_or(0).min(n - 1);
+            self.list_state.select(Some(sel));
+        }
+
+        let test_backend = crate::config::app().compare.test_backend.clone();
+        let block = Block::default()
+            .title_top(Line::from(screen_legend()).left_aligned())
+            .title_top(Line::from(" compare ").right_aligned())
+            .borders(Borders::ALL)
+            .border_style(Style::default().fg(Color::Magenta));
+        let inner = block.inner(area);
+        frame.render_widget(block, area);
+
+        let [settings_area, content_area] = Layout::vertical([
+            Constraint::Length(1), Constraint::Min(0),
+        ]).areas(inner);
+
+        let backend_label = if test_backend.is_empty() {
+            ("(unset — set compare.test_backend)", Color::Red)
+        } else {
+            (test_backend.as_str(), Color::Yellow)
+        };
+        frame.render_widget(Paragraph::new(Line::from(vec![
+            Span::raw(" test model: "),
+            Span::styled(backend_label.0.to_string(), Style::default().fg(backend_label.1)),
+        ])), settings_area);
+
+        let candidates = &app.compare_candidates;
+        if candidates.is_empty() {
+            let err = app.mind_state.as_ref().and_then(|ms| ms.compare_error.as_deref());
+            let mut lines = vec![Line::from(""),
+                Line::styled("  Press c/Enter to compare against the configured test model.",
+                    Style::default().fg(Color::DarkGray))];
+            if let Some(e) = err {
+                lines.push(Line::from(""));
+                lines.push(Line::from(vec![
+                    Span::raw("  "),
+                    Span::styled(format!("error: {}", e), Style::default().fg(Color::Red)),
+                ]));
+            }
+            frame.render_widget(Paragraph::new(lines), content_area);
+        } else {
+            let [list_area, detail_area] = Layout::horizontal([
+                Constraint::Percentage(40), Constraint::Percentage(60),
+            ]).areas(content_area);
+
+            let items: Vec<ListItem> = candidates.iter().map(|c| ListItem::new(Line::from(vec![
+                Span::styled(format!("#{:<3} ", c.entry_idx), Style::default().fg(Color::DarkGray)),
+                Span::raw(truncate(&c.original_text, 30)),
+            ]))).collect();
+            frame.render_stateful_widget(
+                List::new(items)
+                    .block(Block::default().borders(Borders::RIGHT).title(" candidates "))
+                    .highlight_style(Style::default().add_modifier(Modifier::REVERSED)),
+                list_area, &mut self.list_state,
+            );
+
+            if let Some(c) = self.list_state.selected().and_then(|i| candidates.get(i)) {
+                let mut text = String::new();
+                if !c.prior_context.is_empty() { // header only separates prior context
+                    text.push_str(&c.prior_context);
+                    text.push_str("\n\n─── original ───\n\n");
+                }
+                text.push_str(&c.original_text);
+                text.push_str("\n\n─── test model ───\n\n");
+                text.push_str(&c.alternate_text);
+                frame.render_widget(
+                    Paragraph::new(text)
+                        .block(Block::default().borders(Borders::TOP)
+                            .title(format!(" entry {} ", c.entry_idx)))
+                        .wrap(Wrap { trim: false }),
+                    detail_area,
+                );
+            }
+        }
+
+        let help = Line::from(vec![
+            Span::styled(" j/k/\u{2191}\u{2193}", Style::default().fg(Color::Cyan)),
+            Span::raw("=nav  "),
+            Span::styled("c/Enter", Style::default().fg(Color::Green)),
+            Span::raw("=run "),
+        ]);
+        if area.height > 0 { // no last row to draw on otherwise (avoids u16 underflow)
+            let last_row = Rect { y: area.y + area.height - 1, height: 1, ..area };
+            frame.render_widget(Paragraph::new(help), last_row);
+        }
+    }
+}
diff --git a/src/user/learn.rs b/src/user/learn.rs
index 0bd351f..78c16d0 100644
--- a/src/user/learn.rs
+++ b/src/user/learn.rs
@@ -12,7 +12,7 @@ use ratatui::{
 };
 use ratatui::crossterm::event::{Event, KeyCode, KeyEvent};
 
-use super::{App, ScreenView, screen_legend};
+use super::{App, ScreenView, screen_legend, truncate};
 
 /// A candidate response identified for fine-tuning.
 #[derive(Clone, Debug)]
@@ -331,11 +331,3 @@ fn render_detail(frame: &mut Frame, c: &FinetuneCandidate, area: Rect) {
     frame.render_widget(content, content_area);
 }
 
-fn truncate(s: &str, max: usize) -> String {
-    let first_line = s.lines().next().unwrap_or("");
-    if first_line.len() > max {
-        format!("{}...", &first_line[..max])
-    } else {
-        first_line.to_string()
-    }
-}
diff --git a/src/user/mod.rs b/src/user/mod.rs
index e077167..33008b7 100644
--- a/src/user/mod.rs
+++ b/src/user/mod.rs
@@ -4,6 +4,7 @@
 // machine, DMN, identity) lives in mind/.
 
 pub(crate) mod chat;
+pub(crate) mod compare;
 mod context;
 pub(crate) mod learn;
 pub(crate) mod scroll_pane;
@@ -64,6 +65,13 @@ fn screen_legend() -> String {
     SCREEN_LEGEND.get().cloned().unwrap_or_default()
 }
 
+/// First line of `s` for candidate lists, cut to `max` chars plus an ellipsis;
+/// cuts on a char boundary so multi-byte text cannot panic the slice.
+fn truncate(s: &str, max: usize) -> String {
+    let first = s.lines().next().unwrap_or("");
+    match first.char_indices().nth(max) { Some((i, _)) => format!("{}...", &first[..i]), None => first.to_string() }
+}
+
 /// A screen that can draw itself and handle input.
 trait ScreenView: Send {
     fn tick(&mut self, frame: &mut ratatui::Frame, area: ratatui::layout::Rect,
@@ -114,6 +122,8 @@ struct App {
     idle_info: Option<IdleInfo>,
     /// Fine-tuning candidates pending review.
     finetune_candidates: Vec<learn::FinetuneCandidate>,
+    /// F7 compare candidates — response pairs from test-model comparison.
+    compare_candidates: Vec<compare::CompareCandidate>,
 }
 
 impl App {
@@ -144,6 +154,7 @@ impl App {
             walked_count: 0,
             channel_status: Vec::new(), idle_info: None,
             finetune_candidates: Vec::new(),
+            compare_candidates: Vec::new(),
         }
     }
 
@@ -372,7 +383,7 @@ async fn run(
     }
     let notify_rx = crate::thalamus::channels::subscribe_all();
 
-    // F1=chat, F2=conscious, F3=subconscious, F4=unconscious, F5=thalamus, F6=learn
+    // F1=chat, F2=conscious, F3=subconscious, F4=unconscious, F5=thalamus, F6=learn, F7=compare
     let mut screens: Vec<Box<dyn tui::ScreenView>> = vec![
         Box::new(crate::user::chat::InteractScreen::new(
             mind.agent.clone(), mind.shared.clone(), mind_tx.clone(),
@@ -382,6 +393,7 @@ async fn run(
         Box::new(crate::user::unconscious::UnconsciousScreen::new()),
         Box::new(crate::user::thalamus::ThalamusScreen::new()),
         Box::new(crate::user::learn::LearnScreen::new(mind_tx.clone())),
+        Box::new(crate::user::compare::CompareScreen::new(mind_tx.clone())),
     ];
     let mut active_screen: usize = 1; // F-key number
     tui::set_screen_legend(tui::screen_legend_from(&*screens));
@@ -505,6 +517,9 @@ async fn run(
                 });
             }
 
+            // Sync compare candidates — a fresh run clears, so take a snapshot.
+            app.compare_candidates = ms.compare_candidates.clone();
+
             app.mind_state = Some(ms.clone());
         }
         app.walked_count = mind.subconscious_walked().await.len();