user: F7 compare screen

Side-by-side model comparison against the current conversation context.
Built on the MindTriggered pattern — F7 drops in as one more
CompareScoring flow next to MemoryScoring / FinetuneScoring.

Motivation: we have the VRAM on the B200 to load two versions of the
same family simultaneously (e.g. Qwen3.5 27B bf16 and q8_k_xl). Rather
than trust perplexity/KLD numbers on a generic corpus, we can measure
divergence on our actual conversations: for each assistant response,
ask the test model what it would have said given the same prefix, and
eyeball the diffs.

 - config.compare.test_backend — names an entry in the existing
   backends map to use as the test model. Empty = F7 reports "(unset)"
   and does nothing.

 - subconscious::compare::{score_compare_candidates, CompareCandidate,
   CompareScoringStats, CompareScoring}. For each assistant response,
   gen_continuation runs with the test client against the same prefix
   the original response saw; pairs stream into
   shared.compare_candidates as they complete.

 - user::compare::CompareScreen — F7 in the screen list. c/Enter
   triggers a run; list/detail layout mirroring F6, detail shows
   prior context / original / test-model alternate.

No persistence yet — each F7 run regenerates. Caching via a context
manifest (so we can re-view without re-burning generation) is the
natural follow-up; for now light usage is fine.

Also reusable later for validating finetune checkpoints: same pattern,
swap the test backend for the new checkpoint, watch where it diverges
from the base.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-04-17 16:01:11 -04:00
parent 575325e855
commit 2b03dbb200
7 changed files with 301 additions and 11 deletions

109
src/subconscious/compare.rs Normal file
View file

@ -0,0 +1,109 @@
// compare.rs — F7 compare: for each assistant response in the current
// context, regenerate with a configured test model and emit pairs for
// side-by-side review.
use std::sync::Arc;
use crate::agent::api::ApiClient;
use crate::agent::context::{
AstNode, Role, render_branch_text, render_prior_context,
};
use crate::mind::{MindState, MindTriggered, TaskHandle};
use crate::subconscious::generate::gen_continuation;
use crate::subconscious::learn::node_timestamp_ns;
/// One original/alternate response pair produced by an F7 compare run.
#[derive(Clone, Debug)]
pub struct CompareCandidate {
    /// Index of the assistant response within the conversation entry list.
    pub entry_idx: usize,
    /// The assistant response text as originally generated.
    pub original_text: String,
    /// Text regenerated by the configured test model from the same prefix.
    pub alternate_text: String,
    /// Rendered context preceding the response, for the detail view.
    pub prior_context: String,
    /// Timestamp of the original response node, in nanoseconds.
    pub timestamp_ns: i64,
}
/// F7 compare flow: regenerates each assistant response in the current
/// context with a test model so original and alternate outputs can be
/// reviewed side by side.
pub struct CompareScoring {
    // Owning agent: provides the context, state notifications, and the
    // activity indicator used while a run is in progress.
    agent: Arc<crate::agent::Agent>,
    // Shared mind state; results stream into `compare_candidates`.
    shared: Arc<std::sync::Mutex<MindState>>,
    // Handle for the background task spawned on each trigger.
    task: TaskHandle,
}
impl CompareScoring {
    /// Builds a compare flow over the given agent and shared mind state,
    /// with a fresh (idle) task handle.
    pub fn new(
        agent: Arc<crate::agent::Agent>,
        shared: Arc<std::sync::Mutex<MindState>>,
    ) -> Self {
        let task = TaskHandle::new();
        Self { task, agent, shared }
    }
}
impl MindTriggered for CompareScoring {
    /// Kicks off a compare run in the background via the task handle.
    fn trigger(&self) {
        let agent = self.agent.clone();
        let shared = self.shared.clone();
        self.task.trigger(run(agent, shared));
    }
}
/// Resolves the test-model API client from `config.compare.test_backend`.
///
/// Returns an error string (surfaced on the F7 screen) when the backend
/// name is unset or cannot be resolved against the backends map.
fn resolve_test_client() -> Result<ApiClient, String> {
    let cfg = crate::config::app();
    // Borrow instead of clone: the name is only read within this function,
    // so the extra String allocation was redundant.
    let name = &cfg.compare.test_backend;
    if name.is_empty() {
        return Err("compare.test_backend not set in config".to_string());
    }
    let r = cfg.resolve_model(name).map_err(|e| format!("{:#}", e))?;
    Ok(ApiClient::new(&r.api_base, &r.api_key, &r.model_id))
}
/// One F7 compare pass: for every assistant response in the current context,
/// regenerate it with the configured test model and stream the resulting
/// original/alternate pairs into `shared.compare_candidates`.
async fn run(
    agent: Arc<crate::agent::Agent>,
    shared: Arc<std::sync::Mutex<MindState>>,
) {
    // Reset results from any previous run. Scoped so the std mutex guard is
    // dropped before the awaits below (never hold a std guard across .await).
    {
        let mut s = shared.lock().unwrap();
        s.compare_candidates.clear();
        s.compare_error = None;
    }
    agent.state.lock().await.changed.notify_one();
    let activity = crate::agent::start_activity(&agent, "compare: scoring...").await;
    // Resolve the test backend; on failure, publish the error for the F7
    // screen and bail out (the activity indicator ends when `activity` drops
    // — TODO confirm drop semantics of the activity handle).
    let test_client = match resolve_test_client() {
        Ok(c) => c,
        Err(e) => {
            shared.lock().unwrap().compare_error = Some(e);
            agent.state.lock().await.changed.notify_one();
            return;
        }
    };
    // Snapshot the conversation so concurrent edits can't shift entry
    // indices mid-run.
    let context = agent.context.lock().await.clone();
    let entries = context.conversation();
    // Collect assistant-response indices up front so progress can be shown
    // as "i/total".
    let responses: Vec<usize> = entries.iter().enumerate()
        .filter(|(_, n)| matches!(n, AstNode::Branch { role: Role::Assistant, .. }))
        .map(|(i, _)| i).collect();
    for (i, entry_idx) in responses.iter().copied().enumerate() {
        activity.update(format!("compare: {}/{}", i + 1, responses.len())).await;
        let node = &entries[entry_idx];
        // The filter above guarantees an assistant Branch; this match just
        // re-extracts the children to render the original response text.
        let original_text = match node {
            AstNode::Branch { children, .. } => render_branch_text(children),
            _ => continue,
        };
        // Nothing meaningful to compare for empty responses.
        if original_text.trim().is_empty() { continue; }
        // Ask the test model what it would have said given the same prefix.
        // A generation failure skips this entry rather than aborting the run.
        let alternate_text = match
            gen_continuation(&context, entry_idx, |_| false, &test_client).await
        {
            Ok(t) => t,
            Err(e) => { dbglog!("[compare] gen failed at {}: {:#}", entry_idx, e); continue; }
        };
        // Publish the pair immediately so the UI streams results as they
        // complete rather than waiting for the whole pass.
        shared.lock().unwrap().compare_candidates.push(CompareCandidate {
            entry_idx,
            original_text,
            alternate_text,
            prior_context: render_prior_context(entries, entry_idx, 2),
            timestamp_ns: node_timestamp_ns(node),
        });
        // Best-effort UI nudge: try_lock avoids stalling generation when the
        // state lock is contended; a missed notify is caught next iteration.
        if let Ok(st) = agent.state.try_lock() { st.changed.notify_one(); }
    }
}

View file

@ -1,5 +1,6 @@
// Agent layer: LLM-powered operations on the memory graph
pub mod compare;
pub mod daemon;
pub mod defs;
pub mod digest;