user: F7 compare screen

Side-by-side model comparison against the current conversation context.
Built on the MindTriggered pattern — F7 drops in as one more
CompareScoring flow next to MemoryScoring / FinetuneScoring.

Motivation: we have the VRAM on the B200 to load two versions of the
same family simultaneously (e.g. Qwen3.5 27B bf16 and q8_k_xl). Rather
than trust perplexity/KLD numbers on a generic corpus, we can measure
divergence on our actual conversations: for each assistant response,
ask the test model what it would have said given the same prefix, and
eyeball the diffs.

 - config.compare.test_backend — names an entry in the existing
   backends map to use as the test model. Empty = F7 reports "(unset)"
   and does nothing.

 - subconscious::compare::{score_compare_candidates, CompareCandidate,
   CompareScoringStats, CompareScoring}. For each assistant response,
   gen_continuation runs with the test client against the same prefix
   the original response saw; pairs stream into
   shared.compare_candidates as they complete.

 - user::compare::CompareScreen — F7 in the screen list. c/Enter
   triggers a run; list/detail layout mirroring F6, detail shows
   prior context / original / test-model alternate.

No persistence yet — each F7 run regenerates. Caching via a context
manifest (so we can re-view without re-burning generation) is the
natural follow-up; for now light usage is fine.

Also reusable later for validating finetune checkpoints: same pattern,
swap the test backend for the new checkpoint, watch where it diverges
from the base.

Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
Kent Overstreet 2026-04-17 16:01:11 -04:00
parent 575325e855
commit 2b03dbb200
7 changed files with 301 additions and 11 deletions

109
src/subconscious/compare.rs Normal file
View file

@ -0,0 +1,109 @@
// compare.rs — F7 compare: for each assistant response in the current
// context, regenerate with a configured test model and emit pairs for
// side-by-side review.
use std::sync::Arc;
use crate::agent::api::ApiClient;
use crate::agent::context::{
AstNode, Role, render_branch_text, render_prior_context,
};
use crate::mind::{MindState, MindTriggered, TaskHandle};
use crate::subconscious::generate::gen_continuation;
use crate::subconscious::learn::node_timestamp_ns;
/// One original/alternate response pair produced by an F7 compare run.
#[derive(Clone, Debug)]
pub struct CompareCandidate {
    /// Index of the assistant response within the conversation entry list.
    pub entry_idx: usize,
    /// The assistant response text as originally generated.
    pub original_text: String,
    /// Text regenerated by the configured test model from the same prefix.
    pub alternate_text: String,
    /// Rendered context preceding the response, for the detail view.
    pub prior_context: String,
    /// Timestamp of the original response node, in nanoseconds.
    pub timestamp_ns: i64,
}
/// F7 compare flow: regenerates each assistant response in the current
/// context with a test model so original and alternate outputs can be
/// reviewed side by side.
pub struct CompareScoring {
    // Owning agent: provides the context, state notifications, and the
    // activity indicator used while a run is in progress.
    agent: Arc<crate::agent::Agent>,
    // Shared mind state; results stream into `compare_candidates`.
    shared: Arc<std::sync::Mutex<MindState>>,
    // Handle for the background task spawned on each trigger.
    task: TaskHandle,
}
impl CompareScoring {
    /// Builds a compare flow over the given agent and shared mind state,
    /// with a fresh (idle) task handle.
    pub fn new(
        agent: Arc<crate::agent::Agent>,
        shared: Arc<std::sync::Mutex<MindState>>,
    ) -> Self {
        let task = TaskHandle::new();
        Self { task, agent, shared }
    }
}
impl MindTriggered for CompareScoring {
    /// Kicks off a compare run in the background via the task handle.
    fn trigger(&self) {
        let agent = self.agent.clone();
        let shared = self.shared.clone();
        self.task.trigger(run(agent, shared));
    }
}
/// Resolves the test-model API client from `config.compare.test_backend`.
///
/// Returns an error string (surfaced on the F7 screen) when the backend
/// name is unset or cannot be resolved against the backends map.
fn resolve_test_client() -> Result<ApiClient, String> {
    let cfg = crate::config::app();
    // Borrow instead of clone: the name is only read within this function,
    // so the extra String allocation was redundant.
    let name = &cfg.compare.test_backend;
    if name.is_empty() {
        return Err("compare.test_backend not set in config".to_string());
    }
    let r = cfg.resolve_model(name).map_err(|e| format!("{:#}", e))?;
    Ok(ApiClient::new(&r.api_base, &r.api_key, &r.model_id))
}
/// One F7 compare pass: for every assistant response in the current context,
/// regenerate it with the configured test model and stream the resulting
/// original/alternate pairs into `shared.compare_candidates`.
async fn run(
    agent: Arc<crate::agent::Agent>,
    shared: Arc<std::sync::Mutex<MindState>>,
) {
    // Reset results from any previous run. Scoped so the std mutex guard is
    // dropped before the awaits below (never hold a std guard across .await).
    {
        let mut s = shared.lock().unwrap();
        s.compare_candidates.clear();
        s.compare_error = None;
    }
    agent.state.lock().await.changed.notify_one();
    let activity = crate::agent::start_activity(&agent, "compare: scoring...").await;
    // Resolve the test backend; on failure, publish the error for the F7
    // screen and bail out (the activity indicator ends when `activity` drops
    // — TODO confirm drop semantics of the activity handle).
    let test_client = match resolve_test_client() {
        Ok(c) => c,
        Err(e) => {
            shared.lock().unwrap().compare_error = Some(e);
            agent.state.lock().await.changed.notify_one();
            return;
        }
    };
    // Snapshot the conversation so concurrent edits can't shift entry
    // indices mid-run.
    let context = agent.context.lock().await.clone();
    let entries = context.conversation();
    // Collect assistant-response indices up front so progress can be shown
    // as "i/total".
    let responses: Vec<usize> = entries.iter().enumerate()
        .filter(|(_, n)| matches!(n, AstNode::Branch { role: Role::Assistant, .. }))
        .map(|(i, _)| i).collect();
    for (i, entry_idx) in responses.iter().copied().enumerate() {
        activity.update(format!("compare: {}/{}", i + 1, responses.len())).await;
        let node = &entries[entry_idx];
        // The filter above guarantees an assistant Branch; this match just
        // re-extracts the children to render the original response text.
        let original_text = match node {
            AstNode::Branch { children, .. } => render_branch_text(children),
            _ => continue,
        };
        // Nothing meaningful to compare for empty responses.
        if original_text.trim().is_empty() { continue; }
        // Ask the test model what it would have said given the same prefix.
        // A generation failure skips this entry rather than aborting the run.
        let alternate_text = match
            gen_continuation(&context, entry_idx, |_| false, &test_client).await
        {
            Ok(t) => t,
            Err(e) => { dbglog!("[compare] gen failed at {}: {:#}", entry_idx, e); continue; }
        };
        // Publish the pair immediately so the UI streams results as they
        // complete rather than waiting for the whole pass.
        shared.lock().unwrap().compare_candidates.push(CompareCandidate {
            entry_idx,
            original_text,
            alternate_text,
            prior_context: render_prior_context(entries, entry_idx, 2),
            timestamp_ns: node_timestamp_ns(node),
        });
        // Best-effort UI nudge: try_lock avoids stalling generation when the
        // state lock is contended; a missed notify is caught next iteration.
        if let Ok(st) = agent.state.try_lock() { st.changed.notify_one(); }
    }
}

View file

@ -1,5 +1,6 @@
// Agent layer: LLM-powered operations on the memory graph
pub mod compare;
pub mod daemon;
pub mod defs;
pub mod digest;