user: F7 compare screen
Side-by-side model comparison against the current conversation context.
Built on the MindTriggered pattern — F7 drops in as one more
CompareScoring flow next to MemoryScoring / FinetuneScoring.
Motivation: we have the VRAM on the b200 to load two versions of the
same family simultaneously (e.g. Qwen3.5 27B bf16 and q8_k_xl). Rather
than trust perplexity/KLD numbers on a generic corpus, we can measure
divergence on our actual conversations: for each assistant response,
ask the test model what it would have said given the same prefix, and
eyeball the diffs.
- config.compare.test_backend — names an entry in the existing
backends map to use as the test model. Empty = F7 reports "(unset)"
and does nothing.
- subconscious::compare::{score_compare_candidates, CompareCandidate,
CompareScoringStats, CompareScoring}. For each assistant response,
gen_continuation runs with the test client against the same prefix
the original response saw; pairs stream into
shared.compare_candidates as they complete.
- user::compare::CompareScreen — F7 in the screen list. c/Enter
triggers a run; list/detail layout mirroring F6, detail shows
prior context / original / test-model alternate.
No persistence yet — each F7 run regenerates. Caching via a context
manifest (so we can re-view without re-burning generation) is the
natural follow-up; for now light usage is fine.
Also reusable later for validating finetune checkpoints: same pattern,
swap the test backend for the new checkpoint, watch where it diverges
from the base.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
parent
575325e855
commit
2b03dbb200
7 changed files with 301 additions and 11 deletions
|
|
@ -250,6 +250,8 @@ pub struct AppConfig {
|
|||
#[serde(default)]
|
||||
pub learn: LearnConfig,
|
||||
#[serde(default)]
|
||||
pub compare: CompareConfig,
|
||||
#[serde(default)]
|
||||
pub mcp_servers: Vec<McpServerConfig>,
|
||||
#[serde(default)]
|
||||
pub lsp_servers: Vec<LspServerConfig>,
|
||||
|
|
@ -323,6 +325,16 @@ impl Default for LearnConfig {
|
|||
}
|
||||
}
|
||||
|
||||
/// Settings for the F7 compare screen — side-by-side generation with a
|
||||
/// test model against the current context.
|
||||
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
|
||||
pub struct CompareConfig {
|
||||
/// Backend name (looked up in `backends`) to use as the test model.
|
||||
/// When empty, the F7 screen shows the test model as unset and a compare
/// run records a "compare.test_backend not set in config" error instead
/// of generating anything.
|
||||
#[serde(default)]
|
||||
pub test_backend: String,
|
||||
}
|
||||
|
||||
/// Returns the literal `"User"` as an owned string.
/// NOTE(review): presumably wired in as a serde default for the user's
/// display name — confirm against the config struct that references it.
fn default_user_name() -> String {
    String::from("User")
}
|
||||
/// Returns the literal `"Assistant"` as an owned string.
/// NOTE(review): presumably wired in as a serde default for the assistant's
/// display name — confirm against the config struct that references it.
fn default_assistant_name() -> String {
    String::from("Assistant")
}
|
||||
|
||||
|
|
@ -340,6 +352,7 @@ impl Default for AppConfig {
|
|||
},
|
||||
dmn: DmnConfig { max_turns: 20 },
|
||||
learn: LearnConfig::default(),
|
||||
compare: CompareConfig::default(),
|
||||
mcp_servers: Vec::new(),
|
||||
lsp_servers: Vec::new(),
|
||||
}
|
||||
|
|
|
|||
|
|
@ -63,7 +63,7 @@ use tokio::sync::mpsc;
|
|||
use crate::agent::{Agent, TurnResult};
|
||||
use crate::agent::api::ApiClient;
|
||||
use crate::config::{AppConfig, SessionConfig};
|
||||
use crate::subconscious::learn;
|
||||
use crate::subconscious::{compare, learn};
|
||||
use crate::hippocampus::access_local;
|
||||
|
||||
pub use subconscious::{SubconsciousSnapshot, Subconscious};
|
||||
|
|
@ -193,6 +193,11 @@ pub struct MindState {
|
|||
pub finetune_candidates: Vec<learn::FinetuneCandidate>,
|
||||
/// Last scoring run stats for UI display.
|
||||
pub finetune_last_run: Option<learn::FinetuneScoringStats>,
|
||||
/// F7 compare candidates — one per response, showing what the test
|
||||
/// model would say given the same context.
|
||||
pub compare_candidates: Vec<compare::CompareCandidate>,
|
||||
/// F7 compare error from the last run, if any.
|
||||
pub compare_error: Option<String>,
|
||||
}
|
||||
|
||||
impl Clone for MindState {
|
||||
|
|
@ -213,6 +218,8 @@ impl Clone for MindState {
|
|||
unc_idle_deadline: self.unc_idle_deadline,
|
||||
finetune_candidates: self.finetune_candidates.clone(),
|
||||
finetune_last_run: self.finetune_last_run.clone(),
|
||||
compare_candidates: self.compare_candidates.clone(),
|
||||
compare_error: self.compare_error.clone(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -227,6 +234,9 @@ pub enum MindCommand {
|
|||
ScoreFull,
|
||||
/// Score for finetune candidates
|
||||
ScoreFinetune,
|
||||
/// Run F7 compare: generate alternates with the configured test model
|
||||
/// for every assistant response in the context.
|
||||
Compare,
|
||||
/// Update the finetune divergence threshold and persist to config.
|
||||
SetLearnThreshold(f64),
|
||||
/// Toggle alternate-response generation during scoring; persist to config.
|
||||
|
|
@ -258,6 +268,8 @@ impl MindState {
|
|||
unc_idle_deadline: Instant::now() + std::time::Duration::from_secs(60),
|
||||
finetune_candidates: Vec::new(),
|
||||
finetune_last_run: None,
|
||||
compare_candidates: Vec::new(),
|
||||
compare_error: None,
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -359,6 +371,7 @@ pub struct Mind {
|
|||
conscious_active: tokio::sync::watch::Sender<bool>,
|
||||
memory_scoring: learn::MemoryScoring,
|
||||
finetune_scoring: learn::FinetuneScoring,
|
||||
compare_scoring: compare::CompareScoring,
|
||||
_supervisor: crate::thalamus::supervisor::Supervisor,
|
||||
}
|
||||
|
||||
|
|
@ -486,12 +499,14 @@ impl Mind {
|
|||
let memory_scoring = learn::MemoryScoring::new(
|
||||
agent.clone(), shared.clone(), scores_path);
|
||||
let finetune_scoring = learn::FinetuneScoring::new(agent.clone(), shared.clone());
|
||||
let compare_scoring = compare::CompareScoring::new(agent.clone(), shared.clone());
|
||||
|
||||
Self { agent, shared, config,
|
||||
subconscious, unconscious,
|
||||
turn_tx, turn_watch, conscious_active,
|
||||
memory_scoring,
|
||||
finetune_scoring,
|
||||
compare_scoring,
|
||||
_supervisor: sup }
|
||||
}
|
||||
|
||||
|
|
@ -593,6 +608,9 @@ impl Mind {
|
|||
MindCommand::ScoreFinetune => {
|
||||
self.finetune_scoring.trigger();
|
||||
}
|
||||
MindCommand::Compare => {
|
||||
self.compare_scoring.trigger();
|
||||
}
|
||||
MindCommand::SetLearnThreshold(value) => {
|
||||
if let Err(e) = crate::config_writer::set_learn_threshold(value) {
|
||||
dbglog!("[learn] failed to persist threshold {}: {:#}", value, e);
|
||||
|
|
|
|||
109
src/subconscious/compare.rs
Normal file
109
src/subconscious/compare.rs
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
// compare.rs — F7 compare: for each assistant response in the current
|
||||
// context, regenerate with a configured test model and emit pairs for
|
||||
// side-by-side review.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::agent::api::ApiClient;
|
||||
use crate::agent::context::{
|
||||
AstNode, Role, render_branch_text, render_prior_context,
|
||||
};
|
||||
use crate::mind::{MindState, MindTriggered, TaskHandle};
|
||||
use crate::subconscious::generate::gen_continuation;
|
||||
use crate::subconscious::learn::node_timestamp_ns;
|
||||
|
||||
/// One original/alternate response pair produced by an F7 compare run.
#[derive(Clone, Debug)]
|
||||
pub struct CompareCandidate {
|
||||
/// Index of the assistant branch within the conversation entries the
/// compare run iterated over.
pub entry_idx: usize,
|
||||
/// Text of the original response, as rendered by `render_branch_text`.
pub original_text: String,
|
||||
/// Continuation returned by `gen_continuation` with the test client for
/// the same entry.
pub alternate_text: String,
|
||||
/// Output of `render_prior_context` for this entry (the run passes 2 as
/// its last argument — presumably the number of preceding entries to
/// include; confirm against that helper).
pub prior_context: String,
|
||||
/// Value of `node_timestamp_ns` for the original response node.
pub timestamp_ns: i64,
|
||||
}
|
||||
|
||||
pub struct CompareScoring {
|
||||
agent: Arc<crate::agent::Agent>,
|
||||
shared: Arc<std::sync::Mutex<MindState>>,
|
||||
task: TaskHandle,
|
||||
}
|
||||
|
||||
impl CompareScoring {
|
||||
pub fn new(
|
||||
agent: Arc<crate::agent::Agent>,
|
||||
shared: Arc<std::sync::Mutex<MindState>>,
|
||||
) -> Self {
|
||||
Self { agent, shared, task: TaskHandle::new() }
|
||||
}
|
||||
}
|
||||
|
||||
impl MindTriggered for CompareScoring {
|
||||
fn trigger(&self) {
|
||||
self.task.trigger(run(self.agent.clone(), self.shared.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
fn resolve_test_client() -> Result<ApiClient, String> {
|
||||
let cfg = crate::config::app();
|
||||
let name = cfg.compare.test_backend.clone();
|
||||
if name.is_empty() {
|
||||
return Err("compare.test_backend not set in config".to_string());
|
||||
}
|
||||
let r = cfg.resolve_model(&name).map_err(|e| format!("{:#}", e))?;
|
||||
Ok(ApiClient::new(&r.api_base, &r.api_key, &r.model_id))
|
||||
}
|
||||
|
||||
async fn run(
|
||||
agent: Arc<crate::agent::Agent>,
|
||||
shared: Arc<std::sync::Mutex<MindState>>,
|
||||
) {
|
||||
{
|
||||
let mut s = shared.lock().unwrap();
|
||||
s.compare_candidates.clear();
|
||||
s.compare_error = None;
|
||||
}
|
||||
agent.state.lock().await.changed.notify_one();
|
||||
|
||||
let activity = crate::agent::start_activity(&agent, "compare: scoring...").await;
|
||||
|
||||
let test_client = match resolve_test_client() {
|
||||
Ok(c) => c,
|
||||
Err(e) => {
|
||||
shared.lock().unwrap().compare_error = Some(e);
|
||||
agent.state.lock().await.changed.notify_one();
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
let context = agent.context.lock().await.clone();
|
||||
let entries = context.conversation();
|
||||
let responses: Vec<usize> = entries.iter().enumerate()
|
||||
.filter(|(_, n)| matches!(n, AstNode::Branch { role: Role::Assistant, .. }))
|
||||
.map(|(i, _)| i).collect();
|
||||
|
||||
for (i, entry_idx) in responses.iter().copied().enumerate() {
|
||||
activity.update(format!("compare: {}/{}", i + 1, responses.len())).await;
|
||||
|
||||
let node = &entries[entry_idx];
|
||||
let original_text = match node {
|
||||
AstNode::Branch { children, .. } => render_branch_text(children),
|
||||
_ => continue,
|
||||
};
|
||||
if original_text.trim().is_empty() { continue; }
|
||||
|
||||
let alternate_text = match
|
||||
gen_continuation(&context, entry_idx, |_| false, &test_client).await
|
||||
{
|
||||
Ok(t) => t,
|
||||
Err(e) => { dbglog!("[compare] gen failed at {}: {:#}", entry_idx, e); continue; }
|
||||
};
|
||||
|
||||
shared.lock().unwrap().compare_candidates.push(CompareCandidate {
|
||||
entry_idx,
|
||||
original_text,
|
||||
alternate_text,
|
||||
prior_context: render_prior_context(entries, entry_idx, 2),
|
||||
timestamp_ns: node_timestamp_ns(node),
|
||||
});
|
||||
if let Ok(st) = agent.state.try_lock() { st.changed.notify_one(); }
|
||||
}
|
||||
}
|
||||
|
|
@ -1,5 +1,6 @@
|
|||
// Agent layer: LLM-powered operations on the memory graph
|
||||
|
||||
pub mod compare;
|
||||
pub mod daemon;
|
||||
pub mod defs;
|
||||
pub mod digest;
|
||||
|
|
|
|||
142
src/user/compare.rs
Normal file
142
src/user/compare.rs
Normal file
|
|
@ -0,0 +1,142 @@
|
|||
// compare.rs — F7 compare screen: side-by-side test-model regen of
|
||||
// every assistant response in the current context.
|
||||
|
||||
use ratatui::{
|
||||
layout::{Constraint, Layout, Rect},
|
||||
style::{Color, Modifier, Style},
|
||||
text::{Line, Span},
|
||||
widgets::{Block, Borders, List, ListItem, ListState, Paragraph, Wrap},
|
||||
Frame,
|
||||
};
|
||||
use ratatui::crossterm::event::{Event, KeyCode, KeyEvent};
|
||||
|
||||
use super::{App, ScreenView, screen_legend, truncate};
|
||||
|
||||
pub use crate::subconscious::compare::CompareCandidate;
|
||||
|
||||
pub(crate) struct CompareScreen {
|
||||
list_state: ListState,
|
||||
mind_tx: tokio::sync::mpsc::UnboundedSender<crate::mind::MindCommand>,
|
||||
}
|
||||
|
||||
impl CompareScreen {
|
||||
pub fn new(
|
||||
mind_tx: tokio::sync::mpsc::UnboundedSender<crate::mind::MindCommand>,
|
||||
) -> Self {
|
||||
Self { list_state: ListState::default(), mind_tx }
|
||||
}
|
||||
}
|
||||
|
||||
impl ScreenView for CompareScreen {
|
||||
fn label(&self) -> &'static str { "compare" }
|
||||
|
||||
fn tick(&mut self, frame: &mut Frame, area: Rect,
|
||||
events: &[Event], app: &mut App) {
|
||||
let n = app.compare_candidates.len();
|
||||
for event in events {
|
||||
if let Event::Key(KeyEvent { code, .. }) = event {
|
||||
match code {
|
||||
KeyCode::Up | KeyCode::Char('k') => {
|
||||
let i = self.list_state.selected().unwrap_or(0);
|
||||
self.list_state.select(Some(i.saturating_sub(1)));
|
||||
}
|
||||
KeyCode::Down | KeyCode::Char('j') => {
|
||||
let i = self.list_state.selected().unwrap_or(0);
|
||||
self.list_state.select(Some((i + 1).min(n.saturating_sub(1))));
|
||||
}
|
||||
KeyCode::Char('c') | KeyCode::Enter => {
|
||||
let _ = self.mind_tx.send(crate::mind::MindCommand::Compare);
|
||||
}
|
||||
_ => {}
|
||||
}
|
||||
}
|
||||
}
|
||||
if n > 0 {
|
||||
let sel = self.list_state.selected().unwrap_or(0).min(n - 1);
|
||||
self.list_state.select(Some(sel));
|
||||
}
|
||||
|
||||
let test_backend = crate::config::app().compare.test_backend.clone();
|
||||
let block = Block::default()
|
||||
.title_top(Line::from(screen_legend()).left_aligned())
|
||||
.title_top(Line::from(" compare ").right_aligned())
|
||||
.borders(Borders::ALL)
|
||||
.border_style(Style::default().fg(Color::Magenta));
|
||||
let inner = block.inner(area);
|
||||
frame.render_widget(block, area);
|
||||
|
||||
let [settings_area, content_area] = Layout::vertical([
|
||||
Constraint::Length(1), Constraint::Min(0),
|
||||
]).areas(inner);
|
||||
|
||||
let backend_label = if test_backend.is_empty() {
|
||||
("(unset — set compare.test_backend)", Color::Red)
|
||||
} else {
|
||||
(test_backend.as_str(), Color::Yellow)
|
||||
};
|
||||
frame.render_widget(Paragraph::new(Line::from(vec![
|
||||
Span::raw(" test model: "),
|
||||
Span::styled(backend_label.0.to_string(), Style::default().fg(backend_label.1)),
|
||||
])), settings_area);
|
||||
|
||||
let candidates = &app.compare_candidates;
|
||||
if candidates.is_empty() {
|
||||
let err = app.mind_state.as_ref().and_then(|ms| ms.compare_error.as_deref());
|
||||
let mut lines = vec![Line::from(""),
|
||||
Line::styled(" Press c/Enter to compare against the configured test model.",
|
||||
Style::default().fg(Color::DarkGray))];
|
||||
if let Some(e) = err {
|
||||
lines.push(Line::from(""));
|
||||
lines.push(Line::from(vec![
|
||||
Span::raw(" "),
|
||||
Span::styled(format!("error: {}", e), Style::default().fg(Color::Red)),
|
||||
]));
|
||||
}
|
||||
frame.render_widget(Paragraph::new(lines), content_area);
|
||||
} else {
|
||||
let [list_area, detail_area] = Layout::horizontal([
|
||||
Constraint::Percentage(40), Constraint::Percentage(60),
|
||||
]).areas(content_area);
|
||||
|
||||
let items: Vec<ListItem> = candidates.iter().map(|c| ListItem::new(Line::from(vec![
|
||||
Span::styled(format!("#{:<3} ", c.entry_idx), Style::default().fg(Color::DarkGray)),
|
||||
Span::raw(truncate(&c.original_text, 30)),
|
||||
]))).collect();
|
||||
frame.render_stateful_widget(
|
||||
List::new(items)
|
||||
.block(Block::default().borders(Borders::RIGHT).title(" candidates "))
|
||||
.highlight_style(Style::default().add_modifier(Modifier::REVERSED)),
|
||||
list_area, &mut self.list_state,
|
||||
);
|
||||
|
||||
if let Some(c) = self.list_state.selected().and_then(|i| candidates.get(i)) {
|
||||
let mut text = String::new();
|
||||
if !c.prior_context.is_empty() {
|
||||
text.push_str(&c.prior_context);
|
||||
text.push_str("\n\n─── original ───\n\n");
|
||||
}
|
||||
text.push_str(&c.original_text);
|
||||
text.push_str("\n\n─── test model ───\n\n");
|
||||
text.push_str(&c.alternate_text);
|
||||
frame.render_widget(
|
||||
Paragraph::new(text)
|
||||
.block(Block::default().borders(Borders::TOP)
|
||||
.title(format!(" entry {} ", c.entry_idx)))
|
||||
.wrap(Wrap { trim: false }),
|
||||
detail_area,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
let help = Line::from(vec![
|
||||
Span::styled(" j/k/\u{2191}\u{2193}", Style::default().fg(Color::Cyan)),
|
||||
Span::raw("=nav "),
|
||||
Span::styled("c/Enter", Style::default().fg(Color::Green)),
|
||||
Span::raw("=run "),
|
||||
]);
|
||||
frame.render_widget(
|
||||
Paragraph::new(help),
|
||||
Rect { y: area.y + area.height - 1, height: 1, ..area },
|
||||
);
|
||||
}
|
||||
}
|
||||
|
|
@ -12,7 +12,7 @@ use ratatui::{
|
|||
};
|
||||
use ratatui::crossterm::event::{Event, KeyCode, KeyEvent};
|
||||
|
||||
use super::{App, ScreenView, screen_legend};
|
||||
use super::{App, ScreenView, screen_legend, truncate};
|
||||
|
||||
/// A candidate response identified for fine-tuning.
|
||||
#[derive(Clone, Debug)]
|
||||
|
|
@ -331,11 +331,3 @@ fn render_detail(frame: &mut Frame, c: &FinetuneCandidate, area: Rect) {
|
|||
frame.render_widget(content, content_area);
|
||||
}
|
||||
|
||||
/// Return the first line of `s`, cut to at most `max` characters with a
/// `...` suffix when anything was cut off.
///
/// The cut point is found with `char_indices`, so it always lands on a
/// character boundary; slicing at a raw byte offset would panic when `max`
/// falls inside a multi-byte UTF-8 character.
fn truncate(s: &str, max: usize) -> String {
    let first_line = s.lines().next().unwrap_or("");
    match first_line.char_indices().nth(max) {
        // A character exists at position `max`, so the line is too long;
        // `cut` is the byte offset where that character starts.
        Some((cut, _)) => format!("{}...", &first_line[..cut]),
        None => first_line.to_string(),
    }
}
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@
|
|||
// machine, DMN, identity) lives in mind/.
|
||||
|
||||
pub(crate) mod chat;
|
||||
pub(crate) mod compare;
|
||||
mod context;
|
||||
pub(crate) mod learn;
|
||||
pub(crate) mod scroll_pane;
|
||||
|
|
@ -64,6 +65,13 @@ fn screen_legend() -> String {
|
|||
SCREEN_LEGEND.get().cloned().unwrap_or_default()
|
||||
}
|
||||
|
||||
/// Return the first line of `s`, truncated to `max` chars with an
/// ellipsis suffix. Used by candidate-list screens.
///
/// The limit is counted in characters, not bytes: the cut point comes from
/// `char_indices`, so it always lands on a character boundary. Slicing at a
/// raw byte offset would panic when `max` falls inside a multi-byte UTF-8
/// character.
fn truncate(s: &str, max: usize) -> String {
    let first = s.lines().next().unwrap_or("");
    match first.char_indices().nth(max) {
        // A character exists at position `max`, so the line is too long;
        // `cut` is the byte offset where that character starts.
        Some((cut, _)) => format!("{}...", &first[..cut]),
        None => first.to_string(),
    }
}
|
||||
|
||||
/// A screen that can draw itself and handle input.
|
||||
trait ScreenView: Send {
|
||||
fn tick(&mut self, frame: &mut ratatui::Frame, area: ratatui::layout::Rect,
|
||||
|
|
@ -114,6 +122,8 @@ struct App {
|
|||
idle_info: Option<IdleInfo>,
|
||||
/// Fine-tuning candidates pending review.
|
||||
finetune_candidates: Vec<learn::FinetuneCandidate>,
|
||||
/// F7 compare candidates — response pairs from test-model comparison.
|
||||
compare_candidates: Vec<compare::CompareCandidate>,
|
||||
}
|
||||
|
||||
impl App {
|
||||
|
|
@ -144,6 +154,7 @@ impl App {
|
|||
walked_count: 0,
|
||||
channel_status: Vec::new(), idle_info: None,
|
||||
finetune_candidates: Vec::new(),
|
||||
compare_candidates: Vec::new(),
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -372,7 +383,7 @@ async fn run(
|
|||
}
|
||||
let notify_rx = crate::thalamus::channels::subscribe_all();
|
||||
|
||||
// F1=chat, F2=conscious, F3=subconscious, F4=unconscious, F5=thalamus, F6=learn
|
||||
// F1=chat, F2=conscious, F3=subconscious, F4=unconscious, F5=thalamus, F6=learn, F7=compare
|
||||
let mut screens: Vec<Box<dyn tui::ScreenView>> = vec![
|
||||
Box::new(crate::user::chat::InteractScreen::new(
|
||||
mind.agent.clone(), mind.shared.clone(), mind_tx.clone(),
|
||||
|
|
@ -382,6 +393,7 @@ async fn run(
|
|||
Box::new(crate::user::unconscious::UnconsciousScreen::new()),
|
||||
Box::new(crate::user::thalamus::ThalamusScreen::new()),
|
||||
Box::new(crate::user::learn::LearnScreen::new(mind_tx.clone())),
|
||||
Box::new(crate::user::compare::CompareScreen::new(mind_tx.clone())),
|
||||
];
|
||||
let mut active_screen: usize = 1; // F-key number
|
||||
tui::set_screen_legend(tui::screen_legend_from(&*screens));
|
||||
|
|
@ -505,6 +517,9 @@ async fn run(
|
|||
});
|
||||
}
|
||||
|
||||
// Sync compare candidates — a fresh run clears, so take a snapshot.
|
||||
app.compare_candidates = ms.compare_candidates.clone();
|
||||
|
||||
app.mind_state = Some(ms.clone());
|
||||
}
|
||||
app.walked_count = mind.subconscious_walked().await.len();
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue