forked from kent/consciousness
Side-by-side model comparison against the current conversation context.
Built on the MindTriggered pattern — F7 drops in as one more
CompareScoring flow next to MemoryScoring / FinetuneScoring.
Motivation: we have the VRAM on the b200 to load two versions of the
same family simultaneously (e.g. Qwen3.5 27B bf16 and q8_k_xl). Rather
than trust perplexity/KLD numbers on a generic corpus, we can measure
divergence on our actual conversations: for each assistant response,
ask the test model what it would have said given the same prefix, and
eyeball the diffs.
- config.compare.test_backend — names an entry in the existing
backends map to use as the test model. If it is empty, F7 reports
"(unset)" and does nothing.
- subconscious::compare::{score_compare_candidates, CompareCandidate,
CompareScoringStats, CompareScoring}. For each assistant response,
gen_continuation runs with the test client against the same prefix
the original response saw; pairs stream into
shared.compare_candidates as they complete.
- user::compare::CompareScreen — F7 in the screen list. c/Enter
triggers a run; list/detail layout mirroring F6, detail shows
prior context / original / test-model alternate.
No persistence yet — each F7 run regenerates. Caching via a context
manifest (so we can re-view without re-burning generation) is the
natural follow-up; for now light usage is fine.
Also reusable later for validating finetune checkpoints: same pattern,
swap the test backend for the new checkpoint, watch where it diverges
from the base.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
109 lines
3.4 KiB
Rust
109 lines
3.4 KiB
Rust
// compare.rs — F7 compare: for each assistant response in the current
|
|
// context, regenerate with a configured test model and emit pairs for
|
|
// side-by-side review.
|
|
|
|
use std::sync::Arc;
|
|
|
|
use crate::agent::api::ApiClient;
|
|
use crate::agent::context::{
|
|
AstNode, Role, render_branch_text, render_prior_context,
|
|
};
|
|
use crate::mind::{MindState, MindTriggered, TaskHandle};
|
|
use crate::subconscious::generate::gen_continuation;
|
|
use crate::subconscious::learn::node_timestamp_ns;
|
|
|
|
/// One original/alternate pair produced by a compare run.
///
/// A candidate is pushed onto `MindState::compare_candidates` for every
/// assistant response that was successfully regenerated by the test model.
#[derive(Debug, Clone)]
pub struct CompareCandidate {
    /// Index of the assistant response within the conversation entries.
    pub entry_idx: usize,
    /// Text of the response as it appears in the conversation.
    pub original_text: String,
    /// Text the test model generated for the same position.
    pub alternate_text: String,
    /// Rendered context preceding the response, for display.
    pub prior_context: String,
    /// Timestamp of the original response node, in nanoseconds.
    pub timestamp_ns: i64,
}
|
|
|
|
pub struct CompareScoring {
|
|
agent: Arc<crate::agent::Agent>,
|
|
shared: Arc<std::sync::Mutex<MindState>>,
|
|
task: TaskHandle,
|
|
}
|
|
|
|
impl CompareScoring {
|
|
pub fn new(
|
|
agent: Arc<crate::agent::Agent>,
|
|
shared: Arc<std::sync::Mutex<MindState>>,
|
|
) -> Self {
|
|
Self { agent, shared, task: TaskHandle::new() }
|
|
}
|
|
}
|
|
|
|
impl MindTriggered for CompareScoring {
|
|
fn trigger(&self) {
|
|
self.task.trigger(run(self.agent.clone(), self.shared.clone()));
|
|
}
|
|
}
|
|
|
|
fn resolve_test_client() -> Result<ApiClient, String> {
|
|
let cfg = crate::config::app();
|
|
let name = cfg.compare.test_backend.clone();
|
|
if name.is_empty() {
|
|
return Err("compare.test_backend not set in config".to_string());
|
|
}
|
|
let r = cfg.resolve_model(&name).map_err(|e| format!("{:#}", e))?;
|
|
Ok(ApiClient::new(&r.api_base, &r.api_key, &r.model_id))
|
|
}
|
|
|
|
async fn run(
|
|
agent: Arc<crate::agent::Agent>,
|
|
shared: Arc<std::sync::Mutex<MindState>>,
|
|
) {
|
|
{
|
|
let mut s = shared.lock().unwrap();
|
|
s.compare_candidates.clear();
|
|
s.compare_error = None;
|
|
}
|
|
agent.state.lock().await.changed.notify_one();
|
|
|
|
let activity = crate::agent::start_activity(&agent, "compare: scoring...").await;
|
|
|
|
let test_client = match resolve_test_client() {
|
|
Ok(c) => c,
|
|
Err(e) => {
|
|
shared.lock().unwrap().compare_error = Some(e);
|
|
agent.state.lock().await.changed.notify_one();
|
|
return;
|
|
}
|
|
};
|
|
|
|
let context = agent.context.lock().await.clone();
|
|
let entries = context.conversation();
|
|
let responses: Vec<usize> = entries.iter().enumerate()
|
|
.filter(|(_, n)| matches!(n, AstNode::Branch { role: Role::Assistant, .. }))
|
|
.map(|(i, _)| i).collect();
|
|
|
|
for (i, entry_idx) in responses.iter().copied().enumerate() {
|
|
activity.update(format!("compare: {}/{}", i + 1, responses.len())).await;
|
|
|
|
let node = &entries[entry_idx];
|
|
let original_text = match node {
|
|
AstNode::Branch { children, .. } => render_branch_text(children),
|
|
_ => continue,
|
|
};
|
|
if original_text.trim().is_empty() { continue; }
|
|
|
|
let alternate_text = match
|
|
gen_continuation(&context, entry_idx, |_| false, &test_client).await
|
|
{
|
|
Ok(t) => t,
|
|
Err(e) => { dbglog!("[compare] gen failed at {}: {:#}", entry_idx, e); continue; }
|
|
};
|
|
|
|
shared.lock().unwrap().compare_candidates.push(CompareCandidate {
|
|
entry_idx,
|
|
original_text,
|
|
alternate_text,
|
|
prior_context: render_prior_context(entries, entry_idx, 2),
|
|
timestamp_ns: node_timestamp_ns(node),
|
|
});
|
|
if let Ok(st) = agent.state.try_lock() { st.changed.notify_one(); }
|
|
}
|
|
}
|