forked from kent/consciousness
user: F7 compare screen
Side-by-side model comparison against the current conversation context.
Built on the MindTriggered pattern — F7 drops in as one more
CompareScoring flow next to MemoryScoring / FinetuneScoring.
Motivation: we have the VRAM on the b200 to load two versions of the
same family simultaneously (e.g. Qwen3.5 27B bf16 and q8_k_xl). Rather
than trust perplexity/KLD numbers on a generic corpus, we can measure
divergence on our actual conversations: for each assistant response,
ask the test model what it would have said given the same prefix, and
eyeball the diffs.
- config.compare.test_backend — names an entry in the existing
backends map to use as the test model. Empty = F7 reports "(unset)"
and does nothing.
- subconscious::compare::{score_compare_candidates, CompareCandidate,
CompareScoringStats, CompareScoring}. For each assistant response,
gen_continuation runs with the test client against the same prefix
the original response saw; pairs stream into
shared.compare_candidates as they complete.
- user::compare::CompareScreen — F7 in the screen list. c/Enter
triggers a run; list/detail layout mirroring F6, detail shows
prior context / original / test-model alternate.
No persistence yet — each F7 run regenerates. Caching via a context
manifest (so we can re-view without re-burning generation) is the
natural follow-up; for now light usage is fine.
Also reusable later for validating finetune checkpoints: same pattern,
swap the test backend for the new checkpoint, watch where it diverges
from the base.
Co-Authored-By: Proof of Concept <poc@bcachefs.org>
This commit is contained in:
parent
575325e855
commit
2b03dbb200
7 changed files with 301 additions and 11 deletions
109
src/subconscious/compare.rs
Normal file
109
src/subconscious/compare.rs
Normal file
|
|
@ -0,0 +1,109 @@
|
|||
// compare.rs — F7 compare: for each assistant response in the current
|
||||
// context, regenerate with a configured test model and emit pairs for
|
||||
// side-by-side review.
|
||||
|
||||
use std::sync::Arc;
|
||||
|
||||
use crate::agent::api::ApiClient;
|
||||
use crate::agent::context::{
|
||||
AstNode, Role, render_branch_text, render_prior_context,
|
||||
};
|
||||
use crate::mind::{MindState, MindTriggered, TaskHandle};
|
||||
use crate::subconscious::generate::gen_continuation;
|
||||
use crate::subconscious::learn::node_timestamp_ns;
|
||||
|
||||
/// One original/alternate pair produced by a compare run, ready for
/// side-by-side review.
#[derive(Debug, Clone)]
pub struct CompareCandidate {
    /// Index of the assistant response within the conversation entries.
    pub entry_idx: usize,
    /// Rendered text of the response as it exists in the conversation.
    pub original_text: String,
    /// Text the configured test model generated for the same position.
    pub alternate_text: String,
    /// Rendered context preceding the response, for display.
    pub prior_context: String,
    /// Timestamp of the original node, in nanoseconds.
    pub timestamp_ns: i64,
}
|
||||
|
||||
pub struct CompareScoring {
|
||||
agent: Arc<crate::agent::Agent>,
|
||||
shared: Arc<std::sync::Mutex<MindState>>,
|
||||
task: TaskHandle,
|
||||
}
|
||||
|
||||
impl CompareScoring {
|
||||
pub fn new(
|
||||
agent: Arc<crate::agent::Agent>,
|
||||
shared: Arc<std::sync::Mutex<MindState>>,
|
||||
) -> Self {
|
||||
Self { agent, shared, task: TaskHandle::new() }
|
||||
}
|
||||
}
|
||||
|
||||
impl MindTriggered for CompareScoring {
|
||||
fn trigger(&self) {
|
||||
self.task.trigger(run(self.agent.clone(), self.shared.clone()));
|
||||
}
|
||||
}
|
||||
|
||||
fn resolve_test_client() -> Result<ApiClient, String> {
|
||||
let cfg = crate::config::app();
|
||||
let name = cfg.compare.test_backend.clone();
|
||||
if name.is_empty() {
|
||||
return Err("compare.test_backend not set in config".to_string());
|
||||
}
|
||||
let r = cfg.resolve_model(&name).map_err(|e| format!("{:#}", e))?;
|
||||
Ok(ApiClient::new(&r.api_base, &r.api_key, &r.model_id))
|
||||
}
|
||||
|
||||
async fn run(
|
||||
agent: Arc<crate::agent::Agent>,
|
||||
shared: Arc<std::sync::Mutex<MindState>>,
|
||||
) {
|
||||
{
|
||||
let mut s = shared.lock().unwrap();
|
||||
s.compare_candidates.clear();
|
||||
s.compare_error = None;
|
||||
}
|
||||
agent.state.lock().await.changed.notify_one();
|
||||
|
||||
let activity = crate::agent::start_activity(&agent, "compare: scoring...").await;
|
||||
|
||||
let test_client = match resolve_test_client() {
|
||||
Ok(c) => c,
|
||||
Err(e) => {
|
||||
shared.lock().unwrap().compare_error = Some(e);
|
||||
agent.state.lock().await.changed.notify_one();
|
||||
return;
|
||||
}
|
||||
};
|
||||
|
||||
let context = agent.context.lock().await.clone();
|
||||
let entries = context.conversation();
|
||||
let responses: Vec<usize> = entries.iter().enumerate()
|
||||
.filter(|(_, n)| matches!(n, AstNode::Branch { role: Role::Assistant, .. }))
|
||||
.map(|(i, _)| i).collect();
|
||||
|
||||
for (i, entry_idx) in responses.iter().copied().enumerate() {
|
||||
activity.update(format!("compare: {}/{}", i + 1, responses.len())).await;
|
||||
|
||||
let node = &entries[entry_idx];
|
||||
let original_text = match node {
|
||||
AstNode::Branch { children, .. } => render_branch_text(children),
|
||||
_ => continue,
|
||||
};
|
||||
if original_text.trim().is_empty() { continue; }
|
||||
|
||||
let alternate_text = match
|
||||
gen_continuation(&context, entry_idx, |_| false, &test_client).await
|
||||
{
|
||||
Ok(t) => t,
|
||||
Err(e) => { dbglog!("[compare] gen failed at {}: {:#}", entry_idx, e); continue; }
|
||||
};
|
||||
|
||||
shared.lock().unwrap().compare_candidates.push(CompareCandidate {
|
||||
entry_idx,
|
||||
original_text,
|
||||
alternate_text,
|
||||
prior_context: render_prior_context(entries, entry_idx, 2),
|
||||
timestamp_ns: node_timestamp_ns(node),
|
||||
});
|
||||
if let Ok(st) = agent.state.try_lock() { st.changed.notify_one(); }
|
||||
}
|
||||
}
|
||||
|
|
@ -1,5 +1,6 @@
|
|||
// Agent layer: LLM-powered operations on the memory graph
|
||||
|
||||
pub mod compare;
|
||||
pub mod daemon;
|
||||
pub mod defs;
|
||||
pub mod digest;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue