evaluate: switch to Elo ratings with skillratings crate
Replace sort-based ranking with a proper Elo system:
- Each agent TYPE has a persistent Elo rating (agent-elo.json)
- Each matchup: pick two random types, grab a recent action from each, have the LLM compare them, then update the ratings
- Ratings persist across daily evaluations — natural recency bias from continuous updates against current opponents
- K=32 for fast adaptation to prompt changes

Usage: poc-memory agent evaluate --matchups 30 --model haiku

Co-Authored-By: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
parent
e2a6bc4c8b
commit
16777924d0
4 changed files with 129 additions and 71 deletions
7
Cargo.lock
generated
7
Cargo.lock
generated
|
|
@ -1908,6 +1908,7 @@ dependencies = [
|
|||
"rkyv",
|
||||
"serde",
|
||||
"serde_json",
|
||||
"skillratings",
|
||||
"uuid",
|
||||
]
|
||||
|
||||
|
|
@ -2615,6 +2616,12 @@ version = "0.1.5"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e"
|
||||
|
||||
[[package]]
|
||||
name = "skillratings"
|
||||
version = "0.28.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "1a6ee7559737c1adcd9184f168a04dc360c84878907c3ecc5c33c2320be1d47a"
|
||||
|
||||
[[package]]
|
||||
name = "slab"
|
||||
version = "0.4.12"
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ jobkit-daemon = { path = "../jobkit-daemon" }
|
|||
redb = "2"
|
||||
log = "0.4"
|
||||
ratatui = "0.29"
|
||||
skillratings = "0.28"
|
||||
crossterm = { version = "0.28", features = ["event-stream"] }
|
||||
|
||||
[build-dependencies]
|
||||
|
|
|
|||
|
|
@ -144,23 +144,52 @@ pub fn cmd_fact_mine_store(path: &str) -> Result<(), String> {
|
|||
|
||||
/// Sample recent actions from each agent type, sort by quality using
|
||||
/// LLM pairwise comparison, report per-type rankings.
|
||||
pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str, dry_run: bool) -> Result<(), String> {
|
||||
/// Elo ratings file path
|
||||
fn elo_path() -> std::path::PathBuf {
|
||||
crate::config::get().data_dir.join("agent-elo.json")
|
||||
}
|
||||
|
||||
/// Load persisted Elo ratings, or initialize at 1000.0
|
||||
fn load_elo_ratings(agent_types: &[&str]) -> std::collections::HashMap<String, f64> {
|
||||
let path = elo_path();
|
||||
let mut ratings: std::collections::HashMap<String, f64> = std::fs::read_to_string(&path)
|
||||
.ok()
|
||||
.and_then(|s| serde_json::from_str(&s).ok())
|
||||
.unwrap_or_default();
|
||||
for t in agent_types {
|
||||
ratings.entry(t.to_string()).or_insert(1000.0);
|
||||
}
|
||||
ratings
|
||||
}
|
||||
|
||||
fn save_elo_ratings(ratings: &std::collections::HashMap<String, f64>) {
|
||||
let path = elo_path();
|
||||
if let Ok(json) = serde_json::to_string_pretty(ratings) {
|
||||
let _ = std::fs::write(path, json);
|
||||
}
|
||||
}
|
||||
|
||||
pub fn cmd_evaluate_agents(matchups: usize, model: &str, dry_run: bool) -> Result<(), String> {
|
||||
use skillratings::elo::{elo, EloConfig, EloRating};
|
||||
use skillratings::Outcomes;
|
||||
|
||||
let store = store::Store::load()?;
|
||||
|
||||
// Collect consolidation reports grouped by agent type
|
||||
let agent_types = ["linker", "organize", "replay", "connector",
|
||||
"separator", "transfer", "distill", "rename"];
|
||||
let agent_types: Vec<&str> = vec![
|
||||
"linker", "organize", "replay", "connector",
|
||||
"separator", "transfer", "distill", "rename",
|
||||
];
|
||||
|
||||
// Load agent prompt files for context
|
||||
// Load agent prompt files
|
||||
let prompts_dir = {
|
||||
let repo = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("agents");
|
||||
if repo.is_dir() { repo } else { crate::store::memory_dir().join("agents") }
|
||||
};
|
||||
|
||||
let mut all_samples: Vec<(String, String, String)> = Vec::new(); // (agent_type, key, context)
|
||||
// Collect recent actions per agent type
|
||||
let mut actions: std::collections::HashMap<String, Vec<(String, String)>> = std::collections::HashMap::new();
|
||||
|
||||
for agent_type in &agent_types {
|
||||
// Load the agent's prompt file (skip JSON header line)
|
||||
let prompt_file = prompts_dir.join(format!("{}.agent", agent_type));
|
||||
let agent_prompt = std::fs::read_to_string(&prompt_file)
|
||||
.unwrap_or_default()
|
||||
|
|
@ -173,19 +202,19 @@ pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str, dry_run: bool)
|
|||
.map(|(k, n)| (k.clone(), n.timestamp))
|
||||
.collect();
|
||||
keys.sort_by(|a, b| b.1.cmp(&a.1));
|
||||
keys.truncate(samples_per_type);
|
||||
keys.truncate(20); // pool of recent actions to sample from
|
||||
|
||||
let mut type_actions = Vec::new();
|
||||
for (key, _) in &keys {
|
||||
let report = store.nodes.get(key)
|
||||
.map(|n| n.content.clone())
|
||||
.unwrap_or_default();
|
||||
|
||||
// Extract target node keys mentioned in the report and include their content
|
||||
let mut target_content = String::new();
|
||||
let mut seen_keys = std::collections::HashSet::new();
|
||||
let mut seen = std::collections::HashSet::new();
|
||||
for word in report.split_whitespace() {
|
||||
let clean = word.trim_matches(|c: char| !c.is_alphanumeric() && c != '-' && c != '_');
|
||||
if clean.len() > 10 && seen_keys.insert(clean.to_string()) && store.nodes.contains_key(clean) {
|
||||
if clean.len() > 10 && seen.insert(clean.to_string()) && store.nodes.contains_key(clean) {
|
||||
if let Some(node) = store.nodes.get(clean) {
|
||||
let preview = crate::util::truncate(&node.content, 200, "...");
|
||||
target_content.push_str(&format!("\n### {}\n{}\n", clean, preview));
|
||||
|
|
@ -200,76 +229,97 @@ pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str, dry_run: bool)
|
|||
crate::util::truncate(&report, 1000, "..."),
|
||||
if target_content.is_empty() { "(none found)".into() } else { target_content }
|
||||
);
|
||||
|
||||
all_samples.push((agent_type.to_string(), key.clone(), context));
|
||||
type_actions.push((key.clone(), context));
|
||||
}
|
||||
actions.insert(agent_type.to_string(), type_actions);
|
||||
}
|
||||
|
||||
if all_samples.len() < 2 {
|
||||
return Err("Not enough samples to compare".into());
|
||||
// Filter to types that have at least 1 action
|
||||
let active_types: Vec<&str> = agent_types.iter()
|
||||
.filter(|t| actions.get(**t).map(|a| !a.is_empty()).unwrap_or(false))
|
||||
.copied()
|
||||
.collect();
|
||||
|
||||
if active_types.len() < 2 {
|
||||
return Err("Need at least 2 agent types with actions".into());
|
||||
}
|
||||
|
||||
eprintln!("Collected {} samples from {} agent types", all_samples.len(), agent_types.len());
|
||||
eprintln!("Sorting with {} pairwise comparisons (model={})...",
|
||||
all_samples.len() * (all_samples.len() as f64).log2() as usize,
|
||||
model);
|
||||
eprintln!("Evaluating {} agent types with {} matchups (model={})",
|
||||
active_types.len(), matchups, model);
|
||||
|
||||
if dry_run {
|
||||
// Show what a comparison looks like without calling the LLM
|
||||
if all_samples.len() >= 2 {
|
||||
let a = &all_samples[0];
|
||||
let b = &all_samples[all_samples.len() - 1];
|
||||
let prompt = build_compare_prompt(a, b);
|
||||
println!("=== DRY RUN: Example comparison prompt ===\n");
|
||||
println!("{}", prompt);
|
||||
println!("\n=== {} samples collected, would do ~{} comparisons ===",
|
||||
all_samples.len(),
|
||||
all_samples.len() * (all_samples.len() as f64).log2() as usize);
|
||||
}
|
||||
let t1 = active_types[0];
|
||||
let t2 = active_types[active_types.len() - 1];
|
||||
let a1 = &actions[t1][0];
|
||||
let a2 = &actions[t2][0];
|
||||
let sample_a = (t1.to_string(), a1.0.clone(), a1.1.clone());
|
||||
let sample_b = (t2.to_string(), a2.0.clone(), a2.1.clone());
|
||||
println!("=== DRY RUN: Example comparison ===\n");
|
||||
println!("{}", build_compare_prompt(&sample_a, &sample_b));
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
// Sort with LLM comparator — yes, really. Rayon's parallel merge sort
|
||||
// with an LLM as the comparison function. Multiple API calls in parallel.
|
||||
let comparisons = AtomicUsize::new(0);
|
||||
use rayon::slice::ParallelSliceMut;
|
||||
all_samples.par_sort_by(|a, b| {
|
||||
let n = comparisons.fetch_add(1, Ordering::Relaxed);
|
||||
if n % 10 == 0 {
|
||||
eprint!(" {} comparisons...\r", n);
|
||||
// Load persisted ratings
|
||||
let mut ratings = load_elo_ratings(&agent_types);
|
||||
let config = EloConfig { k: 32.0 };
|
||||
let mut rng_state = std::time::SystemTime::now()
|
||||
.duration_since(std::time::UNIX_EPOCH).unwrap().subsec_nanos();
|
||||
|
||||
for i in 0..matchups {
|
||||
// Pick two different random agent types
|
||||
rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345);
|
||||
let idx_a = (rng_state as usize) % active_types.len();
|
||||
rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345);
|
||||
let mut idx_b = (rng_state as usize) % active_types.len();
|
||||
if idx_b == idx_a { idx_b = (idx_b + 1) % active_types.len(); }
|
||||
|
||||
let type_a = active_types[idx_a];
|
||||
let type_b = active_types[idx_b];
|
||||
|
||||
// Pick random recent action from each
|
||||
let acts_a = &actions[type_a];
|
||||
let acts_b = &actions[type_b];
|
||||
rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345);
|
||||
let act_a = &acts_a[(rng_state as usize) % acts_a.len()];
|
||||
rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345);
|
||||
let act_b = &acts_b[(rng_state as usize) % acts_b.len()];
|
||||
|
||||
let sample_a = (type_a.to_string(), act_a.0.clone(), act_a.1.clone());
|
||||
let sample_b = (type_b.to_string(), act_b.0.clone(), act_b.1.clone());
|
||||
|
||||
let result = llm_compare(&sample_a, &sample_b, model);
|
||||
|
||||
let rating_a = EloRating { rating: ratings[type_a] };
|
||||
let rating_b = EloRating { rating: ratings[type_b] };
|
||||
|
||||
let outcome = match result {
|
||||
Ok(std::cmp::Ordering::Less) => Outcomes::WIN, // A wins
|
||||
Ok(std::cmp::Ordering::Greater) => Outcomes::LOSS, // B wins
|
||||
_ => Outcomes::WIN, // default to A
|
||||
};
|
||||
|
||||
let (new_a, new_b) = elo(&rating_a, &rating_b, &outcome, &config);
|
||||
ratings.insert(type_a.to_string(), new_a.rating);
|
||||
ratings.insert(type_b.to_string(), new_b.rating);
|
||||
|
||||
eprint!(" matchup {}/{}: {} vs {} → {}\r",
|
||||
i + 1, matchups, type_a, type_b,
|
||||
if matches!(outcome, Outcomes::WIN) { type_a } else { type_b });
|
||||
}
|
||||
llm_compare(a, b, model).unwrap_or(std::cmp::Ordering::Equal)
|
||||
});
|
||||
eprintln!(" {} total comparisons", comparisons.load(Ordering::Relaxed));
|
||||
eprintln!();
|
||||
|
||||
let sorted = all_samples;
|
||||
// Save updated ratings
|
||||
save_elo_ratings(&ratings);
|
||||
|
||||
// Print ranked results
|
||||
println!("\nAgent Action Ranking (best → worst):\n");
|
||||
for (rank, (agent_type, key, summary)) in sorted.iter().enumerate() {
|
||||
let preview = if summary.len() > 80 { &summary[..80] } else { summary };
|
||||
println!(" {:3}. [{:10}] {} — {}", rank + 1, agent_type, key, preview);
|
||||
}
|
||||
// Print rankings
|
||||
let mut ranked: Vec<_> = ratings.iter().collect();
|
||||
ranked.sort_by(|a, b| b.1.total_cmp(a.1));
|
||||
|
||||
// Compute per-type average rank
|
||||
println!("\nPer-type average rank (lower = better):\n");
|
||||
let n = sorted.len() as f64;
|
||||
let mut type_ranks: std::collections::HashMap<&str, Vec<usize>> = std::collections::HashMap::new();
|
||||
for (rank, (agent_type, _, _)) in sorted.iter().enumerate() {
|
||||
type_ranks.entry(agent_type).or_default().push(rank + 1);
|
||||
}
|
||||
let mut avgs: Vec<(&str, f64, usize)> = type_ranks.iter()
|
||||
.map(|(t, ranks)| {
|
||||
let avg = ranks.iter().sum::<usize>() as f64 / ranks.len() as f64;
|
||||
(*t, avg, ranks.len())
|
||||
})
|
||||
.collect();
|
||||
avgs.sort_by(|a, b| a.1.total_cmp(&b.1));
|
||||
|
||||
for (agent_type, avg_rank, count) in &avgs {
|
||||
let quality = 1.0 - (avg_rank / n);
|
||||
println!(" {:12} avg_rank={:5.1} quality={:.2} (n={})",
|
||||
agent_type, avg_rank, quality, count);
|
||||
println!("\nAgent Elo Ratings (after {} matchups):\n", matchups);
|
||||
for (agent_type, rating) in &ranked {
|
||||
let bar_len = ((*rating - 800.0) / 10.0).max(0.0) as usize;
|
||||
let bar = "#".repeat(bar_len.min(40));
|
||||
println!(" {:12} {:7.1} {}", agent_type, rating, bar);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
|
|
|
|||
|
|
@ -568,9 +568,9 @@ enum AgentCmd {
|
|||
/// Evaluate agent quality by LLM-sorted ranking
|
||||
#[command(name = "evaluate")]
|
||||
Evaluate {
|
||||
/// Samples per agent type
|
||||
#[arg(long, default_value_t = 5)]
|
||||
samples: usize,
|
||||
/// Number of pairwise matchups to run
|
||||
#[arg(long, default_value_t = 30)]
|
||||
matchups: usize,
|
||||
/// Model to use for comparison (haiku or sonnet)
|
||||
#[arg(long, default_value = "haiku")]
|
||||
model: String,
|
||||
|
|
@ -798,8 +798,8 @@ fn main() {
|
|||
=> cli::agent::cmd_fact_mine(&path, batch, dry_run, output.as_deref(), min_messages),
|
||||
AgentCmd::FactMineStore { path } => cli::agent::cmd_fact_mine_store(&path),
|
||||
AgentCmd::ReplayQueue { count } => cli::agent::cmd_replay_queue(count),
|
||||
AgentCmd::Evaluate { samples, model, dry_run }
|
||||
=> cli::agent::cmd_evaluate_agents(samples, &model, dry_run),
|
||||
AgentCmd::Evaluate { matchups, model, dry_run }
|
||||
=> cli::agent::cmd_evaluate_agents(matchups, &model, dry_run),
|
||||
},
|
||||
|
||||
// Admin
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue