diff --git a/Cargo.lock b/Cargo.lock index 5937877..0f6d559 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1908,6 +1908,7 @@ dependencies = [ "rkyv", "serde", "serde_json", + "skillratings", "uuid", ] @@ -2615,6 +2616,12 @@ version = "0.1.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e3a9fe34e3e7a50316060351f37187a3f546bce95496156754b601a5fa71b76e" +[[package]] +name = "skillratings" +version = "0.28.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a6ee7559737c1adcd9184f168a04dc360c84878907c3ecc5c33c2320be1d47a" + [[package]] name = "slab" version = "0.4.12" diff --git a/poc-memory/Cargo.toml b/poc-memory/Cargo.toml index 0158743..d840475 100644 --- a/poc-memory/Cargo.toml +++ b/poc-memory/Cargo.toml @@ -24,6 +24,7 @@ jobkit-daemon = { path = "../jobkit-daemon" } redb = "2" log = "0.4" ratatui = "0.29" +skillratings = "0.28" crossterm = { version = "0.28", features = ["event-stream"] } [build-dependencies] diff --git a/poc-memory/src/cli/agent.rs b/poc-memory/src/cli/agent.rs index 8ec9dea..24a7cc1 100644 --- a/poc-memory/src/cli/agent.rs +++ b/poc-memory/src/cli/agent.rs @@ -144,23 +144,52 @@ pub fn cmd_fact_mine_store(path: &str) -> Result<(), String> { /// Sample recent actions from each agent type, sort by quality using /// LLM pairwise comparison, report per-type rankings. 
-pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str, dry_run: bool) -> Result<(), String> { +/// Elo ratings file path +fn elo_path() -> std::path::PathBuf { + crate::config::get().data_dir.join("agent-elo.json") +} + +/// Load persisted Elo ratings, or initialize at 1000.0 +fn load_elo_ratings(agent_types: &[&str]) -> std::collections::HashMap<String, f64> { + let path = elo_path(); + let mut ratings: std::collections::HashMap<String, f64> = std::fs::read_to_string(&path) + .ok() + .and_then(|s| serde_json::from_str(&s).ok()) + .unwrap_or_default(); + for t in agent_types { + ratings.entry(t.to_string()).or_insert(1000.0); + } + ratings +} + +fn save_elo_ratings(ratings: &std::collections::HashMap<String, f64>) { + let path = elo_path(); + if let Ok(json) = serde_json::to_string_pretty(ratings) { + let _ = std::fs::write(path, json); + } +} + +pub fn cmd_evaluate_agents(matchups: usize, model: &str, dry_run: bool) -> Result<(), String> { + use skillratings::elo::{elo, EloConfig, EloRating}; + use skillratings::Outcomes; + let store = store::Store::load()?; - // Collect consolidation reports grouped by agent type - let agent_types = ["linker", "organize", "replay", "connector", - "separator", "transfer", "distill", "rename"]; + let agent_types: Vec<&str> = vec![ + "linker", "organize", "replay", "connector", + "separator", "transfer", "distill", "rename", + ]; - // Load agent prompt files for context + // Load agent prompt files let prompts_dir = { let repo = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("agents"); if repo.is_dir() { repo } else { crate::store::memory_dir().join("agents") } }; - let mut all_samples: Vec<(String, String, String)> = Vec::new(); // (agent_type, key, context) + // Collect recent actions per agent type + let mut actions: std::collections::HashMap<String, Vec<(String, String)>> = std::collections::HashMap::new(); for agent_type in &agent_types { - // Load the agent's prompt file (skip JSON header line) let prompt_file = prompts_dir.join(format!("{}.agent", agent_type)); let 
agent_prompt = std::fs::read_to_string(&prompt_file) .unwrap_or_default() @@ -173,19 +202,19 @@ pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str, dry_run: bool) .map(|(k, n)| (k.clone(), n.timestamp)) .collect(); keys.sort_by(|a, b| b.1.cmp(&a.1)); - keys.truncate(samples_per_type); + keys.truncate(20); // pool of recent actions to sample from + let mut type_actions = Vec::new(); for (key, _) in &keys { let report = store.nodes.get(key) .map(|n| n.content.clone()) .unwrap_or_default(); - // Extract target node keys mentioned in the report and include their content let mut target_content = String::new(); - let mut seen_keys = std::collections::HashSet::new(); + let mut seen = std::collections::HashSet::new(); for word in report.split_whitespace() { let clean = word.trim_matches(|c: char| !c.is_alphanumeric() && c != '-' && c != '_'); - if clean.len() > 10 && seen_keys.insert(clean.to_string()) && store.nodes.contains_key(clean) { + if clean.len() > 10 && seen.insert(clean.to_string()) && store.nodes.contains_key(clean) { if let Some(node) = store.nodes.get(clean) { let preview = crate::util::truncate(&node.content, 200, "..."); target_content.push_str(&format!("\n### {}\n{}\n", clean, preview)); @@ -200,76 +229,97 @@ pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str, dry_run: bool) crate::util::truncate(&report, 1000, "..."), if target_content.is_empty() { "(none found)".into() } else { target_content } ); - - all_samples.push((agent_type.to_string(), key.clone(), context)); + type_actions.push((key.clone(), context)); } + actions.insert(agent_type.to_string(), type_actions); } - if all_samples.len() < 2 { - return Err("Not enough samples to compare".into()); + // Filter to types that have at least 1 action + let active_types: Vec<&str> = agent_types.iter() + .filter(|t| actions.get(**t).map(|a| !a.is_empty()).unwrap_or(false)) + .copied() + .collect(); + + if active_types.len() < 2 { + return Err("Need at least 2 agent types with 
actions".into()); } - eprintln!("Collected {} samples from {} agent types", all_samples.len(), agent_types.len()); - eprintln!("Sorting with {} pairwise comparisons (model={})...", - all_samples.len() * (all_samples.len() as f64).log2() as usize, - model); + eprintln!("Evaluating {} agent types with {} matchups (model={})", + active_types.len(), matchups, model); if dry_run { - // Show what a comparison looks like without calling the LLM - if all_samples.len() >= 2 { - let a = &all_samples[0]; - let b = &all_samples[all_samples.len() - 1]; - let prompt = build_compare_prompt(a, b); - println!("=== DRY RUN: Example comparison prompt ===\n"); - println!("{}", prompt); - println!("\n=== {} samples collected, would do ~{} comparisons ===", - all_samples.len(), - all_samples.len() * (all_samples.len() as f64).log2() as usize); - } + let t1 = active_types[0]; + let t2 = active_types[active_types.len() - 1]; + let a1 = &actions[t1][0]; + let a2 = &actions[t2][0]; + let sample_a = (t1.to_string(), a1.0.clone(), a1.1.clone()); + let sample_b = (t2.to_string(), a2.0.clone(), a2.1.clone()); + println!("=== DRY RUN: Example comparison ===\n"); + println!("{}", build_compare_prompt(&sample_a, &sample_b)); return Ok(()); } - // Sort with LLM comparator — yes, really. Rayon's parallel merge sort - // with an LLM as the comparison function. Multiple API calls in parallel. 
- let comparisons = AtomicUsize::new(0); - use rayon::slice::ParallelSliceMut; - all_samples.par_sort_by(|a, b| { - let n = comparisons.fetch_add(1, Ordering::Relaxed); - if n % 10 == 0 { - eprint!(" {} comparisons...\r", n); - } - llm_compare(a, b, model).unwrap_or(std::cmp::Ordering::Equal) - }); - eprintln!(" {} total comparisons", comparisons.load(Ordering::Relaxed)); + // Load persisted ratings + let mut ratings = load_elo_ratings(&agent_types); + let config = EloConfig { k: 32.0 }; + let mut rng_state = std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH).unwrap().subsec_nanos(); - let sorted = all_samples; + for i in 0..matchups { + // Pick two different random agent types + rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345); + let idx_a = (rng_state as usize) % active_types.len(); + rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345); + let mut idx_b = (rng_state as usize) % active_types.len(); + if idx_b == idx_a { idx_b = (idx_b + 1) % active_types.len(); } - // Print ranked results - println!("\nAgent Action Ranking (best → worst):\n"); - for (rank, (agent_type, key, summary)) in sorted.iter().enumerate() { - let preview = if summary.len() > 80 { &summary[..80] } else { summary }; - println!(" {:3}. 
[{:10}] {} — {}", rank + 1, agent_type, key, preview); + let type_a = active_types[idx_a]; + let type_b = active_types[idx_b]; + + // Pick random recent action from each + let acts_a = &actions[type_a]; + let acts_b = &actions[type_b]; + rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345); + let act_a = &acts_a[(rng_state as usize) % acts_a.len()]; + rng_state = rng_state.wrapping_mul(1103515245).wrapping_add(12345); + let act_b = &acts_b[(rng_state as usize) % acts_b.len()]; + + let sample_a = (type_a.to_string(), act_a.0.clone(), act_a.1.clone()); + let sample_b = (type_b.to_string(), act_b.0.clone(), act_b.1.clone()); + + let result = llm_compare(&sample_a, &sample_b, model); + + let rating_a = EloRating { rating: ratings[type_a] }; + let rating_b = EloRating { rating: ratings[type_b] }; + + let outcome = match result { + Ok(std::cmp::Ordering::Less) => Outcomes::WIN, // A wins + Ok(std::cmp::Ordering::Greater) => Outcomes::LOSS, // B wins + _ => Outcomes::WIN, // default to A + }; + + let (new_a, new_b) = elo(&rating_a, &rating_b, &outcome, &config); + ratings.insert(type_a.to_string(), new_a.rating); + ratings.insert(type_b.to_string(), new_b.rating); + + eprint!(" matchup {}/{}: {} vs {} → {}\r", + i + 1, matchups, type_a, type_b, + if matches!(outcome, Outcomes::WIN) { type_a } else { type_b }); } + eprintln!(); - // Compute per-type average rank - println!("\nPer-type average rank (lower = better):\n"); - let n = sorted.len() as f64; - let mut type_ranks: std::collections::HashMap<&str, Vec<usize>> = std::collections::HashMap::new(); - for (rank, (agent_type, _, _)) in sorted.iter().enumerate() { - type_ranks.entry(agent_type).or_default().push(rank + 1); - } - let mut avgs: Vec<(&str, f64, usize)> = type_ranks.iter() - .map(|(t, ranks)| { - let avg = ranks.iter().sum::<usize>() as f64 / ranks.len() as f64; - (*t, avg, ranks.len()) - }) - .collect(); - avgs.sort_by(|a, b| a.1.total_cmp(&b.1)); + // Save updated ratings + save_elo_ratings(&ratings); - for 
(agent_type, avg_rank, count) in &avgs { - let quality = 1.0 - (avg_rank / n); - println!(" {:12} avg_rank={:5.1} quality={:.2} (n={})", - agent_type, avg_rank, quality, count); + // Print rankings + let mut ranked: Vec<_> = ratings.iter().collect(); + ranked.sort_by(|a, b| b.1.total_cmp(a.1)); + + println!("\nAgent Elo Ratings (after {} matchups):\n", matchups); + for (agent_type, rating) in &ranked { + let bar_len = ((*rating - 800.0) / 10.0).max(0.0) as usize; + let bar = "#".repeat(bar_len.min(40)); + println!(" {:12} {:7.1} {}", agent_type, rating, bar); } Ok(()) diff --git a/poc-memory/src/main.rs b/poc-memory/src/main.rs index c01cfad..bf479d9 100644 --- a/poc-memory/src/main.rs +++ b/poc-memory/src/main.rs @@ -568,9 +568,9 @@ enum AgentCmd { /// Evaluate agent quality by LLM-sorted ranking #[command(name = "evaluate")] Evaluate { - /// Samples per agent type - #[arg(long, default_value_t = 5)] - samples: usize, + /// Number of pairwise matchups to run + #[arg(long, default_value_t = 30)] + matchups: usize, /// Model to use for comparison (haiku or sonnet) #[arg(long, default_value = "haiku")] model: String, @@ -798,8 +798,8 @@ fn main() { => cli::agent::cmd_fact_mine(&path, batch, dry_run, output.as_deref(), min_messages), AgentCmd::FactMineStore { path } => cli::agent::cmd_fact_mine_store(&path), AgentCmd::ReplayQueue { count } => cli::agent::cmd_replay_queue(count), - AgentCmd::Evaluate { samples, model, dry_run } - => cli::agent::cmd_evaluate_agents(samples, &model, dry_run), + AgentCmd::Evaluate { matchups, model, dry_run } + => cli::agent::cmd_evaluate_agents(matchups, &model, dry_run), }, // Admin