evaluate: switch to Elo ratings with skillratings crate
Replace sort-based ranking with a proper Elo system:
- Each agent TYPE has a persistent Elo rating (agent-elo.json)
- Each matchup: pick two random types, grab a recent action from each, LLM compares, update ratings
- Ratings persist across daily evaluations — natural recency bias from continuous updates against current opponents
- K=32 for fast adaptation to prompt changes

Usage: poc-memory agent evaluate --matchups 30 --model haiku

Co-Authored-By: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
parent
e2a6bc4c8b
commit
16777924d0
4 changed files with 129 additions and 71 deletions
|
|
@ -568,9 +568,9 @@ enum AgentCmd {
|
|||
/// Evaluate agent quality via Elo-rated pairwise matchups (LLM as judge)
|
||||
#[command(name = "evaluate")]
|
||||
Evaluate {
|
||||
/// Samples per agent type
|
||||
#[arg(long, default_value_t = 5)]
|
||||
samples: usize,
|
||||
/// Number of pairwise matchups to run
|
||||
#[arg(long, default_value_t = 30)]
|
||||
matchups: usize,
|
||||
/// Model to use for comparison (haiku or sonnet)
|
||||
#[arg(long, default_value = "haiku")]
|
||||
model: String,
|
||||
|
|
@ -798,8 +798,8 @@ fn main() {
|
|||
=> cli::agent::cmd_fact_mine(&path, batch, dry_run, output.as_deref(), min_messages),
|
||||
AgentCmd::FactMineStore { path } => cli::agent::cmd_fact_mine_store(&path),
|
||||
AgentCmd::ReplayQueue { count } => cli::agent::cmd_replay_queue(count),
|
||||
AgentCmd::Evaluate { samples, model, dry_run }
|
||||
=> cli::agent::cmd_evaluate_agents(samples, &model, dry_run),
|
||||
AgentCmd::Evaluate { matchups, model, dry_run }
|
||||
=> cli::agent::cmd_evaluate_agents(matchups, &model, dry_run),
|
||||
},
|
||||
|
||||
// Admin
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue