agent evaluate: sort agent actions by quality using Vec::sort_by with LLM
Yes, really: Rust's stdlib sort_by with an LLM as the pairwise comparator. Each comparison is an API call asking "which action was better?"

Sample N actions per agent type, put them all in a single Vec, and sort it. Where each agent type's samples cluster in the sorted order is that agent's quality score. Reports the per-type average rank and quality ratio.

Supports both haiku (fast/cheap) and sonnet (higher quality) as the comparator.

Usage: poc-memory agent evaluate --samples 5 --model haiku

Co-Authored-By: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
parent
dce938e906
commit
e12dea503b
3 changed files with 169 additions and 0 deletions
|
|
@ -565,6 +565,16 @@ enum AgentCmd {
|
|||
#[arg(long, default_value_t = 10)]
|
||||
count: usize,
|
||||
},
|
||||
/// Evaluate agent quality by LLM-sorted ranking
|
||||
#[command(name = "evaluate")]
|
||||
Evaluate {
|
||||
/// Samples per agent type
|
||||
#[arg(long, default_value_t = 5)]
|
||||
samples: usize,
|
||||
/// Model to use for comparison (haiku or sonnet)
|
||||
#[arg(long, default_value = "haiku")]
|
||||
model: String,
|
||||
},
|
||||
}
|
||||
|
||||
#[derive(Subcommand)]
|
||||
|
|
@ -785,6 +795,7 @@ fn main() {
|
|||
=> cli::agent::cmd_fact_mine(&path, batch, dry_run, output.as_deref(), min_messages),
|
||||
AgentCmd::FactMineStore { path } => cli::agent::cmd_fact_mine_store(&path),
|
||||
AgentCmd::ReplayQueue { count } => cli::agent::cmd_replay_queue(count),
|
||||
AgentCmd::Evaluate { samples, model } => cli::agent::cmd_evaluate_agents(samples, &model),
|
||||
},
|
||||
|
||||
// Admin
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue