agent evaluate: sort agent actions by quality using Vec::sort_by with LLM

Yes, really: Rust's stdlib sort_by with an LLM as the pairwise comparator. Each comparison is an API call asking "which action was better?" Sample N actions per agent type, put them all in one Vec, and sort it. Where each agent's samples cluster in the sorted order gives that agent's quality score. Reports the per-type average rank and quality ratio. Supports both haiku (fast/cheap) and sonnet (quality) as the comparator.

Usage: poc-memory agent evaluate --samples 5 --model haiku

Co-Authored-By: Kent Overstreet <kent.overstreet@linux.dev>
2026-03-14 19:24:07 -04:00 · 2026-03-14 19:24:07 -04:00 · e12dea503b
commit e12dea503b
parent dce938e906
3 changed files with 169 additions and 0 deletions
--- a/poc-memory/src/main.rs
+++ b/poc-memory/src/main.rs
@@ -565,6 +565,16 @@ enum AgentCmd {
        #[arg(long, default_value_t = 10)]
        count: usize,
    },
+    /// Evaluate agent quality by LLM-sorted ranking
+    #[command(name = "evaluate")]
+    Evaluate {
+        /// Samples per agent type
+        #[arg(long, default_value_t = 5)]
+        samples: usize,
+        /// Model to use for comparison (haiku or sonnet)
+        #[arg(long, default_value = "haiku")]
+        model: String,
+    },
 }

 #[derive(Subcommand)]
@@ -785,6 +795,7 @@ fn main() {
                => cli::agent::cmd_fact_mine(&path, batch, dry_run, output.as_deref(), min_messages),
            AgentCmd::FactMineStore { path } => cli::agent::cmd_fact_mine_store(&path),
            AgentCmd::ReplayQueue { count } => cli::agent::cmd_replay_queue(count),
+            AgentCmd::Evaluate { samples, model } => cli::agent::cmd_evaluate_agents(samples, &model),
        },

        // Admin