agent evaluate: sort agent actions by quality using Vec::sort_by with LLM

Yes, really. Rust's stdlib sort_by with an LLM pairwise comparator.
Each comparison is an API call asking "which action was better?"

Sample N actions per agent type, throw them all in a Vec, sort.
Where each agent's samples cluster = that agent's quality score.
Reports per-type average rank and quality ratio.

Supports both haiku (fast/cheap) and sonnet (quality) as comparator.

Usage: poc-memory agent evaluate --samples 5 --model haiku

Co-Authored-By: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
ProofOfConcept 2026-03-14 19:24:07 -04:00
parent dce938e906
commit e12dea503b
3 changed files with 169 additions and 0 deletions

View file

@@ -2,6 +2,7 @@
use crate::store;
use crate::store::StoreView;
use crate::agents::llm;
pub fn cmd_consolidate_batch(count: usize, auto: bool, agent: Option<String>) -> Result<(), String> {
let store = store::Store::load()?;
@@ -140,3 +141,115 @@ pub fn cmd_fact_mine_store(path: &str) -> Result<(), String> {
Ok(())
}
/// Sample recent actions from each agent type, sort by quality using
/// LLM pairwise comparison, report per-type rankings.
/// Sample recent actions from each agent type, sort by quality using
/// LLM pairwise comparison, report per-type rankings.
///
/// `samples_per_type` caps how many of the newest consolidation reports
/// are pulled per agent type. `model` selects the comparator: "haiku"
/// (fast/cheap); anything else falls through to sonnet (quality).
///
/// Returns Err when the store fails to load or fewer than two samples
/// exist to compare.
pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str) -> Result<(), String> {
    let store = store::Store::load()?;
    // Collect consolidation reports grouped by agent type.
    let agent_types = ["linker", "organize", "replay", "connector",
                       "separator", "transfer", "distill", "rename"];
    let mut all_samples: Vec<(String, String, String)> = Vec::new(); // (agent_type, key, summary)
    for agent_type in &agent_types {
        let prefix = format!("_consolidate-{}", agent_type);
        let mut keys: Vec<(String, i64)> = store.nodes.iter()
            .filter(|(k, _)| k.starts_with(&prefix))
            .map(|(k, n)| (k.clone(), n.timestamp))
            .collect();
        keys.sort_by(|a, b| b.1.cmp(&a.1)); // newest first
        keys.truncate(samples_per_type);
        for (key, _) in &keys {
            let content = store.nodes.get(key)
                .map(|n| crate::util::truncate(&n.content, 500, "..."))
                .unwrap_or_default();
            all_samples.push((agent_type.to_string(), key.clone(), content));
        }
    }
    if all_samples.len() < 2 {
        return Err("Not enough samples to compare".into());
    }
    eprintln!("Collected {} samples from {} agent types", all_samples.len(), agent_types.len());
    // n * log2(n) is only an estimate of how many comparisons sort_by
    // performs; round up rather than truncate and say so in the output.
    eprintln!("Sorting with ~{} pairwise comparisons (model={})...",
        all_samples.len() * (all_samples.len() as f64).log2().ceil() as usize,
        model);
    // Sort with LLM comparator — yes, really. Rust's sort_by with an
    // LLM as the comparison function. Each comparison is an API call.
    // NOTE(review): an LLM comparator is not guaranteed to be a strict
    // total order; std's sort may produce an unspecified order (or, in
    // recent Rust, panic) on inconsistent comparators. API failures are
    // deliberately degraded to Ordering::Equal below so one flaky call
    // does not abort the whole evaluation.
    let mut comparisons = 0usize;
    all_samples.sort_by(|a, b| {
        comparisons += 1;
        if comparisons % 10 == 0 {
            eprint!(" {} comparisons...\r", comparisons);
        }
        llm_compare(a, b, model).unwrap_or(std::cmp::Ordering::Equal)
    });
    eprintln!(" {} total comparisons", comparisons);
    let sorted = all_samples;
    // Print ranked results.
    println!("\nAgent Action Ranking (best → worst):\n");
    for (rank, (agent_type, key, summary)) in sorted.iter().enumerate() {
        // Truncate on char boundaries: byte-slicing `&summary[..80]`
        // panics when byte 80 falls inside a multi-byte UTF-8 char,
        // which LLM-generated summaries routinely contain.
        let preview: String = summary.chars().take(80).collect();
        println!(" {:3}. [{:10}] {} {}", rank + 1, agent_type, key, preview);
    }
    // Compute per-type average rank.
    println!("\nPer-type average rank (lower = better):\n");
    let n = sorted.len() as f64;
    let mut type_ranks: std::collections::HashMap<&str, Vec<usize>> = std::collections::HashMap::new();
    for (rank, (agent_type, _, _)) in sorted.iter().enumerate() {
        type_ranks.entry(agent_type).or_default().push(rank + 1);
    }
    let mut avgs: Vec<(&str, f64, usize)> = type_ranks.iter()
        .map(|(t, ranks)| {
            let avg = ranks.iter().sum::<usize>() as f64 / ranks.len() as f64;
            (*t, avg, ranks.len())
        })
        .collect();
    avgs.sort_by(|a, b| a.1.total_cmp(&b.1));
    for (agent_type, avg_rank, count) in &avgs {
        // quality in [0, 1): 1.0 would mean every sample ranked first.
        let quality = 1.0 - (avg_rank / n);
        println!(" {:12} avg_rank={:5.1} quality={:.2} (n={})",
            agent_type, avg_rank, quality, count);
    }
    Ok(())
}
/// Ask the LLM which of two sampled agent actions built the better
/// knowledge graph, and map its reply onto a sort ordering.
///
/// Returns `Less` when A wins (A sorts first), `Greater` when B wins,
/// `Equal` for a declared tie or an unrecognized reply, and Err only
/// when the underlying API call fails.
fn llm_compare(
    a: &(String, String, String),
    b: &(String, String, String),
    model: &str,
) -> Result<std::cmp::Ordering, String> {
    use std::cmp::Ordering;

    let prompt = format!(
        "Compare these two memory graph agent actions. Which one was better \
         for building a useful, well-organized knowledge graph?\n\n\
         ## Action A ({} agent)\n{}\n\n\
         ## Action B ({} agent)\n{}\n\n\
         Reply with ONLY: BETTER: A or BETTER: B or BETTER: TIE",
        a.0, a.2, b.0, b.2
    );
    // "haiku" picks the fast/cheap comparator; everything else goes to
    // sonnet, matching the CLI's documented haiku-or-sonnet choice.
    let verdict = match model {
        "haiku" => llm::call_haiku("compare", &prompt)?,
        _ => llm::call_sonnet("compare", &prompt)?,
    }
    .trim()
    .to_uppercase();

    let ordering = if verdict.contains("BETTER: A") {
        Ordering::Less // A is better = A comes first
    } else if verdict.contains("BETTER: B") {
        Ordering::Greater
    } else {
        Ordering::Equal // explicit tie, or a reply we couldn't parse
    };
    Ok(ordering)
}

View file

@@ -565,6 +565,16 @@ enum AgentCmd {
#[arg(long, default_value_t = 10)]
count: usize,
},
/// Evaluate agent quality by LLM-sorted ranking
#[command(name = "evaluate")]
Evaluate {
/// Samples per agent type
#[arg(long, default_value_t = 5)]
samples: usize,
/// Model to use for comparison (haiku or sonnet)
#[arg(long, default_value = "haiku")]
model: String,
},
}
#[derive(Subcommand)]
@@ -785,6 +795,7 @@ fn main() {
=> cli::agent::cmd_fact_mine(&path, batch, dry_run, output.as_deref(), min_messages),
AgentCmd::FactMineStore { path } => cli::agent::cmd_fact_mine_store(&path),
AgentCmd::ReplayQueue { count } => cli::agent::cmd_replay_queue(count),
AgentCmd::Evaluate { samples, model } => cli::agent::cmd_evaluate_agents(samples, &model),
},
// Admin