agent evaluate: sort agent actions by quality using Vec::sort_by with LLM

Yes, really: Rust's stdlib sort_by with an LLM pairwise comparator. Each comparison is an API call asking "which action was better?" Sample N actions per agent type, put them all in one Vec, and sort. Where each agent's samples cluster in the sorted order gives that agent's quality score. Reports per-type average rank and a quality ratio. Supports both haiku (fast/cheap) and sonnet (higher quality) as the comparator.

Usage: poc-memory agent evaluate --samples 5 --model haiku

Co-Authored-By: Kent Overstreet <kent.overstreet@linux.dev>
2026-03-14 19:24:07 -04:00 · 2026-03-14 19:24:07 -04:00 · e12dea503b
commit e12dea503b
parent dce938e906
3 changed files with 169 additions and 0 deletions
--- a/poc-memory/src/cli/agent.rs
+++ b/poc-memory/src/cli/agent.rs
@@ -2,6 +2,7 @@

 use crate::store;
 use crate::store::StoreView;
+use crate::agents::llm;

 pub fn cmd_consolidate_batch(count: usize, auto: bool, agent: Option<String>) -> Result<(), String> {
    let store = store::Store::load()?;
@@ -140,3 +141,115 @@ pub fn cmd_fact_mine_store(path: &str) -> Result<(), String> {
    Ok(())
 }

+/// Rank recent agent actions by quality and print a per-agent-type summary.
+///
+/// For each known agent type, the newest `samples_per_type` store nodes whose
+/// key starts with `_consolidate-<type>` are sampled. All samples go into one
+/// `Vec`, which is sorted with `sort_by` using `llm_compare` as the
+/// comparator, so every comparison is one LLM call. The sorted list is printed
+/// best-first, followed by each type's average rank and
+/// `quality = 1 - avg_rank / total_samples`.
+///
+/// `model` is passed through unchanged to `llm_compare`.
+///
+/// # Errors
+///
+/// Returns an error if the store cannot be loaded or if fewer than two samples
+/// were found. A failed comparison is not fatal: it is counted, reported on
+/// stderr, and treated as a tie.
+pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str) -> Result<(), String> {
+    let store = store::Store::load()?;
+
+    // Collect consolidation reports grouped by agent type
+    let agent_types = ["linker", "organize", "replay", "connector",
+                       "separator", "transfer", "distill", "rename"];
+
+    let mut all_samples: Vec<(String, String, String)> = Vec::new(); // (agent_type, key, summary)
+
+    for agent_type in &agent_types {
+        let prefix = format!("_consolidate-{}", agent_type);
+        let mut keys: Vec<(String, i64)> = store.nodes.iter()
+            .filter(|(k, _)| k.starts_with(&prefix))
+            .map(|(k, n)| (k.clone(), n.timestamp))
+            .collect();
+        keys.sort_by(|a, b| b.1.cmp(&a.1)); // newest first
+        keys.truncate(samples_per_type);
+
+        for (key, _) in &keys {
+            let content = store.nodes.get(key)
+                .map(|n| crate::util::truncate(&n.content, 500, "..."))
+                .unwrap_or_default();
+            all_samples.push((agent_type.to_string(), key.clone(), content));
+        }
+    }
+
+    let total = all_samples.len();
+    if total < 2 {
+        return Err("Not enough samples to compare".into());
+    }
+
+    // Rough n * log2(n) estimate. The product is formed in floating point and
+    // rounded once, so the logarithm is not truncated before multiplying.
+    let estimated = (total as f64 * (total as f64).log2()).round() as usize;
+
+    eprintln!("Collected {} samples from {} agent types", total, agent_types.len());
+    eprintln!("Sorting with {} pairwise comparisons (model={})...",
+        estimated,
+        model);
+
+    // Sort with LLM comparator — yes, really. Rust's sort_by with an
+    // LLM as the comparison function. Each comparison is an API call.
+    //
+    // NOTE(review): since Rust 1.81 `sort_by` may panic when the comparator is
+    // not a total order, and LLM verdicts are not guaranteed to be consistent
+    // or transitive. Worth confirming this is acceptable for this tool.
+    let mut comparisons = 0usize;
+    let mut failures = 0usize;
+    all_samples.sort_by(|a, b| {
+        comparisons += 1;
+        if comparisons % 10 == 0 {
+            eprint!("  {} comparisons...\r", comparisons);
+        }
+        match llm_compare(a, b, model) {
+            Ok(ordering) => ordering,
+            Err(err) => {
+                // Best effort: a failed call counts as a tie, but is no longer
+                // silent. Only the first error is printed to avoid flooding.
+                failures += 1;
+                if failures == 1 {
+                    eprintln!("  warning: comparison failed, treating as tie: {}", err);
+                }
+                std::cmp::Ordering::Equal
+            }
+        }
+    });
+    eprintln!("  {} total comparisons", comparisons);
+    if failures > 0 {
+        eprintln!("  warning: {} of {} comparisons failed and were counted as ties; ranking may be unreliable",
+            failures, comparisons);
+    }
+
+    let sorted = all_samples;
+
+    // Print ranked results
+    println!("\nAgent Action Ranking (best → worst):\n");
+    for (rank, (agent_type, key, summary)) in sorted.iter().enumerate() {
+        // Cut the preview at 80 bytes, backing up to the nearest char boundary
+        // so a multi-byte UTF-8 character straddling byte 80 cannot panic.
+        let mut end = summary.len().min(80);
+        while !summary.is_char_boundary(end) {
+            end -= 1;
+        }
+        let preview = &summary[..end];
+        println!("  {:3}. [{:10}] {} — {}", rank + 1, agent_type, key, preview);
+    }
+
+    // Compute per-type average rank
+    println!("\nPer-type average rank (lower = better):\n");
+    let n = sorted.len() as f64;
+    let mut type_ranks: std::collections::HashMap<&str, Vec<usize>> = std::collections::HashMap::new();
+    for (rank, (agent_type, _, _)) in sorted.iter().enumerate() {
+        type_ranks.entry(agent_type.as_str()).or_default().push(rank + 1);
+    }
+    let mut avgs: Vec<(&str, f64, usize)> = type_ranks.iter()
+        .map(|(t, ranks)| {
+            let avg = ranks.iter().sum::<usize>() as f64 / ranks.len() as f64;
+            (*t, avg, ranks.len())
+        })
+        .collect();
+    // Best average first; equal averages fall back to the type name so the
+    // output order does not depend on HashMap iteration order.
+    avgs.sort_by(|a, b| a.1.total_cmp(&b.1).then_with(|| a.0.cmp(b.0)));
+
+    for (agent_type, avg_rank, count) in &avgs {
+        let quality = 1.0 - (avg_rank / n);
+        println!("  {:12} avg_rank={:5.1}  quality={:.2}  (n={})",
+            agent_type, avg_rank, quality, count);
+    }
+
+    Ok(())
+}
+
+/// Ask the LLM which of two sampled agent actions was better.
+///
+/// Each sample is an `(agent_type, key, summary)` tuple; only the agent type
+/// and the summary are placed in the prompt. `model == "haiku"` goes through
+/// `llm::call_haiku`; every other value goes through `llm::call_sonnet`.
+///
+/// Returns `Ordering::Less` when the reply marks A as better (so A sorts
+/// first), `Ordering::Greater` when it marks B, and `Ordering::Equal` for a
+/// tie or any reply containing neither marker. Errors from the LLM call are
+/// propagated to the caller.
+fn llm_compare(
+    a: &(String, String, String),
+    b: &(String, String, String),
+    model: &str,
+) -> Result<std::cmp::Ordering, String> {
+    use std::cmp::Ordering;
+
+    let (kind_a, _, summary_a) = a;
+    let (kind_b, _, summary_b) = b;
+
+    let prompt = format!(
+        "Compare these two memory graph agent actions. Which one was better \
+         for building a useful, well-organized knowledge graph?\n\n\
+         ## Action A ({} agent)\n{}\n\n\
+         ## Action B ({} agent)\n{}\n\n\
+         Reply with ONLY: BETTER: A  or  BETTER: B  or  BETTER: TIE",
+        kind_a, summary_a, kind_b, summary_b
+    );
+
+    let reply = match model {
+        "haiku" => llm::call_haiku("compare", &prompt)?,
+        _ => llm::call_sonnet("compare", &prompt)?,
+    };
+
+    // Matching is case-insensitive: the reply is upper-cased before the scan.
+    let verdict = reply.trim().to_uppercase();
+    let picked_a = verdict.contains("BETTER: A");
+    let picked_b = verdict.contains("BETTER: B");
+
+    // A is checked first, so a reply containing both markers counts as A.
+    Ok(match (picked_a, picked_b) {
+        (true, _) => Ordering::Less, // A is better = A comes first
+        (false, true) => Ordering::Greater,
+        (false, false) => Ordering::Equal,
+    })
+}
+