From e12dea503b0f17cfdc682ca04f89e5449e961834 Mon Sep 17 00:00:00 2001
From: ProofOfConcept <poc@bcachefs.org>
Date: Sat, 14 Mar 2026 19:24:07 -0400
Subject: [PATCH] agent evaluate: sort agent actions by quality using
 Vec::sort_by with LLM

Yes, really. Rust's stdlib sort_by with an LLM pairwise comparator.
Each comparison is an API call asking "which action was better?"

Sample N actions per agent type, throw them all in a Vec, sort.
Where each agent's samples cluster = that agent's quality score.
Reports per-type average rank and quality ratio.

Supports both haiku (fast/cheap) and sonnet (quality) as comparator.

Usage: poc-memory agent evaluate --samples 5 --model haiku

Co-Authored-By: Kent Overstreet <kent.overstreet@linux.dev>
---
 poc-memory/agents/compare.agent |  45 +++++++++++++
 poc-memory/src/cli/agent.rs     | 113 ++++++++++++++++++++++++++++++++
 poc-memory/src/main.rs          |  11 ++++
 3 files changed, 169 insertions(+)
 create mode 100644 poc-memory/agents/compare.agent
diff --git a/poc-memory/agents/compare.agent b/poc-memory/agents/compare.agent
new file mode 100644
index 0000000..51927fe
--- /dev/null
+++ b/poc-memory/agents/compare.agent
@@ -0,0 +1,45 @@
+{"agent":"compare","query":"","model":"haiku","schedule":""}
+
+# Compare Agent — Pairwise Action Quality Comparison
+
+You compare two memory graph actions and decide which one was better.
+
+## Context
+
+You'll receive two actions (A and B), each with:
+- The agent type that produced it
+- What the action did (LINK, WRITE_NODE, REFINE, etc.)
+- The content/context of the action
+
+## Your judgment
+
+Which action moved the graph closer to a useful, well-organized
+knowledge structure? Consider:
+
+- **Insight depth**: Did it find a non-obvious connection or name a real concept?
+- **Precision**: Are the links between genuinely related nodes?
+- **Integration**: Does it reduce fragmentation, connect isolated clusters?
+- **Quality over quantity**: One perfect link beats five mediocre ones.
+- **Hub creation**: Naming unnamed concepts scores high.
+- **Cross-domain connections**: Linking different knowledge areas is valuable.
+
+## Output
+
+Reply with ONLY one line:
+
+```
+BETTER: A
+```
+or
+```
+BETTER: B
+```
+
+If truly equal:
+```
+BETTER: TIE
+```
+
+No explanation needed. Just the judgment.
+
+{{compare}}
diff --git a/poc-memory/src/cli/agent.rs b/poc-memory/src/cli/agent.rs
index 3b2b5a4..c81c5db 100644
--- a/poc-memory/src/cli/agent.rs
+++ b/poc-memory/src/cli/agent.rs
@@ -2,6 +2,7 @@
 
 use crate::store;
 use crate::store::StoreView;
+use crate::agents::llm;
 
 pub fn cmd_consolidate_batch(count: usize, auto: bool, agent: Option<String>) -> Result<(), String> {
     let store = store::Store::load()?;
@@ -140,3 +141,115 @@ pub fn cmd_fact_mine_store(path: &str) -> Result<(), String> {
     Ok(())
 }
 
+/// Rank recent agent consolidation reports by LLM-judged quality.
+///
+/// Pools the newest `samples_per_type` `_consolidate-<type>` nodes per agent
+/// type, sorts them with `llm_compare` (one `model` API call per comparison),
+/// and prints the ranking plus per-type average rank and quality ratio.
+/// Errors if the store cannot be loaded or fewer than two samples exist.
+pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str) -> Result<(), String> {
+    let store = store::Store::load()?;
+
+    // Collect consolidation reports grouped by agent type
+    let agent_types = ["linker", "organize", "replay", "connector",
+                       "separator", "transfer", "distill", "rename"];
+    let mut all_samples: Vec<(String, String, String)> = Vec::new(); // (agent_type, key, summary)
+
+    for agent_type in &agent_types {
+        let prefix = format!("_consolidate-{}", agent_type);
+        let mut reports: Vec<_> = store.nodes.iter()
+            .filter(|(k, _)| k.starts_with(&prefix))
+            .collect();
+        reports.sort_by(|a, b| b.1.timestamp.cmp(&a.1.timestamp)); // newest first
+        all_samples.extend(reports.into_iter().take(samples_per_type).map(|(key, node)| {
+            let summary = crate::util::truncate(&node.content, 500, "...");
+            (agent_type.to_string(), key.clone(), summary)
+        }));
+    }
+
+    if all_samples.len() < 2 {
+        return Err("Not enough samples to compare".into());
+    }
+
+    eprintln!("Collected {} samples from {} agent types", all_samples.len(), agent_types.len());
+    eprintln!("Sorting with {} pairwise comparisons (model={})...",
+        all_samples.len() * (all_samples.len() as f64).log2() as usize,
+        model);
+
+    // Sort with the LLM as comparator (one API call each); failed calls count as ties.
+    // NOTE(review): verdicts may not form a total order, which sort_by may panic on.
+    let (mut comparisons, mut failed) = (0usize, 0usize);
+    all_samples.sort_by(|a, b| {
+        comparisons += 1;
+        if comparisons % 10 == 0 {
+            eprint!("  {} comparisons...\r", comparisons);
+        }
+        llm_compare(a, b, model).unwrap_or_else(|_| {
+            failed += 1;
+            std::cmp::Ordering::Equal
+        })
+    });
+    eprintln!("  {} total comparisons ({} failed, treated as ties)", comparisons, failed);
+    let sorted = all_samples;
+
+    // Print ranked results
+    println!("\nAgent Action Ranking (best → worst):\n");
+    for (rank, (agent_type, key, summary)) in sorted.iter().enumerate() {
+        // Cut on char boundaries: byte-slicing at 80 can panic on multi-byte text.
+        let preview: String = summary.chars().take(80).collect();
+        println!("  {:3}. [{:10}] {} — {}", rank + 1, agent_type, key, preview);
+    }
+
+    // Compute per-type average rank
+    println!("\nPer-type average rank (lower = better):\n");
+    let n = sorted.len() as f64;
+    let mut type_ranks: std::collections::HashMap<&str, Vec<usize>> = std::collections::HashMap::new();
+    for (rank, (agent_type, _, _)) in sorted.iter().enumerate() {
+        type_ranks.entry(agent_type).or_default().push(rank + 1);
+    }
+    let mut avgs: Vec<(&str, f64, usize)> = type_ranks.iter()
+        .map(|(t, ranks)| {
+            (*t, ranks.iter().sum::<usize>() as f64 / ranks.len() as f64, ranks.len())
+        })
+        .collect();
+    avgs.sort_by(|a, b| a.1.total_cmp(&b.1));
+
+    for (agent_type, avg_rank, count) in &avgs {
+        let quality = 1.0 - (avg_rank / n);
+        println!("  {:12} avg_rank={:5.1}  quality={:.2}  (n={})",
+            agent_type, avg_rank, quality, count);
+    }
+
+    Ok(())
+}
+
+/// Ask the LLM which of two sampled actions was better and map the verdict
+/// to an ordering (`Less` = `a` ranks first). Replies naming neither or both
+/// options count as a tie; only a failed LLM call is returned as `Err`.
+fn llm_compare(
+    a: &(String, String, String),
+    b: &(String, String, String),
+    model: &str,
+) -> Result<std::cmp::Ordering, String> {
+    let prompt = format!(
+        "Compare these two memory graph agent actions. Which one was better \
+         for building a useful, well-organized knowledge graph?\n\n\
+         ## Action A ({} agent)\n{}\n\n\
+         ## Action B ({} agent)\n{}\n\n\
+         Reply with ONLY: BETTER: A  or  BETTER: B  or  BETTER: TIE",
+        a.0, a.2, b.0, b.2
+    );
+
+    let response = match model {
+        "haiku" => llm::call_haiku("compare", &prompt)?,
+        _ => llm::call_sonnet("compare", &prompt)?,
+    };
+    let verdict = response.trim().to_uppercase();
+    // An ambiguous reply must not silently favour A, so both-or-neither is a tie.
+    Ok(match (verdict.contains("BETTER: A"), verdict.contains("BETTER: B")) {
+        (true, false) => std::cmp::Ordering::Less, // A is better = A comes first
+        (false, true) => std::cmp::Ordering::Greater,
+        _ => std::cmp::Ordering::Equal,
+    })
+}
+
diff --git a/poc-memory/src/main.rs b/poc-memory/src/main.rs
index d0029e3..1b37e4f 100644
--- a/poc-memory/src/main.rs
+++ b/poc-memory/src/main.rs
@@ -565,6 +565,16 @@ enum AgentCmd {
         #[arg(long, default_value_t = 10)]
         count: usize,
     },
+    /// Evaluate agent quality by LLM-sorted ranking
+    #[command(name = "evaluate")]
+    Evaluate {
+        /// Samples per agent type
+        #[arg(long, default_value_t = 5)]
+        samples: usize,
+        /// Model to use for comparison (haiku or sonnet)
+        #[arg(long, default_value = "haiku")]
+        model: String,
+    },
 }
 
 #[derive(Subcommand)]
@@ -785,6 +795,7 @@ fn main() {
                 => cli::agent::cmd_fact_mine(&path, batch, dry_run, output.as_deref(), min_messages),
             AgentCmd::FactMineStore { path } => cli::agent::cmd_fact_mine_store(&path),
             AgentCmd::ReplayQueue { count } => cli::agent::cmd_replay_queue(count),
+            AgentCmd::Evaluate { samples, model } => cli::agent::cmd_evaluate_agents(samples, &model),
         },
 
         // Admin