From e12dea503b0f17cfdc682ca04f89e5449e961834 Mon Sep 17 00:00:00 2001
From: ProofOfConcept
Date: Sat, 14 Mar 2026 19:24:07 -0400
Subject: [PATCH] agent evaluate: rank agent actions by quality with an LLM sort

Yes, really: a sort whose comparison function is an LLM. Each
comparison is an API call asking "which action was better?"

An LLM is not a total order (its answers can be intransitive or differ
between calls) and slice::sort_by may panic on such a comparator, so
the ranking uses a small binary insertion sort that tolerates
inconsistent answers. Failed comparisons are counted and reported
instead of being silently treated as ties.

Sample N actions per agent type, throw them all in a Vec, sort.
Where each agent's samples cluster = that agent's quality score.

Reports per-type average rank and quality ratio. Supports both haiku
(fast/cheap) and sonnet (quality) as comparator.

Usage: poc-memory agent evaluate --samples 5 --model haiku

Co-Authored-By: Kent Overstreet
---
 poc-memory/agents/compare.agent |  45 ++++++++++
 poc-memory/src/cli/agent.rs     | 185 ++++++++++++++++++++++++++++++++++++++++
 poc-memory/src/main.rs          |  11 +++
 3 files changed, 241 insertions(+)
 create mode 100644 poc-memory/agents/compare.agent

diff --git a/poc-memory/agents/compare.agent b/poc-memory/agents/compare.agent
new file mode 100644
index 0000000..51927fe
--- /dev/null
+++ b/poc-memory/agents/compare.agent
@@ -0,0 +1,45 @@
+{"agent":"compare","query":"","model":"haiku","schedule":""}
+
+# Compare Agent — Pairwise Action Quality Comparison
+
+You compare two memory graph actions and decide which one was better.
+
+## Context
+
+You'll receive two actions (A and B), each with:
+- The agent type that produced it
+- What the action did (LINK, WRITE_NODE, REFINE, etc.)
+- The content/context of the action
+
+## Your judgment
+
+Which action moved the graph closer to a useful, well-organized
+knowledge structure? Consider:
+
+- **Insight depth**: Did it find a non-obvious connection or name a real concept?
+- **Precision**: Are the links between genuinely related nodes?
+- **Integration**: Does it reduce fragmentation, connect isolated clusters?
+- **Quality over quantity**: One perfect link beats five mediocre ones.
+- **Hub creation**: Naming unnamed concepts scores high.
+- **Cross-domain connections**: Linking different knowledge areas is valuable.
+
+## Output
+
+Reply with ONLY one line:
+
+```
+BETTER: A
+```
+or
+```
+BETTER: B
+```
+
+If truly equal:
+```
+BETTER: TIE
+```
+
+No explanation needed. Just the judgment.
+
+{{compare}}
diff --git a/poc-memory/src/cli/agent.rs b/poc-memory/src/cli/agent.rs
index 3b2b5a4..c81c5db 100644
--- a/poc-memory/src/cli/agent.rs
+++ b/poc-memory/src/cli/agent.rs
@@ -2,6 +2,7 @@
 
 use crate::store;
 use crate::store::StoreView;
+use crate::agents::llm;
 
 pub fn cmd_consolidate_batch(count: usize, auto: bool, agent: Option<String>) -> Result<(), String> {
     let store = store::Store::load()?;
@@ -140,3 +141,187 @@ pub fn cmd_fact_mine_store(path: &str) -> Result<(), String> {
     Ok(())
 }
 
+/// Sample recent actions from each agent type, rank them by quality using
+/// LLM pairwise comparison, and report per-type rankings.
+///
+/// `samples_per_type` caps how many of the newest `_consolidate-<type>`
+/// nodes are taken per agent type. `model` selects the comparator:
+/// "haiku", or anything else for sonnet.
+///
+/// Returns `Err` if the store cannot be loaded, if fewer than two samples
+/// were found, or if every comparison call failed.
+pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str) -> Result<(), String> {
+    let store = store::Store::load()?;
+
+    if model != "haiku" && model != "sonnet" {
+        // llm_compare sends everything that is not "haiku" to sonnet.
+        eprintln!("warning: unknown model '{}', using sonnet", model);
+    }
+
+    // Collect consolidation reports grouped by agent type
+    let agent_types = ["linker", "organize", "replay", "connector",
+                       "separator", "transfer", "distill", "rename"];
+
+    let mut all_samples: Vec<(String, String, String)> = Vec::new(); // (agent_type, key, summary)
+
+    for agent_type in &agent_types {
+        let prefix = format!("_consolidate-{}", agent_type);
+        let mut keys: Vec<(String, i64)> = store.nodes.iter()
+            .filter(|(k, _)| k.starts_with(&prefix))
+            .map(|(k, n)| (k.clone(), n.timestamp))
+            .collect();
+        keys.sort_by(|a, b| b.1.cmp(&a.1)); // newest first
+        keys.truncate(samples_per_type);
+
+        for (key, _) in &keys {
+            let content = store.nodes.get(key)
+                .map(|n| crate::util::truncate(&n.content, 500, "..."))
+                .unwrap_or_default();
+            all_samples.push((agent_type.to_string(), key.clone(), content));
+        }
+    }
+
+    if all_samples.len() < 2 {
+        return Err("Not enough samples to compare".into());
+    }
+
+    eprintln!("Collected {} samples from {} agent types",
+              all_samples.len(), agent_types.len());
+    eprintln!("Sorting with {} pairwise comparisons (model={})...",
+              all_samples.len() * (all_samples.len() as f64).log2() as usize,
+              model);
+
+    // Sort with an LLM as the comparison function; each comparison is an
+    // API call. An LLM is not a total order (answers can be intransitive
+    // or differ between calls) and slice::sort_by may panic on such a
+    // comparator, so use a sort that tolerates inconsistent answers.
+    let mut comparisons = 0usize;
+    let mut failures = 0usize;
+    let sorted = binary_insertion_sort_by(all_samples, |a, b| {
+        comparisons += 1;
+        if comparisons % 10 == 0 {
+            eprint!("  {} comparisons...\r", comparisons);
+        }
+        llm_compare(a, b, model).unwrap_or_else(|e| {
+            failures += 1;
+            if failures == 1 {
+                eprintln!("  warning: comparison failed, counting as tie: {}", e);
+            }
+            std::cmp::Ordering::Equal
+        })
+    });
+    eprintln!("  {} total comparisons", comparisons);
+
+    if failures == comparisons {
+        return Err(format!("all {} comparisons failed", comparisons));
+    }
+    if failures > 0 {
+        eprintln!("  warning: {} of {} comparisons failed and were counted as ties",
+                  failures, comparisons);
+    }
+
+    // Print ranked results
+    println!("\nAgent Action Ranking (best → worst):\n");
+    for (rank, (agent_type, key, summary)) in sorted.iter().enumerate() {
+        println!("  {:3}. [{:10}] {} — {}",
+                 rank + 1, agent_type, key, preview(summary, 80));
+    }
+
+    // Compute per-type average rank
+    println!("\nPer-type average rank (lower = better):\n");
+    let n = sorted.len() as f64;
+    let mut type_ranks: std::collections::HashMap<&str, Vec<usize>> =
+        std::collections::HashMap::new();
+    for (rank, (agent_type, _, _)) in sorted.iter().enumerate() {
+        type_ranks.entry(agent_type.as_str()).or_default().push(rank + 1);
+    }
+    let mut avgs: Vec<(&str, f64, usize)> = type_ranks.iter()
+        .map(|(t, ranks)| {
+            let avg = ranks.iter().sum::<usize>() as f64 / ranks.len() as f64;
+            (*t, avg, ranks.len())
+        })
+        .collect();
+    // Name as tie-break: HashMap iteration order is unspecified.
+    avgs.sort_by(|a, b| a.1.total_cmp(&b.1).then_with(|| a.0.cmp(b.0)));
+
+    for (agent_type, avg_rank, count) in &avgs {
+        let quality = 1.0 - (avg_rank / n);
+        println!("  {:12} avg_rank={:5.1} quality={:.2} (n={})",
+                 agent_type, avg_rank, quality, count);
+    }
+
+    Ok(())
+}
+
+/// Return at most the first `max_chars` characters of `s`, always
+/// cutting on a char boundary (byte slicing can panic on multi-byte UTF-8).
+fn preview(s: &str, max_chars: usize) -> &str {
+    match s.char_indices().nth(max_chars) {
+        Some((end, _)) => &s[..end],
+        None => s,
+    }
+}
+
+/// Sort `items` best-first with a comparator that need not be a total order.
+///
+/// Each item is placed by binary search into the already-ranked prefix,
+/// with `compare(new, ranked)` returning `Less` when `new` ranks first.
+/// Ties rank after the item they tied with. Inconsistent answers can only
+/// misplace an item; the search always terminates and never panics.
+fn binary_insertion_sort_by<T, F>(items: Vec<T>, mut compare: F) -> Vec<T>
+where
+    F: FnMut(&T, &T) -> std::cmp::Ordering,
+{
+    let mut ranked: Vec<T> = Vec::with_capacity(items.len());
+    for item in items {
+        let (mut lo, mut hi) = (0, ranked.len());
+        while lo < hi {
+            let mid = lo + (hi - lo) / 2;
+            if compare(&item, &ranked[mid]) == std::cmp::Ordering::Less {
+                hi = mid;
+            } else {
+                lo = mid + 1;
+            }
+        }
+        ranked.insert(lo, item);
+    }
+    ranked
+}
+
+/// Ask the LLM which of two samples `(agent_type, key, summary)` was the
+/// better action.
+///
+/// Returns `Less` if A is judged better (A sorts first), `Greater` if B
+/// is, and `Equal` for a tie or an unrecognized reply. `model` "haiku"
+/// uses haiku; any other value uses sonnet. Errors from the LLM call are
+/// propagated.
+fn llm_compare(
+    a: &(String, String, String),
+    b: &(String, String, String),
+    model: &str,
+) -> Result<std::cmp::Ordering, String> {
+    let prompt = format!(
+        "Compare these two memory graph agent actions. Which one was better \
+         for building a useful, well-organized knowledge graph?\n\n\
+         ## Action A ({} agent)\n{}\n\n\
+         ## Action B ({} agent)\n{}\n\n\
+         Reply with ONLY: BETTER: A or BETTER: B or BETTER: TIE",
+        a.0, a.2, b.0, b.2
+    );
+
+    let response = if model == "haiku" {
+        llm::call_haiku("compare", &prompt)?
+    } else {
+        llm::call_sonnet("compare", &prompt)?
+    };
+    let response = response.trim().to_uppercase();
+
+    if response.contains("BETTER: A") {
+        Ok(std::cmp::Ordering::Less) // A is better = A comes first
+    } else if response.contains("BETTER: B") {
+        Ok(std::cmp::Ordering::Greater)
+    } else {
+        Ok(std::cmp::Ordering::Equal)
+    }
+}
+
diff --git a/poc-memory/src/main.rs b/poc-memory/src/main.rs
index d0029e3..1b37e4f 100644
--- a/poc-memory/src/main.rs
+++ b/poc-memory/src/main.rs
@@ -565,6 +565,16 @@ enum AgentCmd {
         #[arg(long, default_value_t = 10)]
         count: usize,
     },
+    /// Evaluate agent quality by LLM-sorted ranking
+    #[command(name = "evaluate")]
+    Evaluate {
+        /// Samples per agent type
+        #[arg(long, default_value_t = 5)]
+        samples: usize,
+        /// Model to use for comparison (haiku or sonnet)
+        #[arg(long, default_value = "haiku")]
+        model: String,
+    },
 }
 
 #[derive(Subcommand)]
@@ -785,6 +795,7 @@ fn main() {
                 => cli::agent::cmd_fact_mine(&path, batch, dry_run, output.as_deref(), min_messages),
             AgentCmd::FactMineStore { path } => cli::agent::cmd_fact_mine_store(&path),
             AgentCmd::ReplayQueue { count } => cli::agent::cmd_replay_queue(count),
+            AgentCmd::Evaluate { samples, model } => cli::agent::cmd_evaluate_agents(samples, &model),
         },
 
         // Admin