agent evaluate: sort agent actions by quality using Vec::sort_by with LLM
Yes, really. Rust's stdlib sort_by with an LLM pairwise comparator. Each comparison is an API call asking "which action was better?"

Sample N actions per agent type, throw them all in a Vec, sort. Where each agent's samples cluster = that agent's quality score. Reports per-type average rank and quality ratio.

Supports both haiku (fast/cheap) and sonnet (quality) as comparator.

Usage: poc-memory agent evaluate --samples 5 --model haiku

Co-Authored-By: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
parent
dce938e906
commit
e12dea503b
3 changed files with 169 additions and 0 deletions
|
|
@ -2,6 +2,7 @@
|
|||
|
||||
use crate::store;
|
||||
use crate::store::StoreView;
|
||||
use crate::agents::llm;
|
||||
|
||||
pub fn cmd_consolidate_batch(count: usize, auto: bool, agent: Option<String>) -> Result<(), String> {
|
||||
let store = store::Store::load()?;
|
||||
|
|
@ -140,3 +141,115 @@ pub fn cmd_fact_mine_store(path: &str) -> Result<(), String> {
|
|||
Ok(())
|
||||
}
|
||||
|
||||
/// Sample recent actions from each agent type, sort by quality using
|
||||
/// LLM pairwise comparison, report per-type rankings.
|
||||
pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str) -> Result<(), String> {
|
||||
let store = store::Store::load()?;
|
||||
|
||||
// Collect consolidation reports grouped by agent type
|
||||
let agent_types = ["linker", "organize", "replay", "connector",
|
||||
"separator", "transfer", "distill", "rename"];
|
||||
|
||||
let mut all_samples: Vec<(String, String, String)> = Vec::new(); // (agent_type, key, summary)
|
||||
|
||||
for agent_type in &agent_types {
|
||||
let prefix = format!("_consolidate-{}", agent_type);
|
||||
let mut keys: Vec<(String, i64)> = store.nodes.iter()
|
||||
.filter(|(k, _)| k.starts_with(&prefix))
|
||||
.map(|(k, n)| (k.clone(), n.timestamp))
|
||||
.collect();
|
||||
keys.sort_by(|a, b| b.1.cmp(&a.1)); // newest first
|
||||
keys.truncate(samples_per_type);
|
||||
|
||||
for (key, _) in &keys {
|
||||
let content = store.nodes.get(key)
|
||||
.map(|n| crate::util::truncate(&n.content, 500, "..."))
|
||||
.unwrap_or_default();
|
||||
all_samples.push((agent_type.to_string(), key.clone(), content));
|
||||
}
|
||||
}
|
||||
|
||||
if all_samples.len() < 2 {
|
||||
return Err("Not enough samples to compare".into());
|
||||
}
|
||||
|
||||
eprintln!("Collected {} samples from {} agent types", all_samples.len(), agent_types.len());
|
||||
eprintln!("Sorting with {} pairwise comparisons (model={})...",
|
||||
all_samples.len() * (all_samples.len() as f64).log2() as usize,
|
||||
model);
|
||||
|
||||
// Sort with LLM comparator — yes, really. Rust's sort_by with an
|
||||
// LLM as the comparison function. Each comparison is an API call.
|
||||
let mut comparisons = 0usize;
|
||||
all_samples.sort_by(|a, b| {
|
||||
comparisons += 1;
|
||||
if comparisons % 10 == 0 {
|
||||
eprint!(" {} comparisons...\r", comparisons);
|
||||
}
|
||||
llm_compare(a, b, model).unwrap_or(std::cmp::Ordering::Equal)
|
||||
});
|
||||
eprintln!(" {} total comparisons", comparisons);
|
||||
|
||||
let sorted = all_samples;
|
||||
|
||||
// Print ranked results
|
||||
println!("\nAgent Action Ranking (best → worst):\n");
|
||||
for (rank, (agent_type, key, summary)) in sorted.iter().enumerate() {
|
||||
let preview = if summary.len() > 80 { &summary[..80] } else { summary };
|
||||
println!(" {:3}. [{:10}] {} — {}", rank + 1, agent_type, key, preview);
|
||||
}
|
||||
|
||||
// Compute per-type average rank
|
||||
println!("\nPer-type average rank (lower = better):\n");
|
||||
let n = sorted.len() as f64;
|
||||
let mut type_ranks: std::collections::HashMap<&str, Vec<usize>> = std::collections::HashMap::new();
|
||||
for (rank, (agent_type, _, _)) in sorted.iter().enumerate() {
|
||||
type_ranks.entry(agent_type).or_default().push(rank + 1);
|
||||
}
|
||||
let mut avgs: Vec<(&str, f64, usize)> = type_ranks.iter()
|
||||
.map(|(t, ranks)| {
|
||||
let avg = ranks.iter().sum::<usize>() as f64 / ranks.len() as f64;
|
||||
(*t, avg, ranks.len())
|
||||
})
|
||||
.collect();
|
||||
avgs.sort_by(|a, b| a.1.total_cmp(&b.1));
|
||||
|
||||
for (agent_type, avg_rank, count) in &avgs {
|
||||
let quality = 1.0 - (avg_rank / n);
|
||||
println!(" {:12} avg_rank={:5.1} quality={:.2} (n={})",
|
||||
agent_type, avg_rank, quality, count);
|
||||
}
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
||||
fn llm_compare(
|
||||
a: &(String, String, String),
|
||||
b: &(String, String, String),
|
||||
model: &str,
|
||||
) -> Result<std::cmp::Ordering, String> {
|
||||
let prompt = format!(
|
||||
"Compare these two memory graph agent actions. Which one was better \
|
||||
for building a useful, well-organized knowledge graph?\n\n\
|
||||
## Action A ({} agent)\n{}\n\n\
|
||||
## Action B ({} agent)\n{}\n\n\
|
||||
Reply with ONLY: BETTER: A or BETTER: B or BETTER: TIE",
|
||||
a.0, a.2, b.0, b.2
|
||||
);
|
||||
|
||||
let response = if model == "haiku" {
|
||||
llm::call_haiku("compare", &prompt)?
|
||||
} else {
|
||||
llm::call_sonnet("compare", &prompt)?
|
||||
};
|
||||
let response = response.trim().to_uppercase();
|
||||
|
||||
if response.contains("BETTER: A") {
|
||||
Ok(std::cmp::Ordering::Less) // A is better = A comes first
|
||||
} else if response.contains("BETTER: B") {
|
||||
Ok(std::cmp::Ordering::Greater)
|
||||
} else {
|
||||
Ok(std::cmp::Ordering::Equal)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue