evaluate: fix agent prompt path, dedup affected nodes, add --dry-run

- Use CARGO_MANIFEST_DIR for agent file path (same as defs.rs)
- Dedup affected nodes extracted from reports
- --dry-run shows example comparison prompt without LLM calls

Co-Authored-By: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
ProofOfConcept 2026-03-14 19:44:12 -04:00
parent 415180eeab
commit 0cecfdb352
2 changed files with 39 additions and 11 deletions

View file

@@ -144,7 +144,7 @@ pub fn cmd_fact_mine_store(path: &str) -> Result<(), String> {
/// Sample recent actions from each agent type, sort by quality using /// Sample recent actions from each agent type, sort by quality using
/// LLM pairwise comparison, report per-type rankings. /// LLM pairwise comparison, report per-type rankings.
pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str) -> Result<(), String> { pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str, dry_run: bool) -> Result<(), String> {
let store = store::Store::load()?; let store = store::Store::load()?;
// Collect consolidation reports grouped by agent type // Collect consolidation reports grouped by agent type
@@ -152,9 +152,10 @@ pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str) -> Result<(), S
"separator", "transfer", "distill", "rename"]; "separator", "transfer", "distill", "rename"];
// Load agent prompt files for context // Load agent prompt files for context
let prompts_dir = crate::config::get().data_dir let prompts_dir = {
.parent().unwrap_or(std::path::Path::new(".")) let repo = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("agents");
.join("poc-memory/agents"); if repo.is_dir() { repo } else { crate::store::memory_dir().join("agents") }
};
let mut all_samples: Vec<(String, String, String)> = Vec::new(); // (agent_type, key, context) let mut all_samples: Vec<(String, String, String)> = Vec::new(); // (agent_type, key, context)
@@ -181,9 +182,10 @@ pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str) -> Result<(), S
// Extract target node keys mentioned in the report and include their content // Extract target node keys mentioned in the report and include their content
let mut target_content = String::new(); let mut target_content = String::new();
let mut seen_keys = std::collections::HashSet::new();
for word in report.split_whitespace() { for word in report.split_whitespace() {
let clean = word.trim_matches(|c: char| !c.is_alphanumeric() && c != '-' && c != '_'); let clean = word.trim_matches(|c: char| !c.is_alphanumeric() && c != '-' && c != '_');
if clean.len() > 10 && store.nodes.contains_key(clean) { if clean.len() > 10 && seen_keys.insert(clean.to_string()) && store.nodes.contains_key(clean) {
if let Some(node) = store.nodes.get(clean) { if let Some(node) = store.nodes.get(clean) {
let preview = crate::util::truncate(&node.content, 200, "..."); let preview = crate::util::truncate(&node.content, 200, "...");
target_content.push_str(&format!("\n### {}\n{}\n", clean, preview)); target_content.push_str(&format!("\n### {}\n{}\n", clean, preview));
@@ -212,6 +214,21 @@ pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str) -> Result<(), S
all_samples.len() * (all_samples.len() as f64).log2() as usize, all_samples.len() * (all_samples.len() as f64).log2() as usize,
model); model);
if dry_run {
// Show what a comparison looks like without calling the LLM
if all_samples.len() >= 2 {
let a = &all_samples[0];
let b = &all_samples[all_samples.len() - 1];
let prompt = build_compare_prompt(a, b);
println!("=== DRY RUN: Example comparison prompt ===\n");
println!("{}", prompt);
println!("\n=== {} samples collected, would do ~{} comparisons ===",
all_samples.len(),
all_samples.len() * (all_samples.len() as f64).log2() as usize);
}
return Ok(());
}
// Sort with LLM comparator — yes, really. Rayon's parallel merge sort // Sort with LLM comparator — yes, really. Rayon's parallel merge sort
// with an LLM as the comparison function. Multiple API calls in parallel. // with an LLM as the comparison function. Multiple API calls in parallel.
let comparisons = AtomicUsize::new(0); let comparisons = AtomicUsize::new(0);
@@ -258,12 +275,11 @@ pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str) -> Result<(), S
Ok(()) Ok(())
} }
fn llm_compare( fn build_compare_prompt(
a: &(String, String, String), a: &(String, String, String),
b: &(String, String, String), b: &(String, String, String),
model: &str, ) -> String {
) -> Result<std::cmp::Ordering, String> { if a.0 == b.0 {
let prompt = if a.0 == b.0 {
// Same agent type — show instructions once // Same agent type — show instructions once
// Split context at "## Report output" to extract shared prompt // Split context at "## Report output" to extract shared prompt
let split_a: Vec<&str> = a.2.splitn(2, "## Report output").collect(); let split_a: Vec<&str> = a.2.splitn(2, "## Report output").collect();
@@ -290,7 +306,15 @@ fn llm_compare(
BETTER: A or BETTER: B or BETTER: TIE", BETTER: A or BETTER: B or BETTER: TIE",
a.0, a.2, b.0, b.2 a.0, a.2, b.0, b.2
) )
}; }
}
fn llm_compare(
a: &(String, String, String),
b: &(String, String, String),
model: &str,
) -> Result<std::cmp::Ordering, String> {
let prompt = build_compare_prompt(a, b);
let response = if model == "haiku" { let response = if model == "haiku" {
llm::call_haiku("compare", &prompt)? llm::call_haiku("compare", &prompt)?

View file

@@ -574,6 +574,9 @@ enum AgentCmd {
/// Model to use for comparison (haiku or sonnet) /// Model to use for comparison (haiku or sonnet)
#[arg(long, default_value = "haiku")] #[arg(long, default_value = "haiku")]
model: String, model: String,
/// Show example comparison prompt without calling LLM
#[arg(long)]
dry_run: bool,
}, },
} }
@@ -795,7 +798,8 @@ fn main() {
=> cli::agent::cmd_fact_mine(&path, batch, dry_run, output.as_deref(), min_messages), => cli::agent::cmd_fact_mine(&path, batch, dry_run, output.as_deref(), min_messages),
AgentCmd::FactMineStore { path } => cli::agent::cmd_fact_mine_store(&path), AgentCmd::FactMineStore { path } => cli::agent::cmd_fact_mine_store(&path),
AgentCmd::ReplayQueue { count } => cli::agent::cmd_replay_queue(count), AgentCmd::ReplayQueue { count } => cli::agent::cmd_replay_queue(count),
AgentCmd::Evaluate { samples, model } => cli::agent::cmd_evaluate_agents(samples, &model), AgentCmd::Evaluate { samples, model, dry_run }
=> cli::agent::cmd_evaluate_agents(samples, &model, dry_run),
}, },
// Admin // Admin