diff --git a/poc-memory/src/cli/agent.rs b/poc-memory/src/cli/agent.rs index 73fed11..7eb2a7c 100644 --- a/poc-memory/src/cli/agent.rs +++ b/poc-memory/src/cli/agent.rs @@ -144,7 +144,7 @@ pub fn cmd_fact_mine_store(path: &str) -> Result<(), String> { /// Sample recent actions from each agent type, sort by quality using /// LLM pairwise comparison, report per-type rankings. -pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str) -> Result<(), String> { +pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str, dry_run: bool) -> Result<(), String> { let store = store::Store::load()?; // Collect consolidation reports grouped by agent type @@ -152,9 +152,10 @@ pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str) -> Result<(), S "separator", "transfer", "distill", "rename"]; // Load agent prompt files for context - let prompts_dir = crate::config::get().data_dir - .parent().unwrap_or(std::path::Path::new(".")) - .join("poc-memory/agents"); + let prompts_dir = { + let repo = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("agents"); + if repo.is_dir() { repo } else { crate::store::memory_dir().join("agents") } + }; let mut all_samples: Vec<(String, String, String)> = Vec::new(); // (agent_type, key, context) @@ -181,9 +182,10 @@ pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str) -> Result<(), S // Extract target node keys mentioned in the report and include their content let mut target_content = String::new(); + let mut seen_keys = std::collections::HashSet::new(); for word in report.split_whitespace() { let clean = word.trim_matches(|c: char| !c.is_alphanumeric() && c != '-' && c != '_'); - if clean.len() > 10 && store.nodes.contains_key(clean) { + if clean.len() > 10 && seen_keys.insert(clean.to_string()) && store.nodes.contains_key(clean) { if let Some(node) = store.nodes.get(clean) { let preview = crate::util::truncate(&node.content, 200, "..."); target_content.push_str(&format!("\n### {}\n{}\n", clean, preview)); @@ -212,6 +214,21 @@ pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str) -> Result<(), S all_samples.len() * (all_samples.len() as f64).log2() as usize, model); + if dry_run { + // Show what a comparison looks like without calling the LLM + if all_samples.len() >= 2 { + let a = &all_samples[0]; + let b = &all_samples[all_samples.len() - 1]; + let prompt = build_compare_prompt(a, b); + println!("=== DRY RUN: Example comparison prompt ===\n"); + println!("{}", prompt); + println!("\n=== {} samples collected, would do ~{} comparisons ===", + all_samples.len(), + all_samples.len() * (all_samples.len() as f64).log2() as usize); + } + return Ok(()); + } + // Sort with LLM comparator — yes, really. Rayon's parallel merge sort // with an LLM as the comparison function. Multiple API calls in parallel. let comparisons = AtomicUsize::new(0); @@ -258,12 +275,11 @@ pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str) -> Result<(), S Ok(()) } -fn llm_compare( +fn build_compare_prompt( a: &(String, String, String), b: &(String, String, String), - model: &str, -) -> Result { - let prompt = if a.0 == b.0 { +) -> String { + if a.0 == b.0 { // Same agent type — show instructions once // Split context at "## Report output" to extract shared prompt let split_a: Vec<&str> = a.2.splitn(2, "## Report output").collect(); @@ -290,7 +306,15 @@ fn llm_compare( BETTER: A or BETTER: B or BETTER: TIE", a.0, a.2, b.0, b.2 ) - }; + } +} + +fn llm_compare( + a: &(String, String, String), + b: &(String, String, String), + model: &str, +) -> Result { + let prompt = build_compare_prompt(a, b); let response = if model == "haiku" { llm::call_haiku("compare", &prompt)? diff --git a/poc-memory/src/main.rs b/poc-memory/src/main.rs index 1b37e4f..c01cfad 100644 --- a/poc-memory/src/main.rs +++ b/poc-memory/src/main.rs @@ -574,6 +574,9 @@ enum AgentCmd { /// Model to use for comparison (haiku or sonnet) #[arg(long, default_value = "haiku")] model: String, + /// Show example comparison prompt without calling LLM + #[arg(long)] + dry_run: bool, }, } @@ -795,7 +798,8 @@ fn main() { => cli::agent::cmd_fact_mine(&path, batch, dry_run, output.as_deref(), min_messages), AgentCmd::FactMineStore { path } => cli::agent::cmd_fact_mine_store(&path), AgentCmd::ReplayQueue { count } => cli::agent::cmd_replay_queue(count), - AgentCmd::Evaluate { samples, model } => cli::agent::cmd_evaluate_agents(samples, &model), + AgentCmd::Evaluate { samples, model, dry_run } + => cli::agent::cmd_evaluate_agents(samples, &model, dry_run), }, // Admin