evaluate: fix agent prompt path, dedup affected nodes, add --dry-run

- Use CARGO_MANIFEST_DIR for agent file path (same as defs.rs)
- Dedup affected nodes extracted from reports
- --dry-run shows an example comparison prompt without making LLM calls

Co-Authored-By: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
ProofOfConcept 2026-03-14 19:44:12 -04:00
parent 415180eeab
commit 0cecfdb352
2 changed files with 39 additions and 11 deletions

View file

@@ -144,7 +144,7 @@ pub fn cmd_fact_mine_store(path: &str) -> Result<(), String> {
/// Sample recent actions from each agent type, sort by quality using
/// LLM pairwise comparison, report per-type rankings.
pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str) -> Result<(), String> {
pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str, dry_run: bool) -> Result<(), String> {
let store = store::Store::load()?;
// Collect consolidation reports grouped by agent type
@@ -152,9 +152,10 @@ pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str) -> Result<(), S
"separator", "transfer", "distill", "rename"];
// Load agent prompt files for context
let prompts_dir = crate::config::get().data_dir
.parent().unwrap_or(std::path::Path::new("."))
.join("poc-memory/agents");
let prompts_dir = {
let repo = std::path::PathBuf::from(env!("CARGO_MANIFEST_DIR")).join("agents");
if repo.is_dir() { repo } else { crate::store::memory_dir().join("agents") }
};
let mut all_samples: Vec<(String, String, String)> = Vec::new(); // (agent_type, key, context)
@@ -181,9 +182,10 @@ pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str) -> Result<(), S
// Extract target node keys mentioned in the report and include their content
let mut target_content = String::new();
let mut seen_keys = std::collections::HashSet::new();
for word in report.split_whitespace() {
let clean = word.trim_matches(|c: char| !c.is_alphanumeric() && c != '-' && c != '_');
if clean.len() > 10 && store.nodes.contains_key(clean) {
if clean.len() > 10 && seen_keys.insert(clean.to_string()) && store.nodes.contains_key(clean) {
if let Some(node) = store.nodes.get(clean) {
let preview = crate::util::truncate(&node.content, 200, "...");
target_content.push_str(&format!("\n### {}\n{}\n", clean, preview));
@@ -212,6 +214,21 @@ pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str) -> Result<(), S
all_samples.len() * (all_samples.len() as f64).log2() as usize,
model);
if dry_run {
// Show what a comparison looks like without calling the LLM
if all_samples.len() >= 2 {
let a = &all_samples[0];
let b = &all_samples[all_samples.len() - 1];
let prompt = build_compare_prompt(a, b);
println!("=== DRY RUN: Example comparison prompt ===\n");
println!("{}", prompt);
println!("\n=== {} samples collected, would do ~{} comparisons ===",
all_samples.len(),
all_samples.len() * (all_samples.len() as f64).log2() as usize);
}
return Ok(());
}
// Sort with LLM comparator — yes, really. Rayon's parallel merge sort
// with an LLM as the comparison function. Multiple API calls in parallel.
let comparisons = AtomicUsize::new(0);
@@ -258,12 +275,11 @@ pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str) -> Result<(), S
Ok(())
}
fn llm_compare(
fn build_compare_prompt(
a: &(String, String, String),
b: &(String, String, String),
model: &str,
) -> Result<std::cmp::Ordering, String> {
let prompt = if a.0 == b.0 {
) -> String {
if a.0 == b.0 {
// Same agent type — show instructions once
// Split context at "## Report output" to extract shared prompt
let split_a: Vec<&str> = a.2.splitn(2, "## Report output").collect();
@@ -290,7 +306,15 @@ fn llm_compare(
BETTER: A or BETTER: B or BETTER: TIE",
a.0, a.2, b.0, b.2
)
};
}
}
fn llm_compare(
a: &(String, String, String),
b: &(String, String, String),
model: &str,
) -> Result<std::cmp::Ordering, String> {
let prompt = build_compare_prompt(a, b);
let response = if model == "haiku" {
llm::call_haiku("compare", &prompt)?