evaluate: include agent prompt + affected nodes in comparisons

Each comparison now shows the LLM:
- Agent instructions (the .agent prompt file)
- Report output (what the agent did)
- Affected nodes content (what it changed)

The comparator now sees intent, action, and impact, so it can judge
whether a deletion was correct, whether links are meaningful, and
whether WRITE_NODEs capture real insights.

Co-Authored-By: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
ProofOfConcept 2026-03-14 19:34:10 -04:00
parent 433d36aea8
commit b964335317

View file

@ -151,22 +151,55 @@ pub fn cmd_evaluate_agents(samples_per_type: usize, model: &str) -> Result<(), S
let agent_types = ["linker", "organize", "replay", "connector", let agent_types = ["linker", "organize", "replay", "connector",
"separator", "transfer", "distill", "rename"]; "separator", "transfer", "distill", "rename"];
let mut all_samples: Vec<(String, String, String)> = Vec::new(); // (agent_type, key, summary) // Load agent prompt files for context
let prompts_dir = crate::config::get().data_dir
.parent().unwrap_or(std::path::Path::new("."))
.join("poc-memory/agents");
let mut all_samples: Vec<(String, String, String)> = Vec::new(); // (agent_type, key, context)
for agent_type in &agent_types { for agent_type in &agent_types {
// Load the agent's prompt file (skip JSON header line)
let prompt_file = prompts_dir.join(format!("{}.agent", agent_type));
let agent_prompt = std::fs::read_to_string(&prompt_file)
.unwrap_or_default()
.lines().skip(1).collect::<Vec<_>>().join("\n");
let agent_prompt = crate::util::truncate(&agent_prompt, 500, "...");
let prefix = format!("_consolidate-{}", agent_type); let prefix = format!("_consolidate-{}", agent_type);
let mut keys: Vec<(String, i64)> = store.nodes.iter() let mut keys: Vec<(String, i64)> = store.nodes.iter()
.filter(|(k, _)| k.starts_with(&prefix)) .filter(|(k, _)| k.starts_with(&prefix))
.map(|(k, n)| (k.clone(), n.timestamp)) .map(|(k, n)| (k.clone(), n.timestamp))
.collect(); .collect();
keys.sort_by(|a, b| b.1.cmp(&a.1)); // newest first keys.sort_by(|a, b| b.1.cmp(&a.1));
keys.truncate(samples_per_type); keys.truncate(samples_per_type);
for (key, _) in &keys { for (key, _) in &keys {
let content = store.nodes.get(key) let report = store.nodes.get(key)
.map(|n| crate::util::truncate(&n.content, 500, "...")) .map(|n| n.content.clone())
.unwrap_or_default(); .unwrap_or_default();
all_samples.push((agent_type.to_string(), key.clone(), content));
// Extract target node keys mentioned in the report and include their content
let mut target_content = String::new();
for word in report.split_whitespace() {
let clean = word.trim_matches(|c: char| !c.is_alphanumeric() && c != '-' && c != '_');
if clean.len() > 10 && store.nodes.contains_key(clean) {
if let Some(node) = store.nodes.get(clean) {
let preview = crate::util::truncate(&node.content, 200, "...");
target_content.push_str(&format!("\n### {}\n{}\n", clean, preview));
if target_content.len() > 1500 { break; }
}
}
}
let context = format!(
"## Agent instructions\n{}\n\n## Report output\n{}\n\n## Affected nodes\n{}",
agent_prompt,
crate::util::truncate(&report, 1000, "..."),
if target_content.is_empty() { "(none found)".into() } else { target_content }
);
all_samples.push((agent_type.to_string(), key.clone(), context));
} }
} }