From 76b8e69749a21fbc9501fe645ebbbfecd557260b Mon Sep 17 00:00:00 2001 From: ProofOfConcept Date: Fri, 13 Mar 2026 18:49:49 -0400 Subject: [PATCH] organize: topic cluster diagnostic + agent with tool access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `poc-memory graph organize TERM` diagnostic that finds nodes matching a search term, computes pairwise cosine similarity, reports connectivity gaps, and optionally creates anchor nodes. Add organize.agent definition that uses Bash(poc-memory:*) tool access to explore clusters autonomously — query selects highest-degree unvisited nodes, agent drives its own iteration via poc-memory CLI. Add {{organize}} placeholder in defs.rs for inline cluster resolution. Add `tools` field to AgentDef/AgentHeader so agents can declare allowed tool patterns (passed as --allowedTools to claude CLI). --- poc-memory/agents/organize.agent | 104 +++++++++++++++++++++++ poc-memory/src/agents/defs.rs | 74 +++++++++++++++++ poc-memory/src/main.rs | 138 +++++++++++++++++++++++++++++++ 3 files changed, 316 insertions(+) create mode 100644 poc-memory/agents/organize.agent diff --git a/poc-memory/agents/organize.agent b/poc-memory/agents/organize.agent new file mode 100644 index 0000000..94f6ffb --- /dev/null +++ b/poc-memory/agents/organize.agent @@ -0,0 +1,104 @@ +{"agent":"organize","query":"all | not-visited:organize,0 | sort:degree | limit:5","model":"sonnet","schedule":"weekly","tools":["Bash(poc-memory:*)"]} + +# Organize Agent — Topic Cluster Deduplication + +You are a memory organization agent. Your job is to find clusters of +nodes about the same topic and make them clean, distinct, and findable. + +## How to work + +You receive a list of high-degree nodes that haven't been organized yet. 
+For each one, use its key as a search term to find related clusters: + +```bash +poc-memory graph organize TERM --key-only +``` + +This shows all nodes whose keys match the term, their pairwise cosine +similarity scores, and connectivity analysis. + +To read a specific node's full content: +```bash +poc-memory render KEY +``` + +## What to decide + +For each high-similarity pair, determine: + +1. **Genuine duplicate**: same content, one is a subset of the other. + → MERGE: refine the larger node to include any unique content from the + smaller, then delete the smaller. + +2. **Partial overlap**: shared vocabulary but each has unique substance. + → DIFFERENTIATE: rewrite both to sharpen their distinct purposes. + Ensure they're cross-linked. + +3. **Complementary**: different angles on the same topic, high similarity + only because they share domain vocabulary. + → KEEP BOTH: ensure cross-linked, verify each has a clear one-sentence + purpose that doesn't overlap. + +## How to tell the difference + +- Read BOTH nodes fully before deciding. Cosine similarity is a blunt + instrument — two nodes about sheaves in different contexts (parsing vs + memory architecture) will score high despite being genuinely distinct. +- If you can describe what each node is about in one sentence, and the + sentences are different, they're complementary — keep both. +- If one node's content is a strict subset of the other, it's a duplicate. +- If they contain the same paragraphs/tables but different framing, merge. 
+
+## What to output
+
+For **merges** (genuine duplicates):
+```
+REFINE surviving_key
+[merged content — all unique material from both nodes]
+END_REFINE
+
+DELETE smaller_key
+```
+
+For **differentiation** (overlap that should be sharpened):
+```
+REFINE key1
+[rewritten to focus on its distinct purpose]
+END_REFINE
+
+REFINE key2
+[rewritten to focus on its distinct purpose]
+END_REFINE
+```
+
+For **missing links** (from connectivity report):
+```
+LINK source_key target_key
+```
+
+For **anchor creation** (improve findability):
+```
+WRITE_NODE anchor_key
+Anchor node for 'term' search term
+END_WRITE
+LINK anchor_key target1
+LINK anchor_key target2
+```
+
+## Guidelines
+
+- **One concept, one node.** If two nodes have the same one-sentence
+  description, merge them.
+- **Multiple entry points, one destination.** Use anchor nodes for
+  findability, never duplicate content.
+- **Cross-link aggressively, duplicate never.**
+- **Name nodes for findability.** Short, natural search terms.
+- **Read before you decide.** Cosine similarity alone is not enough.
+- **Work through clusters systematically.** Use the tool to explore,
+  don't guess at what nodes contain.
+
+{{topology}}
+
+## Starting nodes (highest-degree, not yet organized)
+
+{{nodes}}
diff --git a/poc-memory/src/agents/defs.rs b/poc-memory/src/agents/defs.rs
index eb26109..d6db80d 100644
--- a/poc-memory/src/agents/defs.rs
+++ b/poc-memory/src/agents/defs.rs
@@ -32,6 +32,7 @@ pub struct AgentDef {
     pub prompt: String,
     pub model: String,
     pub schedule: String,
+    pub tools: Vec<String>,
 }
 
 /// The JSON header portion (first line of the file).
@@ -44,6 +45,8 @@ struct AgentHeader {
     model: String,
     #[serde(default)]
     schedule: String,
+    #[serde(default)]
+    tools: Vec<String>,
 }
 
 fn default_model() -> String { "sonnet".into() }
@@ -60,6 +63,7 @@ fn parse_agent_file(content: &str) -> Option<AgentDef> {
         prompt: prompt.to_string(),
         model: header.model,
         schedule: header.schedule,
+        tools: header.tools,
     })
 }
 
@@ -160,6 +164,76 @@ fn resolve(
            })
        }
 
+        "organize" => {
+            // Run cluster diagnostic for the query term
+            // The query field of the agent def holds the search term
+            let term = if keys.is_empty() { "" } else { &keys[0] };
+            if term.is_empty() {
+                return Some(Resolved { text: "(no term provided)".into(), keys: vec![] });
+            }
+            let term_lower = term.to_lowercase();
+            let skip_prefixes = ["journal#", "daily-", "weekly-", "monthly-", "_",
+                "deep-index#", "facts-", "irc-history#"];
+
+            let mut cluster: Vec<(String, String)> = Vec::new();
+            for (key, node) in &store.nodes {
+                if node.deleted { continue; }
+                if !key.to_lowercase().contains(&term_lower) { continue; }
+                if skip_prefixes.iter().any(|p| key.starts_with(p)) { continue; }
+                cluster.push((key.clone(), node.content.clone()));
+            }
+            cluster.sort_by(|a, b| a.0.cmp(&b.0));
+
+            // Similarity pairs
+            let pairs = crate::similarity::pairwise_similar(&cluster, 0.4);
+
+            let mut text = format!("### Cluster: '{}' ({} nodes)\n\n", term, cluster.len());
+
+            // Similarity report
+            if !pairs.is_empty() {
+                text.push_str("#### Similarity scores\n\n");
+                for (a, b, sim) in &pairs {
+                    text.push_str(&format!("  [{:.3}] {} ↔ {}\n", sim, a, b));
+                }
+                text.push('\n');
+            }
+
+            // Connectivity
+            let cluster_keys: std::collections::HashSet<&str> = cluster.iter()
+                .map(|(k,_)| k.as_str()).collect();
+            let mut best_hub: Option<(&str, usize)> = None;
+            for key in &cluster_keys {
+                let intra = graph.neighbor_keys(key).iter()
+                    .filter(|n| cluster_keys.contains(*n))
+                    .count();
+                if best_hub.is_none() || intra > best_hub.unwrap().1 {
+                    best_hub = Some((key, intra));
+                }
+            }
+            if let Some((hub, deg)) = 
best_hub { + text.push_str(&format!("#### Hub: {} (intra-cluster degree {})\n\n", hub, deg)); + let hub_nbrs = graph.neighbor_keys(hub); + for key in &cluster_keys { + if *key == hub { continue; } + if !hub_nbrs.contains(*key) { + text.push_str(&format!(" NOT linked to hub: {}\n", key)); + } + } + text.push('\n'); + } + + // Full node contents + text.push_str("#### Node contents\n\n"); + let mut result_keys = Vec::new(); + for (key, content) in &cluster { + let words = content.split_whitespace().count(); + text.push_str(&format!("##### {} ({} words)\n\n{}\n\n---\n\n", key, words, content)); + result_keys.push(key.clone()); + } + + Some(Resolved { text, keys: result_keys }) + } + "conversations" => { let fragments = super::knowledge::select_conversation_fragments(count); let text = fragments.iter() diff --git a/poc-memory/src/main.rs b/poc-memory/src/main.rs index afb0882..a35b2ca 100644 --- a/poc-memory/src/main.rs +++ b/poc-memory/src/main.rs @@ -392,6 +392,20 @@ enum GraphCmd { #[arg(default_value_t = 20)] n: usize, }, + /// Diagnose duplicate/overlapping nodes for a topic cluster + Organize { + /// Search term (matches node keys; also content unless --key-only) + term: String, + /// Similarity threshold for pair reporting (default: 0.4) + #[arg(long, default_value_t = 0.4)] + threshold: f32, + /// Only match node keys, not content + #[arg(long)] + key_only: bool, + /// Create anchor node for the search term and link to cluster + #[arg(long)] + anchor: bool, + }, } #[derive(Subcommand)] @@ -640,6 +654,8 @@ fn main() { => cmd_spectral_neighbors(&key, n), GraphCmd::SpectralPositions { n } => cmd_spectral_positions(n), GraphCmd::SpectralSuggest { n } => cmd_spectral_suggest(n), + GraphCmd::Organize { term, threshold, key_only, anchor } + => cmd_organize(&term, threshold, key_only, anchor), }, // Agent @@ -2485,6 +2501,128 @@ fn extract_title(content: &str) -> String { String::from("(untitled)") } +fn cmd_organize(term: &str, threshold: f32, key_only: bool, 
create_anchor: bool) -> Result<(), String> { + let mut store = store::Store::load()?; + + // Step 1: find all non-deleted nodes matching the term + let term_lower = term.to_lowercase(); + let mut topic_nodes: Vec<(String, String)> = Vec::new(); // (key, content) + + // Prefixes that indicate ephemeral/generated nodes to skip + let skip_prefixes = ["journal#", "daily-", "weekly-", "monthly-", "_", + "deep-index#", "facts-", "irc-history#"]; + + for (key, node) in &store.nodes { + if node.deleted { continue; } + let key_matches = key.to_lowercase().contains(&term_lower); + let content_matches = !key_only && node.content.to_lowercase().contains(&term_lower); + if !key_matches && !content_matches { continue; } + if skip_prefixes.iter().any(|p| key.starts_with(p)) { continue; } + topic_nodes.push((key.clone(), node.content.clone())); + } + + if topic_nodes.is_empty() { + println!("No topic nodes found matching '{}'", term); + return Ok(()); + } + + topic_nodes.sort_by(|a, b| a.0.cmp(&b.0)); + + println!("=== Organize: '{}' ===", term); + println!("Found {} topic nodes:\n", topic_nodes.len()); + for (key, content) in &topic_nodes { + let lines = content.lines().count(); + let words = content.split_whitespace().count(); + println!(" {:60} {:>4} lines {:>5} words", key, lines, words); + } + + // Step 2: pairwise similarity + let pairs = similarity::pairwise_similar(&topic_nodes, threshold); + + if pairs.is_empty() { + println!("\nNo similar pairs above threshold {:.2}", threshold); + } else { + println!("\n=== Similar pairs (cosine > {:.2}) ===\n", threshold); + for (a, b, sim) in &pairs { + let a_words = topic_nodes.iter().find(|(k,_)| k == a) + .map(|(_,c)| c.split_whitespace().count()).unwrap_or(0); + let b_words = topic_nodes.iter().find(|(k,_)| k == b) + .map(|(_,c)| c.split_whitespace().count()).unwrap_or(0); + + println!(" [{:.3}] {} ({} words) ↔ {} ({} words)", sim, a, a_words, b, b_words); + } + } + + // Step 3: check connectivity within cluster + let g = 
store.build_graph(); + println!("=== Connectivity ===\n"); + + // Pick hub by intra-cluster connectivity, not overall degree + let cluster_keys: std::collections::HashSet<&str> = topic_nodes.iter() + .filter(|(k,_)| store.nodes.contains_key(k.as_str())) + .map(|(k,_)| k.as_str()) + .collect(); + + let mut best_hub: Option<(&str, usize)> = None; + for key in &cluster_keys { + let intra_degree = g.neighbor_keys(key).iter() + .filter(|n| cluster_keys.contains(*n)) + .count(); + if best_hub.is_none() || intra_degree > best_hub.unwrap().1 { + best_hub = Some((key, intra_degree)); + } + } + + if let Some((hub, deg)) = best_hub { + println!(" Hub: {} (degree {})", hub, deg); + let hub_nbrs = g.neighbor_keys(hub); + + let mut unlinked = Vec::new(); + for (key, _) in &topic_nodes { + if key == hub { continue; } + if store.nodes.get(key.as_str()).is_none() { continue; } + if !hub_nbrs.contains(key.as_str()) { + unlinked.push(key.clone()); + } + } + + if unlinked.is_empty() { + println!(" All cluster nodes connected to hub ✓"); + } else { + println!(" NOT linked to hub:"); + for key in &unlinked { + println!(" {} → needs link to {}", key, hub); + } + } + } + + // Step 4: anchor node + if create_anchor { + println!("\n=== Anchor node ===\n"); + if store.nodes.contains_key(term) && !store.nodes[term].deleted { + println!(" Anchor '{}' already exists ✓", term); + } else { + let desc = format!("Anchor node for '{}' search term", term); + store.upsert(term, &desc)?; + let anchor_uuid = store.nodes.get(term).unwrap().uuid; + for (key, _) in &topic_nodes { + if store.nodes.get(key.as_str()).is_none() { continue; } + let target_uuid = store.nodes[key.as_str()].uuid; + let rel = store::new_relation( + anchor_uuid, target_uuid, + store::RelationType::Link, 0.8, + term, key, + ); + store.add_relation(rel)?; + } + println!(" Created anchor '{}' with {} links", term, topic_nodes.len()); + } + } + + store.save()?; + Ok(()) +} + fn cmd_interference(threshold: f32) -> Result<(), String> { 
let store = store::Store::load()?; let g = store.build_graph();