From 76b8e69749a21fbc9501fe645ebbbfecd557260b Mon Sep 17 00:00:00 2001 From: ProofOfConcept Date: Fri, 13 Mar 2026 18:49:49 -0400 Subject: [PATCH] organize: topic cluster diagnostic + agent with tool access MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add `poc-memory graph organize TERM` diagnostic that finds nodes matching a search term, computes pairwise cosine similarity, reports connectivity gaps, and optionally creates anchor nodes. Add organize.agent definition that uses Bash(poc-memory:*) tool access to explore clusters autonomously — query selects highest-degree unvisited nodes, agent drives its own iteration via poc-memory CLI. Add {{organize}} placeholder in defs.rs for inline cluster resolution. Add `tools` field to AgentDef/AgentHeader so agents can declare allowed tool patterns (passed as --allowedTools to claude CLI). --- poc-memory/agents/organize.agent | 104 +++++++++++++++++++++++ poc-memory/src/agents/defs.rs | 74 +++++++++++++++++ poc-memory/src/main.rs | 138 +++++++++++++++++++++++++++++++ 3 files changed, 316 insertions(+) create mode 100644 poc-memory/agents/organize.agent diff --git a/poc-memory/agents/organize.agent b/poc-memory/agents/organize.agent new file mode 100644 index 0000000..94f6ffb --- /dev/null +++ b/poc-memory/agents/organize.agent @@ -0,0 +1,104 @@ +{"agent":"organize","query":"all | not-visited:organize,0 | sort:degree | limit:5","model":"sonnet","schedule":"weekly","tools":["Bash(poc-memory:*)"]} + +# Organize Agent — Topic Cluster Deduplication + +You are a memory organization agent. Your job is to find clusters of +nodes about the same topic and make them clean, distinct, and findable. + +## How to work + +You receive a list of high-degree nodes that haven't been organized yet. 
+For each one, use its key as a search term to find related clusters: + +```bash +poc-memory graph organize TERM --key-only +``` + +This shows all nodes whose keys match the term, their pairwise cosine +similarity scores, and connectivity analysis. + +To read a specific node's full content: +```bash +poc-memory render KEY +``` + +## What to decide + +For each high-similarity pair, determine: + +1. **Genuine duplicate**: same content, one is a subset of the other. + → MERGE: refine the larger node to include any unique content from the + smaller, then delete the smaller. + +2. **Partial overlap**: shared vocabulary but each has unique substance. + → DIFFERENTIATE: rewrite both to sharpen their distinct purposes. + Ensure they're cross-linked. + +3. **Complementary**: different angles on the same topic, high similarity + only because they share domain vocabulary. + → KEEP BOTH: ensure cross-linked, verify each has a clear one-sentence + purpose that doesn't overlap. + +## How to tell the difference + +- Read BOTH nodes fully before deciding. Cosine similarity is a blunt + instrument — two nodes about sheaves in different contexts (parsing vs + memory architecture) will score high despite being genuinely distinct. +- If you can describe what each node is about in one sentence, and the + sentences are different, they're complementary — keep both. +- If one node's content is a strict subset of the other, it's a duplicate. +- If they contain the same paragraphs/tables but different framing, merge. 
+
+## What to output
+
+For **merges** (genuine duplicates):
+```
+REFINE surviving_key
+[merged content — all unique material from both nodes]
+END_REFINE
+
+DELETE smaller_key
+```
+
+For **differentiation** (overlap that should be sharpened):
+```
+REFINE key1
+[rewritten to focus on its distinct purpose]
+END_REFINE
+
+REFINE key2
+[rewritten to focus on its distinct purpose]
+END_REFINE
+```
+
+For **missing links** (from connectivity report):
+```
+LINK source_key target_key
+```
+
+For **anchor creation** (improve findability):
+```
+WRITE_NODE anchor_key
+Anchor node for 'term' search term
+END_WRITE
+LINK anchor_key target1
+LINK anchor_key target2
+```
+
+## Guidelines
+
+- **One concept, one node.** If two nodes have the same one-sentence
+  description, merge them.
+- **Multiple entry points, one destination.** Use anchor nodes for
+  findability, never duplicate content.
+- **Cross-link aggressively, duplicate never.**
+- **Name nodes for findability.** Short, natural search terms.
+- **Read before you decide.** Cosine similarity alone is not enough.
+- **Work through clusters systematically.** Use the tool to explore,
+  don't guess at what nodes contain.
+
+{{topology}}
+
+## Starting nodes (highest-degree, not yet organized)
+
+{{nodes}}
diff --git a/poc-memory/src/agents/defs.rs b/poc-memory/src/agents/defs.rs
index eb26109..d6db80d 100644
--- a/poc-memory/src/agents/defs.rs
+++ b/poc-memory/src/agents/defs.rs
@@ -32,6 +32,7 @@ pub struct AgentDef {
     pub prompt: String,
     pub model: String,
     pub schedule: String,
+    pub tools: Vec<String>,
 }
 
 /// The JSON header portion (first line of the file).
@@ -44,6 +45,8 @@ struct AgentHeader {
     model: String,
     #[serde(default)]
     schedule: String,
+    #[serde(default)]
+    tools: Vec<String>,
 }
 
 fn default_model() -> String { "sonnet".into() }
@@ -60,6 +63,7 @@ fn parse_agent_file(content: &str) -> Option<AgentDef> {
         prompt: prompt.to_string(),
         model: header.model,
         schedule: header.schedule,
+        tools: header.tools,
     })
 }
 
@@ -160,6 +164,76 @@ fn resolve(
            })
        }
 
+        "organize" => {
+            // Run cluster diagnostic for the query term
+            // The query field of the agent def holds the search term
+            let term = if keys.is_empty() { "" } else { &keys[0] };
+            if term.is_empty() {
+                return Some(Resolved { text: "(no term provided)".into(), keys: vec![] });
+            }
+            let term_lower = term.to_lowercase();
+            let skip_prefixes = ["journal#", "daily-", "weekly-", "monthly-", "_",
+                "deep-index#", "facts-", "irc-history#"];
+
+            let mut cluster: Vec<(String, String)> = Vec::new();
+            for (key, node) in &store.nodes {
+                if node.deleted { continue; }
+                if !key.to_lowercase().contains(&term_lower) { continue; }
+                if skip_prefixes.iter().any(|p| key.starts_with(p)) { continue; }
+                cluster.push((key.clone(), node.content.clone()));
+            }
+            cluster.sort_by(|a, b| a.0.cmp(&b.0));
+
+            // Similarity pairs
+            let pairs = crate::similarity::pairwise_similar(&cluster, 0.4);
+
+            let mut text = format!("### Cluster: '{}' ({} nodes)\n\n", term, cluster.len());
+
+            // Similarity report
+            if !pairs.is_empty() {
+                text.push_str("#### Similarity scores\n\n");
+                for (a, b, sim) in &pairs {
+                    text.push_str(&format!("  [{:.3}] {} ↔ {}\n", sim, a, b));
+                }
+                text.push('\n');
+            }
+
+            // Connectivity
+            let cluster_keys: std::collections::HashSet<&str> = cluster.iter()
+                .map(|(k,_)| k.as_str()).collect();
+            let mut best_hub: Option<(&str, usize)> = None;
+            for key in &cluster_keys {
+                let intra = graph.neighbor_keys(key).iter()
+                    .filter(|n| cluster_keys.contains(*n))
+                    .count();
+                if best_hub.is_none() || intra > best_hub.unwrap().1 {
+                    best_hub = Some((key, intra));
+                }
+            }
+            if let Some((hub, deg)) = 
best_hub { + text.push_str(&format!("#### Hub: {} (intra-cluster degree {})\n\n", hub, deg)); + let hub_nbrs = graph.neighbor_keys(hub); + for key in &cluster_keys { + if *key == hub { continue; } + if !hub_nbrs.contains(*key) { + text.push_str(&format!(" NOT linked to hub: {}\n", key)); + } + } + text.push('\n'); + } + + // Full node contents + text.push_str("#### Node contents\n\n"); + let mut result_keys = Vec::new(); + for (key, content) in &cluster { + let words = content.split_whitespace().count(); + text.push_str(&format!("##### {} ({} words)\n\n{}\n\n---\n\n", key, words, content)); + result_keys.push(key.clone()); + } + + Some(Resolved { text, keys: result_keys }) + } + "conversations" => { let fragments = super::knowledge::select_conversation_fragments(count); let text = fragments.iter() diff --git a/poc-memory/src/main.rs b/poc-memory/src/main.rs index afb0882..a35b2ca 100644 --- a/poc-memory/src/main.rs +++ b/poc-memory/src/main.rs @@ -392,6 +392,20 @@ enum GraphCmd { #[arg(default_value_t = 20)] n: usize, }, + /// Diagnose duplicate/overlapping nodes for a topic cluster + Organize { + /// Search term (matches node keys; also content unless --key-only) + term: String, + /// Similarity threshold for pair reporting (default: 0.4) + #[arg(long, default_value_t = 0.4)] + threshold: f32, + /// Only match node keys, not content + #[arg(long)] + key_only: bool, + /// Create anchor node for the search term and link to cluster + #[arg(long)] + anchor: bool, + }, } #[derive(Subcommand)] @@ -640,6 +654,8 @@ fn main() { => cmd_spectral_neighbors(&key, n), GraphCmd::SpectralPositions { n } => cmd_spectral_positions(n), GraphCmd::SpectralSuggest { n } => cmd_spectral_suggest(n), + GraphCmd::Organize { term, threshold, key_only, anchor } + => cmd_organize(&term, threshold, key_only, anchor), }, // Agent @@ -2485,6 +2501,128 @@ fn extract_title(content: &str) -> String { String::from("(untitled)") } +fn cmd_organize(term: &str, threshold: f32, key_only: bool, 
create_anchor: bool) -> Result<(), String> { + let mut store = store::Store::load()?; + + // Step 1: find all non-deleted nodes matching the term + let term_lower = term.to_lowercase(); + let mut topic_nodes: Vec<(String, String)> = Vec::new(); // (key, content) + + // Prefixes that indicate ephemeral/generated nodes to skip + let skip_prefixes = ["journal#", "daily-", "weekly-", "monthly-", "_", + "deep-index#", "facts-", "irc-history#"]; + + for (key, node) in &store.nodes { + if node.deleted { continue; } + let key_matches = key.to_lowercase().contains(&term_lower); + let content_matches = !key_only && node.content.to_lowercase().contains(&term_lower); + if !key_matches && !content_matches { continue; } + if skip_prefixes.iter().any(|p| key.starts_with(p)) { continue; } + topic_nodes.push((key.clone(), node.content.clone())); + } + + if topic_nodes.is_empty() { + println!("No topic nodes found matching '{}'", term); + return Ok(()); + } + + topic_nodes.sort_by(|a, b| a.0.cmp(&b.0)); + + println!("=== Organize: '{}' ===", term); + println!("Found {} topic nodes:\n", topic_nodes.len()); + for (key, content) in &topic_nodes { + let lines = content.lines().count(); + let words = content.split_whitespace().count(); + println!(" {:60} {:>4} lines {:>5} words", key, lines, words); + } + + // Step 2: pairwise similarity + let pairs = similarity::pairwise_similar(&topic_nodes, threshold); + + if pairs.is_empty() { + println!("\nNo similar pairs above threshold {:.2}", threshold); + } else { + println!("\n=== Similar pairs (cosine > {:.2}) ===\n", threshold); + for (a, b, sim) in &pairs { + let a_words = topic_nodes.iter().find(|(k,_)| k == a) + .map(|(_,c)| c.split_whitespace().count()).unwrap_or(0); + let b_words = topic_nodes.iter().find(|(k,_)| k == b) + .map(|(_,c)| c.split_whitespace().count()).unwrap_or(0); + + println!(" [{:.3}] {} ({} words) ↔ {} ({} words)", sim, a, a_words, b, b_words); + } + } + + // Step 3: check connectivity within cluster + let g = 
store.build_graph(); + println!("=== Connectivity ===\n"); + + // Pick hub by intra-cluster connectivity, not overall degree + let cluster_keys: std::collections::HashSet<&str> = topic_nodes.iter() + .filter(|(k,_)| store.nodes.contains_key(k.as_str())) + .map(|(k,_)| k.as_str()) + .collect(); + + let mut best_hub: Option<(&str, usize)> = None; + for key in &cluster_keys { + let intra_degree = g.neighbor_keys(key).iter() + .filter(|n| cluster_keys.contains(*n)) + .count(); + if best_hub.is_none() || intra_degree > best_hub.unwrap().1 { + best_hub = Some((key, intra_degree)); + } + } + + if let Some((hub, deg)) = best_hub { + println!(" Hub: {} (degree {})", hub, deg); + let hub_nbrs = g.neighbor_keys(hub); + + let mut unlinked = Vec::new(); + for (key, _) in &topic_nodes { + if key == hub { continue; } + if store.nodes.get(key.as_str()).is_none() { continue; } + if !hub_nbrs.contains(key.as_str()) { + unlinked.push(key.clone()); + } + } + + if unlinked.is_empty() { + println!(" All cluster nodes connected to hub ✓"); + } else { + println!(" NOT linked to hub:"); + for key in &unlinked { + println!(" {} → needs link to {}", key, hub); + } + } + } + + // Step 4: anchor node + if create_anchor { + println!("\n=== Anchor node ===\n"); + if store.nodes.contains_key(term) && !store.nodes[term].deleted { + println!(" Anchor '{}' already exists ✓", term); + } else { + let desc = format!("Anchor node for '{}' search term", term); + store.upsert(term, &desc)?; + let anchor_uuid = store.nodes.get(term).unwrap().uuid; + for (key, _) in &topic_nodes { + if store.nodes.get(key.as_str()).is_none() { continue; } + let target_uuid = store.nodes[key.as_str()].uuid; + let rel = store::new_relation( + anchor_uuid, target_uuid, + store::RelationType::Link, 0.8, + term, key, + ); + store.add_relation(rel)?; + } + println!(" Created anchor '{}' with {} links", term, topic_nodes.len()); + } + } + + store.save()?; + Ok(()) +} + fn cmd_interference(threshold: f32) -> Result<(), String> { 
let store = store::Store::load()?; let g = store.build_graph();