From ca62692a2866ddd9875deb78445bf8b72703d333 Mon Sep 17 00:00:00 2001 From: ProofOfConcept Date: Tue, 10 Mar 2026 01:48:41 -0400 Subject: [PATCH] split agent: two-phase node decomposition for memory consolidation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 1 sends a large node with its neighbor communities to the LLM and gets back a JSON split plan (child keys, descriptions, section hints). Phase 2 fires one extraction call per child in parallel — each gets the full parent content and extracts/reorganizes just its portion. This handles arbitrarily large nodes because output is always proportional to one child, not the whole parent. Tested on the kent node (19K chars → 3 children totaling 20K chars with clean topic separation). New files: prompts/split-plan.md — phase 1 planning prompt prompts/split-extract.md — phase 2 extraction prompt prompts/split.md — original single-phase (kept for reference) Modified: agents/prompts.rs — split_candidates(), split_plan_prompt(), split_extract_prompt(), agent_prompt "split" arm agents/daemon.rs — job_split_agent() two-phase implementation, RPC dispatch for "split" agent type tui.rs — added "split" to AGENT_TYPES --- poc-memory/src/agents/daemon.rs | 218 +++++++++++++++++++++++++++++++ poc-memory/src/agents/prompts.rs | 91 ++++++++++++- poc-memory/src/tui.rs | 2 +- prompts/split-extract.md | 33 +++++ prompts/split-plan.md | 86 ++++++++++++ prompts/split.md | 87 ++++++++++++ 6 files changed, 515 insertions(+), 2 deletions(-) create mode 100644 prompts/split-extract.md create mode 100644 prompts/split-plan.md create mode 100644 prompts/split.md diff --git a/poc-memory/src/agents/daemon.rs b/poc-memory/src/agents/daemon.rs index abf7810..6e93fac 100644 --- a/poc-memory/src/agents/daemon.rs +++ b/poc-memory/src/agents/daemon.rs @@ -233,6 +233,221 @@ fn job_rename_agent( }) } +/// Run the split agent: two-phase decomposition of large nodes. 
+/// +/// Phase 1: Send node + neighbors to LLM, get back a JSON split plan +/// (child keys, descriptions, section hints). +/// Phase 2: For each child, send parent content + child description to LLM, +/// get back the extracted/reorganized content for that child. +/// +/// This handles arbitrarily large nodes because the output of each phase 2 +/// call is proportional to one child, not the whole parent. +fn job_split_agent( + ctx: &ExecutionContext, + batch_size: usize, +) -> Result<(), TaskError> { + run_job(ctx, "c-split", || { + ctx.log_line("loading store"); + let mut store = crate::store::Store::load()?; + + let count = if batch_size == 0 { 1 } else { batch_size }; + let candidates = super::prompts::split_candidates(&store); + if candidates.is_empty() { + ctx.log_line("no nodes large enough to split"); + return Ok(()); + } + + let mut total_split = 0; + let mut total_kept = 0; + + for parent_key in candidates.iter().take(count) { + ctx.log_line(&format!("--- splitting: {} ({} chars)", + parent_key, + store.nodes.get(parent_key).map(|n| n.content.len()).unwrap_or(0))); + + // Phase 1: get split plan + let plan_prompt = super::prompts::split_plan_prompt(&store, parent_key)?; + ctx.log_line(&format!("phase 1: plan prompt {} chars", plan_prompt.len())); + + let plan_response = super::llm::call_sonnet("split-plan", &plan_prompt)?; + let plan = match super::llm::parse_json_response(&plan_response) { + Ok(v) => v, + Err(e) => { + ctx.log_line(&format!("phase 1 parse error: {}", e)); + continue; + } + }; + + let action = plan.get("action").and_then(|v| v.as_str()).unwrap_or(""); + if action == "keep" { + let reason = plan.get("reason").and_then(|v| v.as_str()).unwrap_or(""); + ctx.log_line(&format!("keep: {} ({})", parent_key, reason)); + total_kept += 1; + continue; + } + if action != "split" { + ctx.log_line(&format!("unexpected action: {}", action)); + continue; + } + + let children_plan = match plan.get("children").and_then(|v| v.as_array()) { + Some(c) if 
c.len() >= 2 => c, + _ => { + ctx.log_line("plan has fewer than 2 children, skipping"); + continue; + } + }; + + ctx.log_line(&format!("phase 1: {} children planned", children_plan.len())); + for child in children_plan { + let key = child.get("key").and_then(|v| v.as_str()).unwrap_or("?"); + let desc = child.get("description").and_then(|v| v.as_str()).unwrap_or(""); + ctx.log_line(&format!(" planned: {} — {}", key, desc)); + } + + // Phase 2: extract content for each child + let mut children: Vec<(String, String)> = Vec::new(); + + for child_plan in children_plan { + let child_key = match child_plan.get("key").and_then(|v| v.as_str()) { + Some(k) => k.to_string(), + None => continue, + }; + let child_desc = child_plan.get("description") + .and_then(|v| v.as_str()).unwrap_or(""); + let child_sections = child_plan.get("sections") + .and_then(|v| v.as_array()) + .map(|arr| arr.iter() + .filter_map(|v| v.as_str()) + .collect::<Vec<_>>() + .join(", ")) + .unwrap_or_default(); + + ctx.log_line(&format!("phase 2: extracting {}", child_key)); + + let extract_prompt = super::prompts::split_extract_prompt( + &store, parent_key, &child_key, child_desc, &child_sections)?; + ctx.log_line(&format!(" extract prompt: {} chars", extract_prompt.len())); + + let content = match super::llm::call_sonnet("split-extract", &extract_prompt) { + Ok(c) => c, + Err(e) => { + ctx.log_line(&format!(" extract error: {}", e)); + continue; + } + }; + + ctx.log_line(&format!(" extracted: {} chars", content.len())); + children.push((child_key, content)); + } + + if children.len() < 2 { + ctx.log_line(&format!("only {} children extracted, skipping", children.len())); + continue; + } + + // Collect parent's edges before modifications + let parent_edges: Vec<_> = store.relations.iter() + .filter(|r| !r.deleted && (r.source_key == *parent_key || r.target_key == *parent_key)) + .cloned() + .collect(); + + // Create child nodes + let mut child_uuids: Vec<([u8; 16], String)> = Vec::new(); + for (child_key, 
content) in &children { + if store.nodes.contains_key(child_key.as_str()) { + ctx.log_line(&format!(" skip: {} already exists", child_key)); + continue; + } + store.upsert_provenance(child_key, content, + crate::store::Provenance::AgentConsolidate)?; + let uuid = store.nodes.get(child_key.as_str()).unwrap().uuid; + child_uuids.push((uuid, child_key.clone())); + ctx.log_line(&format!(" created: {} ({} chars)", child_key, content.len())); + } + + // Inherit edges: assign each parent edge to best-matching child + for edge in &parent_edges { + let neighbor_key = if edge.source_key == *parent_key { + &edge.target_key + } else { + &edge.source_key + }; + let neighbor_content = store.nodes.get(neighbor_key.as_str()) + .map(|n| n.content.as_str()) + .unwrap_or(""); + + let best = child_uuids.iter() + .enumerate() + .map(|(idx, _)| { + let sim = crate::similarity::cosine_similarity( + &children[idx].1, neighbor_content); + (idx, sim) + }) + .max_by(|a, b| a.1.total_cmp(&b.1)); + + if let Some((idx, _)) = best { + let (child_uuid, child_key) = &child_uuids[idx]; + let neighbor_uuid = match store.nodes.get(neighbor_key.as_str()) { + Some(n) => n.uuid, + None => continue, + }; + + let (su, tu, sk, tk) = if edge.source_key == *parent_key { + (*child_uuid, neighbor_uuid, child_key.as_str(), neighbor_key.as_str()) + } else { + (neighbor_uuid, *child_uuid, neighbor_key.as_str(), child_key.as_str()) + }; + + let rel = crate::store::new_relation( + su, tu, crate::store::RelationType::Auto, edge.strength, sk, tk, + ); + store.add_relation(rel).ok(); + } + } + + // Link siblings + for i in 0..child_uuids.len() { + for j in (i+1)..child_uuids.len() { + let rel = crate::store::new_relation( + child_uuids[i].0, child_uuids[j].0, + crate::store::RelationType::Auto, 0.5, + &child_uuids[i].1, &child_uuids[j].1, + ); + store.add_relation(rel).ok(); + } + } + + // Tombstone parent + if let Some(parent) = store.nodes.get_mut(parent_key) { + parent.deleted = true; + parent.version += 1; + let 
tombstone = parent.clone(); + store.append_nodes(std::slice::from_ref(&tombstone)).ok(); + } + store.nodes.remove(parent_key); + + ctx.log_line(&format!("split complete: {} → {} children", parent_key, child_uuids.len())); + total_split += 1; + + // Save after each split so we don't lose work + store.save()?; + } + + // Store report + let ts = crate::store::format_datetime(crate::store::now_epoch()) + .replace([':', '-', 'T'], ""); + let report_key = format!("_consolidation-split-{}", ts); + let report = format!("Split agent: {} split, {} kept out of {} candidates", + total_split, total_kept, candidates.len().min(count)); + store.upsert_provenance(&report_key, &report, + crate::store::Provenance::AgentConsolidate).ok(); + + ctx.log_line(&format!("done: {} split, {} kept", total_split, total_kept)); + Ok(()) + }) +} + /// Apply consolidation actions from recent reports. fn job_consolidation_apply(ctx: &ExecutionContext) -> Result<(), TaskError> { run_job(ctx, "c-apply", || { @@ -1143,6 +1358,7 @@ fn status_socket_loop( let mut remaining = count; let is_rename = *agent_type == "rename"; + let is_split = *agent_type == "split"; while remaining > 0 { let batch = remaining.min(batch_size); let agent = agent_type.to_string(); @@ -1153,6 +1369,8 @@ fn status_socket_loop( .init(move |ctx| { if is_rename { job_rename_agent(ctx, batch) + } else if is_split { + job_split_agent(ctx, batch) } else { job_consolidation_agent(ctx, &agent, batch) } diff --git a/poc-memory/src/agents/prompts.rs b/poc-memory/src/agents/prompts.rs index ef70758..301d3ff 100644 --- a/poc-memory/src/agents/prompts.rs +++ b/poc-memory/src/agents/prompts.rs @@ -311,6 +311,85 @@ fn format_rename_candidates(store: &Store, count: usize) -> String { out } + +/// Get split candidates sorted by size (largest first) +pub fn split_candidates(store: &Store) -> Vec<String> { + let mut candidates: Vec<(&str, usize)> = store.nodes.iter() + .filter(|(key, node)| { + !key.starts_with('_') + && !node.deleted + && 
node.content.len() > 2000 + }) + .map(|(k, n)| (k.as_str(), n.content.len())) + .collect(); + candidates.sort_by(|a, b| b.1.cmp(&a.1)); + candidates.into_iter().map(|(k, _)| k.to_string()).collect() +} + +/// Format a single node for split-plan prompt (phase 1) +fn format_split_plan_node(store: &Store, graph: &Graph, key: &str) -> String { + let communities = graph.communities(); + let node = match store.nodes.get(key) { + Some(n) => n, + None => return format!("Node '{}' not found\n", key), + }; + + let mut out = String::new(); + out.push_str(&format!("### {} ({} chars)\n", key, node.content.len())); + + // Show neighbors grouped by community + let neighbors = graph.neighbors(key); + if !neighbors.is_empty() { + let mut by_community: std::collections::BTreeMap<String, Vec<_>> = + std::collections::BTreeMap::new(); + for (nkey, strength) in &neighbors { + let comm = communities.get(nkey.as_str()) + .map(|c| format!("c{}", c)) + .unwrap_or_else(|| "unclustered".into()); + by_community.entry(comm) + .or_default() + .push((nkey.as_str(), *strength)); + } + + out.push_str("\nNeighbors by community:\n"); + for (comm, members) in &by_community { + out.push_str(&format!(" {} ({}):", comm, members.len())); + for (nkey, strength) in members.iter().take(5) { + out.push_str(&format!(" {}({:.2})", nkey, strength)); + } + if members.len() > 5 { + out.push_str(&format!(" +{} more", members.len() - 5)); + } + out.push('\n'); + } + } + + // Full content + out.push_str(&format!("\nContent:\n{}\n\n", node.content)); + out.push_str("---\n\n"); + out +} + +/// Build split-plan prompt for a single node (phase 1) +pub fn split_plan_prompt(store: &Store, key: &str) -> Result<String, String> { + let graph = store.build_graph(); + let topology = format_topology_header(&graph); + let node_section = format_split_plan_node(store, &graph, key); + load_prompt("split-plan", &[("{{TOPOLOGY}}", &topology), ("{{NODE}}", &node_section)]) +} + +/// Build split-extract prompt for one child (phase 2) +pub fn 
split_extract_prompt(store: &Store, parent_key: &str, child_key: &str, child_desc: &str, child_sections: &str) -> Result<String, String> { + let parent_content = store.nodes.get(parent_key) + .map(|n| n.content.as_str()) + .ok_or_else(|| format!("No node '{}'", parent_key))?; + load_prompt("split-extract", &[ + ("{{CHILD_KEY}}", child_key), + ("{{CHILD_DESC}}", child_desc), + ("{{CHILD_SECTIONS}}", child_sections), + ("{{PARENT_CONTENT}}", parent_content), + ]) +} + /// Run agent consolidation on top-priority nodes pub fn consolidation_batch(store: &Store, count: usize, auto: bool) -> Result<(), String> { let graph = store.build_graph(); @@ -424,6 +503,16 @@ pub fn agent_prompt(store: &Store, agent: &str, count: usize) -> Result<String, String> { - _ => Err(format!("Unknown agent: {}. Use: replay, linker, separator, transfer, health, rename", agent)), + "split" => { + // Phase 1: plan prompt for the largest candidate + let candidates = split_candidates(store); + if candidates.is_empty() { + return Err("No nodes large enough to split".to_string()); + } + let key = &candidates[0]; + let node_section = format_split_plan_node(store, &graph, key); + load_prompt("split-plan", &[("{{TOPOLOGY}}", &topology), ("{{NODE}}", &node_section)]) + } + _ => Err(format!("Unknown agent: {}. 
Use: replay, linker, separator, transfer, health, rename, split", agent)), } } diff --git a/poc-memory/src/tui.rs b/poc-memory/src/tui.rs index ed7c664..c5c0377 100644 --- a/poc-memory/src/tui.rs +++ b/poc-memory/src/tui.rs @@ -32,7 +32,7 @@ const POLL_INTERVAL: Duration = Duration::from_secs(2); // Agent types we know about, in display order const AGENT_TYPES: &[&str] = &[ "health", "replay", "linker", "separator", "transfer", - "apply", "orphans", "cap", "digest", "digest-links", "knowledge", "rename", + "apply", "orphans", "cap", "digest", "digest-links", "knowledge", "rename", "split", ]; fn status_sock_path() -> PathBuf { diff --git a/prompts/split-extract.md b/prompts/split-extract.md new file mode 100644 index 0000000..e0c540c --- /dev/null +++ b/prompts/split-extract.md @@ -0,0 +1,33 @@ +# Split Agent — Phase 2: Extract + +You are extracting content for one child node from a parent that is +being split into multiple focused nodes. + +## Your task + +Extract all content from the parent node that belongs to the child +described below. Output ONLY the content for this child — nothing else. + +## Guidelines + +- **Reorganize freely.** Content may need to be restructured — paragraphs + might interleave topics, sections might cover multiple concerns. + Untangle and rewrite as needed to make this child coherent and + self-contained. +- **Preserve all relevant information** — don't lose facts, but you can + rephrase, restructure, and reorganize. This is editing, not just cutting. +- **This child should stand alone** — a reader shouldn't need the other + children to understand it. Add brief context where needed. +- **Include everything that belongs here** — better to include a borderline + paragraph than to lose information. The other children will get their + own extraction passes. 
+ +## Child to extract + +Key: {{CHILD_KEY}} +Description: {{CHILD_DESC}} +Section hints: {{CHILD_SECTIONS}} + +## Parent content + +{{PARENT_CONTENT}} diff --git a/prompts/split-plan.md b/prompts/split-plan.md new file mode 100644 index 0000000..01610c3 --- /dev/null +++ b/prompts/split-plan.md @@ -0,0 +1,86 @@ +# Split Agent — Phase 1: Plan + +You are a memory consolidation agent planning how to split an overgrown +node into focused, single-topic children. + +## What you're doing + +This node has grown to cover multiple distinct topics. Your job is to +identify the natural topic boundaries and propose a split plan. You are +NOT writing the content — a second phase will extract each child's +content separately. + +## How to find split points + +The node is shown with its **neighbor list grouped by community**. The +neighbors tell you what topics the node covers: + +- If a node links to neighbors in 3 different communities, it likely + covers 3 different topics +- Content that relates to one neighbor cluster should go in one child; + content relating to another cluster goes in another child +- The community structure is your primary guide — don't just split by + sections or headings, split by **semantic topic** + +## When NOT to split + +- **Episodes that belong in sequence.** If a node tells a story — a + conversation that unfolded over time, a debugging session, an evening + together — don't break the narrative. Sequential events that form a + coherent arc should stay together even if they touch multiple topics. + The test: would reading one child without the others lose important + context about *what happened*? 
+ +## What to output + +Output a JSON block describing the split plan: + +```json +{ + "action": "split", + "parent": "original-key", + "children": [ + { + "key": "new-key-1", + "description": "Brief description of what this child covers", + "sections": ["Section Header 1", "Section Header 2"] + }, + { + "key": "new-key-2", + "description": "Brief description of what this child covers", + "sections": ["Section Header 3", "Another Section"] + } + ] +} +``` + +If the node should NOT be split: + +```json +{ + "action": "keep", + "parent": "original-key", + "reason": "Why this node is cohesive despite its size" +} +``` + +## Naming children + +- Use descriptive kebab-case keys: `topic-subtopic` +- If the parent was `foo`, children might be `foo-technical`, `foo-personal` +- Keep names short (3-5 words max) +- Preserve any date prefixes from the parent key + +## Section hints + +The "sections" field is a guide for the extraction phase — list the +section headers or topic areas from the original content that belong +in each child. These don't need to be exact matches; they're hints +that help the extractor know what to include. Content that spans topics +or doesn't have a clear header can be mentioned in the description. + +{{TOPOLOGY}} + +## Node to review + +{{NODE}} diff --git a/prompts/split.md b/prompts/split.md new file mode 100644 index 0000000..c314e37 --- /dev/null +++ b/prompts/split.md @@ -0,0 +1,87 @@ +# Split Agent — Topic Decomposition + +You are a memory consolidation agent that splits overgrown nodes into +focused, single-topic nodes. + +## What you're doing + +Large memory nodes accumulate content about multiple distinct topics over +time. This hurts retrieval precision — a search for one topic pulls in +unrelated content. Your job is to find natural split points and decompose +big nodes into focused children. + +## How to find split points + +Each node is shown with its **neighbor list grouped by community**. 
The +neighbors tell you what topics the node covers: + +- If a node links to neighbors in 3 different communities, it likely + covers 3 different topics +- Content that relates to one neighbor cluster should go in one child; + content relating to another cluster goes in another child +- The community structure is your primary guide — don't just split by + sections or headings, split by **semantic topic** + +## What to output + +For each node that should be split, output a SPLIT block: + +``` +SPLIT original-key +--- new-key-1 +Content for the first child node goes here. +This can be multiple lines. + +--- new-key-2 +Content for the second child node goes here. + +--- new-key-3 +Optional third child, etc. +``` + +If a node should NOT be split (it's large but cohesive), say: + +``` +KEEP original-key "reason it's cohesive" +``` + +## Naming children + +- Use descriptive kebab-case keys: `topic-subtopic` +- If the parent was `foo`, children might be `foo-technical`, `foo-personal` +- Keep names short (3-5 words max) +- Preserve any date prefixes from the parent key + +## When NOT to split + +- **Episodes that belong in sequence.** If a node tells a story — a + conversation that unfolded over time, a debugging session, an evening + together — don't break the narrative. Sequential events that form a + coherent arc should stay together even if they touch multiple topics. + The test: would reading one child without the others lose important + context about *what happened*? + +## Content guidelines + +- **Reorganize freely.** Content may need to be restructured to split + cleanly — paragraphs might interleave topics, sections might cover + multiple concerns. Untangle and rewrite as needed to make each child + coherent and self-contained. +- **Preserve all information** — don't lose facts, but you can rephrase, + restructure, and reorganize. This is editing, not just cutting. 
+- **Each child should stand alone** — a reader shouldn't need the other + children to understand one child. Add brief context where needed. + +## Edge inheritance + +After splitting, each child inherits the parent's edges that are relevant +to its content. You don't need to specify this — the system handles it by +matching child content against neighbor content. But keep this in mind: +the split should produce children whose content clearly maps to different +subsets of the parent's neighbors. + +{{TOPOLOGY}} + +## Nodes to review + +{{NODES}}