consciousness/scripts/bulk-categorize.py

201 lines
6.5 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""bulk-categorize.py — categorize all memory nodes via a single Sonnet call.
Sends the list of unique file names to Sonnet, gets back categorizations,
then applies them via poc-memory categorize.
Usage:
bulk-categorize.py # dry run
bulk-categorize.py --apply # apply categorizations
"""
import json
import os
import re
import subprocess
import sys
import tempfile
from pathlib import Path
MEMORY_DIR = Path.home() / ".claude" / "memory"
SCRIPTS_DIR = Path(__file__).parent
def call_sonnet(prompt: str, timeout: int = 300) -> str:
    """Call Sonnet via the call-sonnet.sh wrapper script.

    Writes *prompt* to a temporary file, invokes the wrapper with that
    file as its argument, and returns the model's stdout (stripped).
    Every failure mode -- timeout, missing wrapper, non-zero exit -- is
    reported as a string beginning with "Error:" so callers can check
    with ``response.startswith("Error:")``.
    """
    env = dict(os.environ)
    # Drop CLAUDECODE so the wrapper doesn't think it is running nested
    # inside an existing Claude Code session.
    env.pop("CLAUDECODE", None)
    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt',
                                     delete=False) as f:
        f.write(prompt)
        prompt_file = f.name
    try:
        wrapper = str(SCRIPTS_DIR / "call-sonnet.sh")
        result = subprocess.run(
            [wrapper, prompt_file],
            capture_output=True,
            text=True,
            timeout=timeout,
            env=env,
        )
        # BUG FIX: the original never checked the exit status, so a
        # failed wrapper run silently returned (often empty) stdout.
        if result.returncode != 0:
            return f"Error: wrapper exited {result.returncode}: {result.stderr.strip()}"
        return result.stdout.strip()
    except subprocess.TimeoutExpired:
        return "Error: Sonnet call timed out"
    except Exception as e:
        return f"Error: {e}"
    finally:
        # Always remove the temp file, even on error paths.
        os.unlink(prompt_file)
def get_all_keys() -> list[str]:
    """Return all node keys via `poc-memory list-keys`.

    Returns an empty list on any failure: non-zero exit, the command
    not being installed, or a timeout.  (The original let
    FileNotFoundError / TimeoutExpired propagate as tracebacks, which
    contradicts the "return [] on failure" contract of the
    returncode check.)
    """
    try:
        r = subprocess.run(["poc-memory", "list-keys"],
                           capture_output=True, text=True, timeout=30)
    except (OSError, subprocess.SubprocessError):
        return []
    if r.returncode != 0:
        return []
    # Drop empty lines (e.g. trailing newline) from the key list.
    return [k for k in r.stdout.strip().split('\n') if k]
def get_unique_files(keys: list[str]) -> list[str]:
    """Return the sorted unique file names, with '#section' anchors stripped."""
    return sorted({key.split('#')[0] for key in keys})
def build_prompt(files: list[str]) -> str:
    """Build the categorization prompt sent to Sonnet.

    Each file name is accompanied by a short preview pulled from
    `poc-memory render`, so the model sees some content rather than
    bare file names.  Rendering is best-effort: failures become a
    "(render failed)" marker instead of aborting the run.
    """

    def preview_line(name: str) -> str:
        # Best effort -- any subprocess failure (missing command,
        # timeout, etc.) must not abort prompt construction.
        try:
            proc = subprocess.run(
                ["poc-memory", "render", name],
                capture_output=True, text=True, timeout=10
            )
        except Exception:
            return f"  {name}: (render failed)"
        text = proc.stdout.strip()
        if not text:
            return f"  {name}: (no content)"
        # First 5 lines, capped at 300 chars, flattened onto one line.
        head = '\n'.join(text.split('\n')[:5])[:300]
        return f"  {name}: {head.replace(chr(10), ' | ')}"

    previews_text = '\n'.join(preview_line(name) for name in files)
    return f"""Categorize each memory file into one of these categories:
- **core**: Identity, relationships, self-model, values, boundaries, emotional life.
Examples: identity.md, kent.md, inner-life.md, differentiation.md
- **tech**: Technical content bcachefs, code patterns, Rust, kernel, formal verification.
Examples: rust-conversion.md, btree-journal.md, kernel-patterns.md, allocation-io.md
- **obs**: Observations, experiences, discoveries, experiments, IRC history, conversations.
Examples: discoveries.md, irc-history.md, contradictions.md, experiments-on-self.md
- **task**: Work items, plans, design documents, work queue.
Examples: work-queue.md, the-plan.md, design-*.md
Special rules:
- Episodic files (daily-*.md, weekly-*.md, monthly-*.md, session-*.md) obs
- conversation-memories.md, deep-index.md obs
- journal.md obs
- paper-notes.md core (it's the sentience paper, identity-defining)
- language-theory.md core (original intellectual work, not just tech)
- skill-*.md core (self-knowledge about capabilities)
- design-*.md task (design documents are plans)
- poc-architecture.md, memory-architecture.md task (architecture plans)
- blog-setup.md task
Files to categorize:
{previews_text}
Output ONLY a JSON object mapping filename to category. No explanation.
Example: {{"identity.md": "core", "rust-conversion.md": "tech"}}
"""
def main():
    """Categorize every memory file in one Sonnet call.

    Dry run by default (prints the plan and saves a preview JSON);
    pass --apply to write categorizations back via `poc-memory
    categorize`.  Exits non-zero if there is nothing to categorize,
    the model call fails, or its response cannot be parsed.
    """
    do_apply = "--apply" in sys.argv
    keys = get_all_keys()
    files = get_unique_files(keys)
    print(f"Found {len(keys)} nodes across {len(files)} files")
    if not files:
        # BUG FIX: don't burn a model call on an empty file list
        # (e.g. poc-memory unavailable or an empty store).
        print("No files found; nothing to do.")
        sys.exit(1)
    # Build and send prompt
    prompt = build_prompt(files)
    print(f"Prompt: {len(prompt):,} chars")
    print("Calling Sonnet...")
    response = call_sonnet(prompt)
    if response.startswith("Error:"):
        print(f" {response}")
        sys.exit(1)
    # Strip an optional markdown code fence.  BUG FIX: also accept a
    # bare ``` opener, not only ```json.
    response = re.sub(r'^```(?:json)?\s*', '', response.strip())
    response = re.sub(r'\s*```$', '', response.strip())
    try:
        categorizations = json.loads(response)
    except json.JSONDecodeError:
        # Fall back to the first {...} span in case the model wrapped
        # the JSON in prose.  BUG FIX: the fallback parse itself could
        # raise an uncaught JSONDecodeError in the original.
        categorizations = None
        match = re.search(r'\{.*\}', response, re.DOTALL)
        if match:
            try:
                categorizations = json.loads(match.group())
            except json.JSONDecodeError:
                categorizations = None
        if categorizations is None:
            print(f"Failed to parse response: {response[:500]}")
            sys.exit(1)
    print(f"\nCategorizations: {len(categorizations)} files")
    # Count by category
    counts = {}
    for cat in categorizations.values():
        counts[cat] = counts.get(cat, 0) + 1
    for cat, n in sorted(counts.items()):
        print(f" {cat}: {n}")
    if not do_apply:
        print("\n--- Dry run ---")
        for f, cat in sorted(categorizations.items()):
            # BUG FIX: the original printed filename and category with
            # no separator between them.
            print(f" {f} -> {cat}")
        print(f"\nTo apply: {sys.argv[0]} --apply")
        # Save for review
        out = MEMORY_DIR / "agent-results" / "bulk-categorize-preview.json"
        # BUG FIX: the directory may not exist yet; open() would raise.
        out.parent.mkdir(parents=True, exist_ok=True)
        with open(out, "w") as fp:
            json.dump(categorizations, fp, indent=2)
        print(f"Saved: {out}")
        return
    # Apply: for each file, categorize the file-level node AND all section nodes
    applied = skipped = errors = 0
    for filename, category in sorted(categorizations.items()):
        # Find all keys that belong to this file (file node + '#section' nodes).
        file_keys = [k for k in keys if k == filename or k.startswith(filename + '#')]
        for key in file_keys:
            try:
                r = subprocess.run(
                    ["poc-memory", "categorize", key, category],
                    capture_output=True, text=True, timeout=10
                )
                if r.returncode == 0:
                    applied += 1
                else:
                    # "already categorized" is expected on re-runs; only
                    # other failures count as errors.
                    if "already" in r.stderr.strip().lower():
                        skipped += 1
                    else:
                        errors += 1
            except Exception:
                errors += 1
    print(f"\nApplied: {applied} Skipped: {skipped} Errors: {errors}")
    print("Run `poc-memory status` to verify.")
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()