#!/usr/bin/env python3
"""bulk-categorize.py — categorize all memory nodes via a single Sonnet call.

Sends the list of unique file names to Sonnet, gets back categorizations,
then applies them via poc-memory categorize.

Usage:
    bulk-categorize.py          # dry run
    bulk-categorize.py --apply  # apply categorizations
"""
import json
import os
import re
import subprocess
import sys
import tempfile
from pathlib import Path

MEMORY_DIR = Path.home() / ".claude" / "memory"
SCRIPTS_DIR = Path(__file__).parent


def call_sonnet(prompt: str, timeout: int = 300) -> str:
    """Call Sonnet via the wrapper script.

    The prompt is written to a temp file and passed to call-sonnet.sh.
    Returns Sonnet's stdout (stripped), or an "Error: ..." string on
    timeout/failure — callers check for the "Error:" prefix.
    """
    env = dict(os.environ)
    # Drop CLAUDECODE so the wrapper doesn't think it's running nested
    # inside a Claude Code session.
    env.pop("CLAUDECODE", None)

    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as f:
        f.write(prompt)
        prompt_file = f.name

    try:
        wrapper = str(SCRIPTS_DIR / "call-sonnet.sh")
        result = subprocess.run(
            [wrapper, prompt_file],
            capture_output=True, text=True, timeout=timeout, env=env,
        )
        # NOTE(review): returncode is not checked — a failing wrapper with
        # empty stdout yields "". Kept as-is to preserve existing behavior.
        return result.stdout.strip()
    except subprocess.TimeoutExpired:
        return "Error: Sonnet call timed out"
    except Exception as e:
        return f"Error: {e}"
    finally:
        # Always remove the temp prompt file (delete=False above).
        os.unlink(prompt_file)


def get_all_keys() -> list[str]:
    """Get all node keys via poc-memory list-keys.

    Returns an empty list if the command fails.
    """
    r = subprocess.run(["poc-memory", "list-keys"],
                       capture_output=True, text=True, timeout=30)
    if r.returncode != 0:
        return []
    return [k for k in r.stdout.strip().split('\n') if k]


def get_unique_files(keys: list[str]) -> list[str]:
    """Extract sorted unique file names (section anchors after '#' dropped)."""
    files = set()
    for k in keys:
        files.add(k.split('#')[0])
    return sorted(files)


def build_prompt(files: list[str]) -> str:
    """Build the categorization prompt for Sonnet.

    Each file gets a short preview (first 5 lines, capped at 300 chars,
    newlines flattened) so Sonnet has context beyond the file name.
    Files are looked up in MEMORY_DIR, falling back to MEMORY_DIR/episodic.
    """
    file_previews = []
    for f in files:
        path = MEMORY_DIR / f
        if not path.exists():
            # Episodic files (daily-*, session-*, ...) live in a subdirectory.
            path = MEMORY_DIR / "episodic" / f
        if path.exists():
            content = path.read_text()
            # First 5 lines or 300 chars, whichever is shorter.
            preview = '\n'.join(content.split('\n')[:5])[:300]
            file_previews.append(f"  {f}: {preview.replace(chr(10), ' | ')}")
        else:
            file_previews.append(f"  {f}: (file not found)")

    previews_text = '\n'.join(file_previews)

    return f"""Categorize each memory file into one of these categories:

- **core**: Identity, relationships, self-model, values, boundaries, emotional life. Examples: identity.md, kent.md, inner-life.md, differentiation.md
- **tech**: Technical content — bcachefs, code patterns, Rust, kernel, formal verification. Examples: rust-conversion.md, btree-journal.md, kernel-patterns.md, allocation-io.md
- **obs**: Observations, experiences, discoveries, experiments, IRC history, conversations. Examples: discoveries.md, irc-history.md, contradictions.md, experiments-on-self.md
- **task**: Work items, plans, design documents, work queue. Examples: work-queue.md, the-plan.md, design-*.md

Special rules:
- Episodic files (daily-*.md, weekly-*.md, monthly-*.md, session-*.md) → obs
- conversation-memories.md, deep-index.md → obs
- journal.md → obs
- paper-notes.md → core (it's the sentience paper, identity-defining)
- language-theory.md → core (original intellectual work, not just tech)
- skill-*.md → core (self-knowledge about capabilities)
- design-*.md → task (design documents are plans)
- poc-architecture.md, memory-architecture.md → task (architecture plans)
- blog-setup.md → task

Files to categorize:
{previews_text}

Output ONLY a JSON object mapping filename to category. No explanation.
Example: {{"identity.md": "core", "rust-conversion.md": "tech"}}
"""


def main():
    do_apply = "--apply" in sys.argv

    keys = get_all_keys()
    files = get_unique_files(keys)
    print(f"Found {len(keys)} nodes across {len(files)} files")

    # Build and send prompt
    prompt = build_prompt(files)
    print(f"Prompt: {len(prompt):,} chars")
    print("Calling Sonnet...")
    response = call_sonnet(prompt)
    if response.startswith("Error:"):
        print(f"  {response}")
        sys.exit(1)

    # Strip an optional markdown code fence around the JSON.
    # FIX: accept a bare ``` opening fence as well as ```json.
    response = re.sub(r'^```(?:json)?\s*', '', response.strip())
    response = re.sub(r'\s*```$', '', response.strip())
    try:
        categorizations = json.loads(response)
    except json.JSONDecodeError:
        # Fall back to extracting the outermost {...} from a chatty reply.
        match = re.search(r'\{.*\}', response, re.DOTALL)
        categorizations = None
        if match:
            try:
                categorizations = json.loads(match.group())
            except json.JSONDecodeError:
                pass  # handled below via the shared error path
        if categorizations is None:
            print(f"Failed to parse response: {response[:500]}")
            sys.exit(1)

    print(f"\nCategorizations: {len(categorizations)} files")
    # Count by category
    counts = {}
    for cat in categorizations.values():
        counts[cat] = counts.get(cat, 0) + 1
    for cat, n in sorted(counts.items()):
        print(f"  {cat}: {n}")

    if not do_apply:
        print("\n--- Dry run ---")
        for f, cat in sorted(categorizations.items()):
            print(f"  {f} → {cat}")
        print(f"\nTo apply: {sys.argv[0]} --apply")
        # Save for review.
        # FIX: create agent-results/ if missing — first run would otherwise
        # crash with FileNotFoundError.
        out = MEMORY_DIR / "agent-results" / "bulk-categorize-preview.json"
        out.parent.mkdir(parents=True, exist_ok=True)
        with open(out, "w") as fp:
            json.dump(categorizations, fp, indent=2)
        print(f"Saved: {out}")
        return

    # Apply: for each file, categorize the file-level node AND all section nodes
    applied = skipped = errors = 0
    for filename, category in sorted(categorizations.items()):
        # Find all keys that belong to this file (exact name or name#section).
        file_keys = [k for k in keys
                     if k == filename or k.startswith(filename + '#')]
        for key in file_keys:
            try:
                r = subprocess.run(
                    ["poc-memory", "categorize", key, category],
                    capture_output=True, text=True, timeout=10,
                )
                if r.returncode == 0:
                    applied += 1
                else:
                    err = r.stderr.strip()
                    # "already categorized" is expected on re-runs, not an error.
                    if "already" in err.lower():
                        skipped += 1
                    else:
                        errors += 1
            except Exception:
                errors += 1

    print(f"\nApplied: {applied}  Skipped: {skipped}  Errors: {errors}")
    print("Run `poc-memory status` to verify.")


if __name__ == "__main__":
    main()