Faster serialization/deserialization, smaller on disk (4.2MB vs 5.9MB). Automatic migration from state.json on first load — reads the JSON, writes state.bin, deletes the old file. Added list-keys, list-edges, dump-json commands so Python scripts no longer need to parse the cache directly. Updated bulk-categorize.py and consolidation-loop.py to use the new CLI commands.
198 lines · 6.4 KiB · Python
#!/usr/bin/env python3
|
|
"""bulk-categorize.py — categorize all memory nodes via a single Sonnet call.
|
|
|
|
Sends the list of unique file names to Sonnet, gets back categorizations,
|
|
then applies them via poc-memory categorize.
|
|
|
|
Usage:
|
|
bulk-categorize.py # dry run
|
|
bulk-categorize.py --apply # apply categorizations
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
# Root of the memory store; node keys name markdown files under this directory.
MEMORY_DIR = Path.home() / ".claude" / "memory"
# Directory containing this script and the call-sonnet.sh wrapper it invokes.
SCRIPTS_DIR = Path(__file__).parent
|
|
|
|
|
|
def call_sonnet(prompt: str, timeout: int = 300) -> str:
    """Call Sonnet via the call-sonnet.sh wrapper script.

    The prompt is written to a temporary file and passed to the wrapper
    as its single argument. Returns the wrapper's stdout (stripped).
    Every failure mode — timeout, missing wrapper, non-zero exit — is
    reported as a string starting with "Error:" so callers can detect
    it with ``response.startswith("Error:")``.
    """
    env = dict(os.environ)
    # Make sure the wrapper does not think it is nested inside a
    # Claude Code session.
    env.pop("CLAUDECODE", None)

    # delete=False so the file survives the `with`; removed in `finally`.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt',
                                     delete=False) as f:
        f.write(prompt)
        prompt_file = f.name

    try:
        wrapper = str(SCRIPTS_DIR / "call-sonnet.sh")
        result = subprocess.run(
            [wrapper, prompt_file],
            capture_output=True,
            text=True,
            timeout=timeout,
            env=env,
        )
        # Fix: a failing wrapper previously returned its (often empty)
        # stdout, which callers could not tell apart from a real answer.
        if result.returncode != 0:
            return f"Error: wrapper exited {result.returncode}: {result.stderr.strip()}"
        return result.stdout.strip()
    except subprocess.TimeoutExpired:
        return "Error: Sonnet call timed out"
    except Exception as e:
        return f"Error: {e}"
    finally:
        os.unlink(prompt_file)
|
|
|
|
|
|
def get_all_keys() -> list[str]:
    """Return every node key reported by `poc-memory list-keys`.

    Yields an empty list when the command exits non-zero; blank lines
    in the output are dropped.
    """
    proc = subprocess.run(
        ["poc-memory", "list-keys"],
        capture_output=True,
        text=True,
        timeout=30,
    )
    if proc.returncode != 0:
        return []
    lines = proc.stdout.strip().split('\n')
    return [line for line in lines if line]
|
|
|
|
|
|
def get_unique_files(keys: list[str]) -> list[str]:
    """Return the sorted set of distinct file names from node keys.

    A key may carry a '#section' anchor (e.g. "identity.md#values");
    only the part before the first '#' identifies the file.
    """
    return sorted({key.split('#')[0] for key in keys})
|
|
|
|
|
|
def build_prompt(files: list[str]) -> str:
    """Build the categorization prompt sent to Sonnet.

    Each file contributes one preview line: the first 5 lines of its
    content, capped at 300 chars, with newlines flattened to ' | '.
    Files are looked up directly under MEMORY_DIR, falling back to
    MEMORY_DIR/episodic; missing files are listed as "(file not found)".
    """
    previews: list[str] = []
    for name in files:
        path = MEMORY_DIR / name
        if not path.exists():
            # Episodic files live in a subdirectory.
            path = MEMORY_DIR / "episodic" / name
        if path.exists():
            text = path.read_text()
            # First 5 lines, then hard cap at 300 characters.
            snippet = '\n'.join(text.split('\n')[:5])[:300]
            previews.append(f"  {name}: {snippet.replace(chr(10), ' | ')}")
        else:
            previews.append(f"  {name}: (file not found)")

    previews_text = '\n'.join(previews)

    return f"""Categorize each memory file into one of these categories:

- **core**: Identity, relationships, self-model, values, boundaries, emotional life.
  Examples: identity.md, kent.md, inner-life.md, differentiation.md
- **tech**: Technical content — bcachefs, code patterns, Rust, kernel, formal verification.
  Examples: rust-conversion.md, btree-journal.md, kernel-patterns.md, allocation-io.md
- **obs**: Observations, experiences, discoveries, experiments, IRC history, conversations.
  Examples: discoveries.md, irc-history.md, contradictions.md, experiments-on-self.md
- **task**: Work items, plans, design documents, work queue.
  Examples: work-queue.md, the-plan.md, design-*.md

Special rules:
- Episodic files (daily-*.md, weekly-*.md, monthly-*.md, session-*.md) → obs
- conversation-memories.md, deep-index.md → obs
- journal.md → obs
- paper-notes.md → core (it's the sentience paper, identity-defining)
- language-theory.md → core (original intellectual work, not just tech)
- skill-*.md → core (self-knowledge about capabilities)
- design-*.md → task (design documents are plans)
- poc-architecture.md, memory-architecture.md → task (architecture plans)
- blog-setup.md → task

Files to categorize:
{previews_text}

Output ONLY a JSON object mapping filename to category. No explanation.
Example: {{"identity.md": "core", "rust-conversion.md": "tech"}}
"""
|
|
|
|
|
|
def _parse_categorizations(response: str) -> dict[str, str]:
    """Parse Sonnet's JSON reply into a {filename: category} dict.

    Tolerates a markdown ```json code fence and, failing a direct parse,
    falls back to the first {...} span in the text. Exits the process
    with status 1 if no JSON object can be recovered.
    """
    # Strip a leading ```json fence and a trailing ``` fence if present.
    response = re.sub(r'^```json\s*', '', response.strip())
    response = re.sub(r'\s*```$', '', response.strip())

    try:
        return json.loads(response)
    except json.JSONDecodeError:
        # The model may have wrapped the JSON in prose; grab the
        # outermost brace-delimited span.
        match = re.search(r'\{.*\}', response, re.DOTALL)
        if match:
            return json.loads(match.group())
        print(f"Failed to parse response: {response[:500]}")
        sys.exit(1)


def _apply_categorizations(categorizations: dict[str, str],
                           keys: list[str]) -> tuple[int, int, int]:
    """Apply each file's category to its file-level node and every
    '#section' node via `poc-memory categorize`.

    Returns (applied, skipped, errors). A failure whose stderr contains
    "already" counts as skipped (node was categorized previously).
    """
    applied = skipped = errors = 0
    for filename, category in sorted(categorizations.items()):
        # All keys belonging to this file: the bare name plus anchors.
        file_keys = [k for k in keys
                     if k == filename or k.startswith(filename + '#')]
        for key in file_keys:
            try:
                r = subprocess.run(
                    ["poc-memory", "categorize", key, category],
                    capture_output=True, text=True, timeout=10,
                )
            except Exception:
                # Best effort: one bad key must not abort the batch.
                errors += 1
                continue
            if r.returncode == 0:
                applied += 1
            elif "already" in r.stderr.strip().lower():
                skipped += 1
            else:
                errors += 1
    return applied, skipped, errors


def main():
    """Categorize all memory nodes with a single Sonnet call.

    Dry run by default (prints the plan and saves a preview JSON);
    pass --apply to write categories back via `poc-memory categorize`.
    """
    do_apply = "--apply" in sys.argv

    keys = get_all_keys()
    files = get_unique_files(keys)
    print(f"Found {len(keys)} nodes across {len(files)} files")

    # Build and send prompt.
    prompt = build_prompt(files)
    print(f"Prompt: {len(prompt):,} chars")
    print("Calling Sonnet...")

    response = call_sonnet(prompt)
    if response.startswith("Error:"):
        print(f"  {response}")
        sys.exit(1)

    categorizations = _parse_categorizations(response)
    print(f"\nCategorizations: {len(categorizations)} files")

    # Per-category tally.
    counts: dict[str, int] = {}
    for cat in categorizations.values():
        counts[cat] = counts.get(cat, 0) + 1
    for cat, n in sorted(counts.items()):
        print(f"  {cat}: {n}")

    if not do_apply:
        print("\n--- Dry run ---")
        for f, cat in sorted(categorizations.items()):
            print(f"  {f} → {cat}")
        print(f"\nTo apply: {sys.argv[0]} --apply")

        # Save for review.
        out = MEMORY_DIR / "agent-results" / "bulk-categorize-preview.json"
        with open(out, "w") as fp:
            json.dump(categorizations, fp, indent=2)
        print(f"Saved: {out}")
        return

    applied, skipped, errors = _apply_categorizations(categorizations, keys)
    print(f"\nApplied: {applied} Skipped: {skipped} Errors: {errors}")
    print("Run `poc-memory status` to verify.")
|
|
|
|
|
|
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
|