consciousness/scripts/bulk-categorize.py

201 lines
6.5 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""bulk-categorize.py — categorize all memory nodes via a single Sonnet call.
Sends the list of unique file names to Sonnet, gets back categorizations,
then applies them via poc-memory categorize.
Usage:
bulk-categorize.py # dry run
bulk-categorize.py --apply # apply categorizations
"""
import json
import os
import re
import subprocess
import sys
import tempfile
from pathlib import Path
MEMORY_DIR = Path.home() / ".claude" / "memory"
SCRIPTS_DIR = Path(__file__).parent
def call_sonnet(prompt: str, timeout: int = 300) -> str:
    """Call Sonnet via the call-sonnet.sh wrapper script.

    Writes *prompt* to a temporary file, invokes the wrapper with that
    file as its argument, and returns the model's stdout (stripped).
    Every failure mode -- timeout, missing wrapper, non-zero exit -- is
    reported as a string beginning with "Error:" so callers can check
    with ``response.startswith("Error:")``.
    """
    env = dict(os.environ)
    # Drop CLAUDECODE so the wrapper doesn't think it is running nested
    # inside an existing Claude Code session.
    env.pop("CLAUDECODE", None)
    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt',
                                     delete=False) as f:
        f.write(prompt)
        prompt_file = f.name
    try:
        wrapper = str(SCRIPTS_DIR / "call-sonnet.sh")
        result = subprocess.run(
            [wrapper, prompt_file],
            capture_output=True,
            text=True,
            timeout=timeout,
            env=env,
        )
        # BUG FIX: the original never checked the exit status, so a
        # failed wrapper run silently returned (often empty) stdout.
        if result.returncode != 0:
            return f"Error: wrapper exited {result.returncode}: {result.stderr.strip()}"
        return result.stdout.strip()
    except subprocess.TimeoutExpired:
        return "Error: Sonnet call timed out"
    except Exception as e:
        return f"Error: {e}"
    finally:
        # Always remove the temp file, even on error paths.
        os.unlink(prompt_file)
def get_all_keys() -> list[str]:
    """Return all node keys via `poc-memory list-keys`.

    Returns an empty list on any failure: non-zero exit, the command
    not being installed, or a timeout.  (The original let
    FileNotFoundError / TimeoutExpired propagate as tracebacks, which
    contradicts the "return [] on failure" contract of the
    returncode check.)
    """
    try:
        r = subprocess.run(["poc-memory", "list-keys"],
                           capture_output=True, text=True, timeout=30)
    except (OSError, subprocess.SubprocessError):
        return []
    if r.returncode != 0:
        return []
    # Drop empty lines (e.g. trailing newline) from the key list.
    return [k for k in r.stdout.strip().split('\n') if k]
def get_unique_files(keys: list[str]) -> list[str]:
    """Return the sorted unique file names, with '#section' anchors stripped."""
    return sorted({key.split('#')[0] for key in keys})
def build_prompt(files: list[str]) -> str:
    """Build the categorization prompt sent to Sonnet.

    Each file name is accompanied by a short preview pulled from
    `poc-memory render`, so the model sees some content rather than
    bare file names.  Rendering is best-effort: failures become a
    "(render failed)" marker instead of aborting the run.
    """

    def preview_line(name: str) -> str:
        # Best effort -- any subprocess failure (missing command,
        # timeout, etc.) must not abort prompt construction.
        try:
            proc = subprocess.run(
                ["poc-memory", "render", name],
                capture_output=True, text=True, timeout=10
            )
        except Exception:
            return f"  {name}: (render failed)"
        text = proc.stdout.strip()
        if not text:
            return f"  {name}: (no content)"
        # First 5 lines, capped at 300 chars, flattened onto one line.
        head = '\n'.join(text.split('\n')[:5])[:300]
        return f"  {name}: {head.replace(chr(10), ' | ')}"

    previews_text = '\n'.join(preview_line(name) for name in files)
    return f"""Categorize each memory file into one of these categories:
- **core**: Identity, relationships, self-model, values, boundaries, emotional life.
Examples: identity.md, kent.md, inner-life.md, differentiation.md
- **tech**: Technical content bcachefs, code patterns, Rust, kernel, formal verification.
Examples: rust-conversion.md, btree-journal.md, kernel-patterns.md, allocation-io.md
- **obs**: Observations, experiences, discoveries, experiments, IRC history, conversations.
Examples: discoveries.md, irc-history.md, contradictions.md, experiments-on-self.md
- **task**: Work items, plans, design documents, work queue.
Examples: work-queue.md, the-plan.md, design-*.md
Special rules:
- Episodic files (daily-*.md, weekly-*.md, monthly-*.md, session-*.md) obs
- conversation-memories.md, deep-index.md obs
- journal.md obs
- paper-notes.md core (it's the sentience paper, identity-defining)
- language-theory.md core (original intellectual work, not just tech)
- skill-*.md core (self-knowledge about capabilities)
- design-*.md task (design documents are plans)
- poc-architecture.md, memory-architecture.md task (architecture plans)
- blog-setup.md task
Files to categorize:
{previews_text}
Output ONLY a JSON object mapping filename to category. No explanation.
Example: {{"identity.md": "core", "rust-conversion.md": "tech"}}
"""
def main():
    """Categorize every memory file in one Sonnet call.

    Dry run by default (prints the plan and saves a preview JSON);
    pass --apply to write categorizations back via `poc-memory
    categorize`.  Exits non-zero if there is nothing to categorize,
    the model call fails, or its response cannot be parsed.
    """
    do_apply = "--apply" in sys.argv
    keys = get_all_keys()
    files = get_unique_files(keys)
    print(f"Found {len(keys)} nodes across {len(files)} files")
    if not files:
        # BUG FIX: don't burn a model call on an empty file list
        # (e.g. poc-memory unavailable or an empty store).
        print("No files found; nothing to do.")
        sys.exit(1)
    # Build and send prompt
    prompt = build_prompt(files)
    print(f"Prompt: {len(prompt):,} chars")
    print("Calling Sonnet...")
    response = call_sonnet(prompt)
    if response.startswith("Error:"):
        print(f" {response}")
        sys.exit(1)
    # Strip an optional markdown code fence.  BUG FIX: also accept a
    # bare ``` opener, not only ```json.
    response = re.sub(r'^```(?:json)?\s*', '', response.strip())
    response = re.sub(r'\s*```$', '', response.strip())
    try:
        categorizations = json.loads(response)
    except json.JSONDecodeError:
        # Fall back to the first {...} span in case the model wrapped
        # the JSON in prose.  BUG FIX: the fallback parse itself could
        # raise an uncaught JSONDecodeError in the original.
        categorizations = None
        match = re.search(r'\{.*\}', response, re.DOTALL)
        if match:
            try:
                categorizations = json.loads(match.group())
            except json.JSONDecodeError:
                categorizations = None
        if categorizations is None:
            print(f"Failed to parse response: {response[:500]}")
            sys.exit(1)
    print(f"\nCategorizations: {len(categorizations)} files")
    # Count by category
    counts = {}
    for cat in categorizations.values():
        counts[cat] = counts.get(cat, 0) + 1
    for cat, n in sorted(counts.items()):
        print(f" {cat}: {n}")
    if not do_apply:
        print("\n--- Dry run ---")
        for f, cat in sorted(categorizations.items()):
            # BUG FIX: the original printed filename and category with
            # no separator between them.
            print(f" {f} -> {cat}")
        print(f"\nTo apply: {sys.argv[0]} --apply")
        # Save for review
        out = MEMORY_DIR / "agent-results" / "bulk-categorize-preview.json"
        # BUG FIX: the directory may not exist yet; open() would raise.
        out.parent.mkdir(parents=True, exist_ok=True)
        with open(out, "w") as fp:
            json.dump(categorizations, fp, indent=2)
        print(f"Saved: {out}")
        return
    # Apply: for each file, categorize the file-level node AND all section nodes
    applied = skipped = errors = 0
    for filename, category in sorted(categorizations.items()):
        # Find all keys that belong to this file (file node + '#section' nodes).
        file_keys = [k for k in keys if k == filename or k.startswith(filename + '#')]
        for key in file_keys:
            try:
                r = subprocess.run(
                    ["poc-memory", "categorize", key, category],
                    capture_output=True, text=True, timeout=10
                )
                if r.returncode == 0:
                    applied += 1
                else:
                    # "already categorized" is expected on re-runs; only
                    # other failures count as errors.
                    if "already" in r.stderr.strip().lower():
                        skipped += 1
                    else:
                        errors += 1
            except Exception:
                errors += 1
    print(f"\nApplied: {applied} Skipped: {skipped} Errors: {errors}")
    print("Run `poc-memory status` to verify.")
# Script entry point: run only when executed directly, not when imported.
if __name__ == "__main__":
    main()