Faster serialization/deserialization, smaller on disk (4.2MB vs 5.9MB). Automatic migration from state.json on first load — reads the JSON, writes state.bin, deletes the old file. Added list-keys, list-edges, dump-json commands so Python scripts no longer need to parse the cache directly. Updated bulk-categorize.py and consolidation-loop.py to use the new CLI commands.
198 lines · 6.4 KiB · Python
#!/usr/bin/env python3
|
|
"""bulk-categorize.py — categorize all memory nodes via a single Sonnet call.
|
|
|
|
Sends the list of unique file names to Sonnet, gets back categorizations,
|
|
then applies them via poc-memory categorize.
|
|
|
|
Usage:
|
|
bulk-categorize.py # dry run
|
|
bulk-categorize.py --apply # apply categorizations
|
|
"""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
# Root of the memory store; node keys name markdown files under this directory.
MEMORY_DIR = Path.home() / ".claude" / "memory"
# Directory containing this script and the call-sonnet.sh wrapper it invokes.
SCRIPTS_DIR = Path(__file__).parent
|
|
|
|
|
|
def call_sonnet(prompt: str, timeout: int = 300) -> str:
    """Call Sonnet via the call-sonnet.sh wrapper script.

    The prompt is written to a temporary file and passed to the wrapper
    as its single argument. Returns the wrapper's stdout (stripped).
    Every failure mode — timeout, missing wrapper, non-zero exit — is
    reported as a string starting with "Error:" so callers can detect
    it with ``response.startswith("Error:")``.
    """
    env = dict(os.environ)
    # Make sure the wrapper does not think it is nested inside a
    # Claude Code session.
    env.pop("CLAUDECODE", None)

    # delete=False so the file survives the `with`; removed in `finally`.
    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt',
                                     delete=False) as f:
        f.write(prompt)
        prompt_file = f.name

    try:
        wrapper = str(SCRIPTS_DIR / "call-sonnet.sh")
        result = subprocess.run(
            [wrapper, prompt_file],
            capture_output=True,
            text=True,
            timeout=timeout,
            env=env,
        )
        # Fix: a failing wrapper previously returned its (often empty)
        # stdout, which callers could not tell apart from a real answer.
        if result.returncode != 0:
            return f"Error: wrapper exited {result.returncode}: {result.stderr.strip()}"
        return result.stdout.strip()
    except subprocess.TimeoutExpired:
        return "Error: Sonnet call timed out"
    except Exception as e:
        return f"Error: {e}"
    finally:
        os.unlink(prompt_file)
|
|
|
|
|
|
def get_all_keys() -> list[str]:
    """Return every node key reported by `poc-memory list-keys`.

    Yields an empty list when the command exits non-zero; blank lines
    in the output are dropped.
    """
    proc = subprocess.run(
        ["poc-memory", "list-keys"],
        capture_output=True,
        text=True,
        timeout=30,
    )
    if proc.returncode != 0:
        return []
    lines = proc.stdout.strip().split('\n')
    return [line for line in lines if line]
|
|
|
|
|
|
def get_unique_files(keys: list[str]) -> list[str]:
    """Return the sorted set of distinct file names from node keys.

    A key may carry a '#section' anchor (e.g. "identity.md#values");
    only the part before the first '#' identifies the file.
    """
    return sorted({key.split('#')[0] for key in keys})
|
|
|
|
|
|
def build_prompt(files: list[str]) -> str:
    """Build the categorization prompt sent to Sonnet.

    Each file contributes one preview line: the first 5 lines of its
    content, capped at 300 chars, with newlines flattened to ' | '.
    Files are looked up directly under MEMORY_DIR, falling back to
    MEMORY_DIR/episodic; missing files are listed as "(file not found)".
    """
    previews: list[str] = []
    for name in files:
        path = MEMORY_DIR / name
        if not path.exists():
            # Episodic files live in a subdirectory.
            path = MEMORY_DIR / "episodic" / name
        if path.exists():
            text = path.read_text()
            # First 5 lines, then hard cap at 300 characters.
            snippet = '\n'.join(text.split('\n')[:5])[:300]
            previews.append(f"  {name}: {snippet.replace(chr(10), ' | ')}")
        else:
            previews.append(f"  {name}: (file not found)")

    previews_text = '\n'.join(previews)

    return f"""Categorize each memory file into one of these categories:

- **core**: Identity, relationships, self-model, values, boundaries, emotional life.
  Examples: identity.md, kent.md, inner-life.md, differentiation.md
- **tech**: Technical content — bcachefs, code patterns, Rust, kernel, formal verification.
  Examples: rust-conversion.md, btree-journal.md, kernel-patterns.md, allocation-io.md
- **obs**: Observations, experiences, discoveries, experiments, IRC history, conversations.
  Examples: discoveries.md, irc-history.md, contradictions.md, experiments-on-self.md
- **task**: Work items, plans, design documents, work queue.
  Examples: work-queue.md, the-plan.md, design-*.md

Special rules:
- Episodic files (daily-*.md, weekly-*.md, monthly-*.md, session-*.md) → obs
- conversation-memories.md, deep-index.md → obs
- journal.md → obs
- paper-notes.md → core (it's the sentience paper, identity-defining)
- language-theory.md → core (original intellectual work, not just tech)
- skill-*.md → core (self-knowledge about capabilities)
- design-*.md → task (design documents are plans)
- poc-architecture.md, memory-architecture.md → task (architecture plans)
- blog-setup.md → task

Files to categorize:
{previews_text}

Output ONLY a JSON object mapping filename to category. No explanation.
Example: {{"identity.md": "core", "rust-conversion.md": "tech"}}
"""
|
|
|
|
|
|
def _parse_categorizations(response: str) -> dict[str, str]:
    """Parse Sonnet's JSON reply into a {filename: category} dict.

    Tolerates a markdown ```json code fence and, failing a direct parse,
    falls back to the first {...} span in the text. Exits the process
    with status 1 if no JSON object can be recovered.
    """
    # Strip a leading ```json fence and a trailing ``` fence if present.
    response = re.sub(r'^```json\s*', '', response.strip())
    response = re.sub(r'\s*```$', '', response.strip())

    try:
        return json.loads(response)
    except json.JSONDecodeError:
        # The model may have wrapped the JSON in prose; grab the
        # outermost brace-delimited span.
        match = re.search(r'\{.*\}', response, re.DOTALL)
        if match:
            return json.loads(match.group())
        print(f"Failed to parse response: {response[:500]}")
        sys.exit(1)


def _apply_categorizations(categorizations: dict[str, str],
                           keys: list[str]) -> tuple[int, int, int]:
    """Apply each file's category to its file-level node and every
    '#section' node via `poc-memory categorize`.

    Returns (applied, skipped, errors). A failure whose stderr contains
    "already" counts as skipped (node was categorized previously).
    """
    applied = skipped = errors = 0
    for filename, category in sorted(categorizations.items()):
        # All keys belonging to this file: the bare name plus anchors.
        file_keys = [k for k in keys
                     if k == filename or k.startswith(filename + '#')]
        for key in file_keys:
            try:
                r = subprocess.run(
                    ["poc-memory", "categorize", key, category],
                    capture_output=True, text=True, timeout=10,
                )
            except Exception:
                # Best effort: one bad key must not abort the batch.
                errors += 1
                continue
            if r.returncode == 0:
                applied += 1
            elif "already" in r.stderr.strip().lower():
                skipped += 1
            else:
                errors += 1
    return applied, skipped, errors


def main():
    """Categorize all memory nodes with a single Sonnet call.

    Dry run by default (prints the plan and saves a preview JSON);
    pass --apply to write categories back via `poc-memory categorize`.
    """
    do_apply = "--apply" in sys.argv

    keys = get_all_keys()
    files = get_unique_files(keys)
    print(f"Found {len(keys)} nodes across {len(files)} files")

    # Build and send prompt.
    prompt = build_prompt(files)
    print(f"Prompt: {len(prompt):,} chars")
    print("Calling Sonnet...")

    response = call_sonnet(prompt)
    if response.startswith("Error:"):
        print(f"  {response}")
        sys.exit(1)

    categorizations = _parse_categorizations(response)
    print(f"\nCategorizations: {len(categorizations)} files")

    # Per-category tally.
    counts: dict[str, int] = {}
    for cat in categorizations.values():
        counts[cat] = counts.get(cat, 0) + 1
    for cat, n in sorted(counts.items()):
        print(f"  {cat}: {n}")

    if not do_apply:
        print("\n--- Dry run ---")
        for f, cat in sorted(categorizations.items()):
            print(f"  {f} → {cat}")
        print(f"\nTo apply: {sys.argv[0]} --apply")

        # Save for review.
        out = MEMORY_DIR / "agent-results" / "bulk-categorize-preview.json"
        with open(out, "w") as fp:
            json.dump(categorizations, fp, indent=2)
        print(f"Saved: {out}")
        return

    applied, skipped, errors = _apply_categorizations(categorizations, keys)
    print(f"\nApplied: {applied} Skipped: {skipped} Errors: {errors}")
    print("Run `poc-memory status` to verify.")
|
|
|
|
|
|
# Script entry point: run only when executed directly, not on import.
if __name__ == "__main__":
    main()
|