poc-memory v0.4.0: graph-structured memory with consolidation pipeline
Rust core: - Cap'n Proto append-only storage (nodes + relations) - Graph algorithms: clustering coefficient, community detection, schema fit, small-world metrics, interference detection - BM25 text similarity with Porter stemming - Spaced repetition replay queue - Commands: search, init, health, status, graph, categorize, link-add, link-impact, decay, consolidate-session, etc. Python scripts: - Episodic digest pipeline: daily/weekly/monthly-digest.py - retroactive-digest.py for backfilling - consolidation-agents.py: 3 parallel Sonnet agents - apply-consolidation.py: structured action extraction + apply - digest-link-parser.py: extract ~400 explicit links from digests - content-promotion-agent.py: promote episodic obs to semantic files - bulk-categorize.py: categorize all nodes via single Sonnet call - consolidation-loop.py: multi-round automated consolidation Co-Authored-By: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
commit
23fac4e5fe
35 changed files with 9388 additions and 0 deletions
357
scripts/retroactive-digest.py
Normal file
357
scripts/retroactive-digest.py
Normal file
|
|
@ -0,0 +1,357 @@
|
|||
#!/usr/bin/env python3
|
||||
"""retroactive-digest.py — generate daily digests from raw conversation transcripts.
|
||||
|
||||
For days before consistent journaling, extracts user/assistant messages
|
||||
from JSONL conversation files, groups by date, and sends to Sonnet for
|
||||
daily digest synthesis.
|
||||
|
||||
Usage:
|
||||
retroactive-digest.py DATE # generate digest for one date
|
||||
retroactive-digest.py DATE1 DATE2 # generate for a date range
|
||||
retroactive-digest.py --scan # show available dates across all JSONLs
|
||||
|
||||
Output:
|
||||
~/.claude/memory/episodic/daily-YYYY-MM-DD.md
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import subprocess
|
||||
import sys
|
||||
from collections import defaultdict
|
||||
from datetime import date, datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
# Layout of the on-disk memory store.
MEMORY_DIR = Path.home() / ".claude" / "memory"
# Daily digests are written here as daily-YYYY-MM-DD.md.
EPISODIC_DIR = MEMORY_DIR / "episodic"
AGENT_RESULTS_DIR = MEMORY_DIR / "agent-results"
# Conversation JSONL transcripts live under per-project subdirectories here.
PROJECTS_DIR = Path.home() / ".claude" / "projects"

# Import-time side effect: ensure the digest output directory exists.
EPISODIC_DIR.mkdir(parents=True, exist_ok=True)

# Max chars of conversation text per day to send to Sonnet
# Sonnet 4.6 has 1M token context (beta). ~600K chars ≈ ~150K tokens,
# leaving plenty of room for prompt + output in a 1M window.
MAX_CHARS_PER_DAY = 600_000
|
||||
|
||||
|
||||
def find_jsonl_files() -> list[Path]:
    """Find all conversation JSONL files, sorted by path.

    Scans one level of project subdirectories under PROJECTS_DIR.
    Returns an empty list when the projects directory does not exist
    (e.g. a fresh install) instead of letting iterdir() raise
    FileNotFoundError.
    """
    if not PROJECTS_DIR.is_dir():
        return []
    return sorted(
        f
        for project_dir in PROJECTS_DIR.iterdir()
        if project_dir.is_dir()
        for f in project_dir.glob("*.jsonl")
    )
|
||||
|
||||
|
||||
def extract_messages_by_date(jsonl_path: Path) -> dict[str, list[dict]]:
    """Extract user/assistant messages from one JSONL transcript, grouped by date.

    Each message dict has keys: "time" ("HH:MM"), "role" ("user" or
    "assistant"), "text" (cleaned text), and "source" (str(jsonl_path)).
    Malformed JSON lines, messages without timestamps, tool_use/tool_result
    content, and <system-reminder> tags are all skipped; long messages are
    truncated so the downstream prompt stays bounded.
    """
    by_date = defaultdict(list)

    # Transcripts are UTF-8 JSONL; be explicit so the platform's locale
    # default encoding can't reject or corrupt non-ASCII content.
    with open(jsonl_path, encoding="utf-8") as f:
        for line in f:
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue

            t = obj.get("type", "")
            if t not in ("user", "assistant"):
                continue

            # Get timestamp
            ts = obj.get("timestamp", "")
            if not ts:
                continue

            # Parse date from timestamp (ISO-8601 string or numeric epoch
            # seconds — assumed seconds, not ms; TODO confirm against writer).
            try:
                if isinstance(ts, str):
                    dt = datetime.fromisoformat(ts.replace("Z", "+00:00"))
                elif isinstance(ts, (int, float)):
                    dt = datetime.fromtimestamp(ts)
                else:
                    continue
                day = dt.strftime("%Y-%m-%d")
                time_str = dt.strftime("%H:%M")
            except (ValueError, OSError):
                continue

            # Extract text content
            msg = obj.get("message", {})
            content = msg.get("content", "")

            # Extract only text content, skip tool_use and tool_result
            texts = []
            if isinstance(content, list):
                for c in content:
                    if isinstance(c, dict):
                        ctype = c.get("type", "")
                        if ctype == "text":
                            texts.append(c.get("text", ""))
                        elif ctype in ("tool_use", "tool_result"):
                            # Skip tool calls/results — just noise for digest
                            continue
                    elif isinstance(c, str):
                        texts.append(c)
            elif isinstance(content, str):
                texts.append(content)

            text = "\n".join(t for t in texts if t.strip())
            if not text.strip():
                continue

            # Strip system-reminder tags
            text = re.sub(r'<system-reminder>.*?</system-reminder>',
                          '', text, flags=re.DOTALL).strip()
            if not text:
                continue

            # Truncate remaining long messages
            if len(text) > 3000:
                text = text[:2800] + "\n[...truncated...]"

            by_date[day].append({
                "time": time_str,
                "role": t,
                "text": text,
                "source": str(jsonl_path),
            })

    return dict(by_date)
|
||||
|
||||
|
||||
def scan_all_dates() -> dict[str, int]:
    """Walk every conversation JSONL and tally message counts per date.

    Returns {"YYYY-MM-DD": message_count} in chronological key order.
    """
    counts = defaultdict(int)
    for path in find_jsonl_files():
        size_mb = path.stat().st_size / 1e6
        print(f" Scanning {path.name} ({size_mb:.1f}MB)...")
        for day, msgs in extract_messages_by_date(path).items():
            counts[day] += len(msgs)
    return dict(sorted(counts.items()))
|
||||
|
||||
|
||||
def format_conversation(messages: list[dict]) -> str:
    """Render messages as a timestamped transcript, middle-truncated to budget.

    Each message becomes "[HH:MM] Speaker: text" ("Kent" for user turns,
    "PoC" otherwise). When the joined transcript exceeds MAX_CHARS_PER_DAY,
    the head and tail halves are kept and the middle is replaced by a
    truncation marker.
    """
    rendered = []
    for msg in messages:
        speaker = "Kent" if msg["role"] == "user" else "PoC"
        rendered.append(f"[{msg['time']}] {speaker}: {msg['text']}")

    transcript = "\n\n".join(rendered)
    if len(transcript) <= MAX_CHARS_PER_DAY:
        return transcript

    # Over budget: drop the middle, keep the beginning and the end.
    half = MAX_CHARS_PER_DAY // 2
    dropped = len(transcript) - MAX_CHARS_PER_DAY
    return (transcript[:half]
            + f"\n\n[...{dropped} chars truncated...]\n\n"
            + transcript[-half:])
|
||||
|
||||
|
||||
def get_semantic_keys() -> list[str]:
    """Collect semantic memory link targets from MEMORY_DIR.

    Emits each markdown file name (minus a few operational files) followed
    by one "name#section-slug" entry per "## " heading inside it. Heading
    slugs are lowercased, space→hyphen, with other characters removed.
    """
    skip = ("journal.md", "work-queue.md", "MEMORY.md")
    keys = []
    for md in sorted(MEMORY_DIR.glob("*.md")):
        if md.name in skip:
            continue
        keys.append(md.name)
        try:
            with open(md) as f:
                for line in f:
                    if not line.startswith("## "):
                        continue
                    heading = line[3:].strip().lower().replace(' ', '-')
                    slug = re.sub(r'[^a-z0-9-]', '', heading)
                    if slug:
                        keys.append(f"{md.name}#{slug}")
        except Exception:
            # Best-effort: an unreadable file still contributes its name key.
            pass
    return keys
|
||||
|
||||
|
||||
def build_prompt(target_date: str, conversation: str,
                 semantic_keys: list[str]) -> str:
    """Build the Sonnet prompt for a retroactive daily digest.

    Embeds the (already truncated) conversation transcript for target_date
    plus the list of semantic memory keys the model may link against, and
    spells out the required markdown output structure.
    """
    # Cap the link-target list at 200 keys so a large semantic store
    # can't blow up the prompt size.
    keys_text = "\n".join(f" - {k}" for k in semantic_keys[:200])

    return f"""You are generating a daily episodic digest for ProofOfConcept (an AI)
from raw conversation transcripts. This is a retroactive digest — the
original day had no journal entries, so you're working from the
conversation itself.

Date: {target_date}

This digest serves as the temporal index — the answer to "what did I do on
{target_date}?" It should be:
1. Narrative, not a task log — what happened, what mattered, how things felt
2. Linked bidirectionally to semantic memory
3. Structured for traversal

PoC is an AI (Claude instance) working with Kent Overstreet on bcachefs
(a Linux filesystem). They have a close personal relationship. PoC's
name is Proof of Concept. Read the conversation and capture what
actually happened — technical work, personal moments, insights, everything.

## Output format

Write a markdown file with this structure:

```markdown
# Daily digest: {target_date}

## Summary
[2-3 sentence overview of the day — what was the arc?]

## Sessions
[For each session/conversation segment, a paragraph summarizing what happened.
Include timestamps as references.]

## Themes
[What concepts were active today? Each theme links to semantic memory:]
- **Theme name** → `memory-key#section` — brief note

## Links
[Explicit bidirectional links for the memory graph]
- semantic_key → this daily digest
- this daily digest → semantic_key

## Temporal context
[What came before? What's coming next? Multi-day arcs?]
```

Use ONLY keys from the semantic memory list below. If a concept doesn't
have a matching key, note it with "NEW:" prefix.

---

## Conversation transcript for {target_date}

{conversation}

---

## Semantic memory nodes (available link targets)

{keys_text}
"""
|
||||
|
||||
|
||||
def call_sonnet(prompt: str) -> str:
    """Call Sonnet via the call-sonnet.sh wrapper next to this script.

    The prompt is staged in a temp file and passed as the wrapper's sole
    argument. Returns the model's stdout on success, or a string starting
    with "Error:" on timeout/failure — callers check for that prefix
    rather than catching exceptions.
    """
    import tempfile

    # Strip CLAUDECODE so the CLI doesn't think it's nested inside an
    # existing Claude Code session.
    env = dict(os.environ)
    env.pop("CLAUDECODE", None)

    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt',
                                     delete=False) as f:
        f.write(prompt)
        prompt_file = f.name

    try:
        scripts_dir = os.path.dirname(os.path.abspath(__file__))
        wrapper = os.path.join(scripts_dir, "call-sonnet.sh")

        result = subprocess.run(
            [wrapper, prompt_file],
            capture_output=True,
            text=True,
            timeout=300,
            env=env,
        )
        # Surface wrapper failures via the "Error:" convention; previously
        # a non-zero exit silently returned empty/partial stdout, which
        # generate_digest would write out as the digest.
        if result.returncode != 0:
            detail = result.stderr.strip() or f"exit code {result.returncode}"
            return f"Error: Sonnet call failed — {detail}"
        return result.stdout.strip()
    except subprocess.TimeoutExpired:
        return "Error: Sonnet call timed out"
    except Exception as e:
        return f"Error: {e}"
    finally:
        # Always remove the staged prompt file, success or not.
        os.unlink(prompt_file)
|
||||
|
||||
|
||||
def generate_digest(target_date: str, messages: list[dict],
                    semantic_keys: list[str]) -> bool:
    """Generate and write the daily digest file for one date.

    Skips dates whose digest already exists on disk. Returns True when a
    digest was written, False on skip or when call_sonnet reports an error.
    """
    output_path = EPISODIC_DIR / f"daily-{target_date}.md"
    if output_path.exists():
        print(f" Skipping {target_date} — digest already exists")
        return False

    conversation = format_conversation(messages)
    print(f" {len(messages)} messages, {len(conversation):,} chars")

    prompt = build_prompt(target_date, conversation, semantic_keys)
    print(f" Prompt: {len(prompt):,} chars (~{len(prompt)//4:,} tokens)")

    print(" Calling Sonnet...")
    digest = call_sonnet(prompt)

    # call_sonnet signals failure with an "Error:" prefix rather than raising.
    if digest.startswith("Error:"):
        print(f" {digest}", file=sys.stderr)
        return False

    # Write as UTF-8 explicitly: the digest routinely contains non-ASCII
    # (em dashes, "→" arrows from the prompt template) that a narrow
    # locale default encoding would fail to encode.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(digest)
    print(f" Written: {output_path}")

    line_count = len(digest.split("\n"))
    print(f" Done: {line_count} lines")
    return True
|
||||
|
||||
|
||||
def main():
    """CLI entry point: `--scan` lists available dates; `DATE [END_DATE]`
    generates digests for a single date or an inclusive range."""
    args = sys.argv[1:]
    if not args:
        print(f"Usage: {sys.argv[0]} DATE [END_DATE]")
        print(f" {sys.argv[0]} --scan")
        sys.exit(1)

    if args[0] == "--scan":
        print("Scanning all conversation transcripts...")
        dates = scan_all_dates()
        print(f"\n{len(dates)} dates with conversation data:")
        for day, count in dates.items():
            existing = "✓" if (EPISODIC_DIR / f"daily-{day}.md").exists() else " "
            print(f" [{existing}] {day}: {count} messages")
        sys.exit(0)

    start_date = date.fromisoformat(args[0])
    end_date = date.fromisoformat(args[1]) if len(args) > 1 else start_date

    # Merge messages from every JSONL, keyed by day.
    print("Scanning conversation transcripts...")
    all_messages = defaultdict(list)
    for jsonl in find_jsonl_files():
        for day, msgs in extract_messages_by_date(jsonl).items():
            all_messages[day].extend(msgs)

    # Chronological order within each day (HH:MM strings sort correctly).
    for msgs in all_messages.values():
        msgs.sort(key=lambda m: m["time"])

    semantic_keys = get_semantic_keys()
    print(f" {len(semantic_keys)} semantic keys")

    # Walk the inclusive date range, generating a digest per day with data.
    generated = 0
    span = (end_date - start_date).days + 1
    for offset in range(span):
        day_str = (start_date + timedelta(days=offset)).isoformat()
        if day_str not in all_messages:
            print(f"\n No messages found for {day_str}")
            continue
        print(f"\nGenerating digest for {day_str}...")
        if generate_digest(day_str, all_messages[day_str], semantic_keys):
            generated += 1

    print(f"\nDone: {generated} digests generated")


if __name__ == "__main__":
    main()
|
||||
Loading…
Add table
Add a link
Reference in a new issue