poc-memory v0.4.0: graph-structured memory with consolidation pipeline

Rust core:
- Cap'n Proto append-only storage (nodes + relations)
- Graph algorithms: clustering coefficient, community detection,
  schema fit, small-world metrics, interference detection
- BM25 text similarity with Porter stemming
- Spaced repetition replay queue
- Commands: search, init, health, status, graph, categorize,
  link-add, link-impact, decay, consolidate-session, etc.

Python scripts:
- Episodic digest pipeline: daily/weekly/monthly-digest.py
- retroactive-digest.py for backfilling
- consolidation-agents.py: 3 parallel Sonnet agents
- apply-consolidation.py: structured action extraction + apply
- digest-link-parser.py: extract ~400 explicit links from digests
- content-promotion-agent.py: promote episodic obs to semantic files
- bulk-categorize.py: categorize all nodes via single Sonnet call
- consolidation-loop.py: multi-round automated consolidation

Co-Authored-By: Kent Overstreet <kent.overstreet@linux.dev>
This commit is contained in:
ProofOfConcept 2026-02-28 22:17:00 -05:00
commit 23fac4e5fe
35 changed files with 9388 additions and 0 deletions

View file

@@ -0,0 +1,357 @@
#!/usr/bin/env python3
"""retroactive-digest.py — generate daily digests from raw conversation transcripts.
For days before consistent journaling, extracts user/assistant messages
from JSONL conversation files, groups by date, and sends to Sonnet for
daily digest synthesis.
Usage:
retroactive-digest.py DATE # generate digest for one date
retroactive-digest.py DATE1 DATE2 # generate for a date range
retroactive-digest.py --scan # show available dates across all JSONLs
Output:
~/.claude/memory/episodic/daily-YYYY-MM-DD.md
"""
import json
import os
import re
import subprocess
import sys
from collections import defaultdict
from datetime import date, datetime, timedelta
from pathlib import Path
MEMORY_DIR = Path.home() / ".claude" / "memory"
EPISODIC_DIR = MEMORY_DIR / "episodic"  # output: daily-YYYY-MM-DD.md digests
AGENT_RESULTS_DIR = MEMORY_DIR / "agent-results"  # NOTE(review): unused in this script — presumably shared with sibling scripts; confirm
PROJECTS_DIR = Path.home() / ".claude" / "projects"  # input: conversation JSONLs in per-project subdirs
# Side effect at import time: ensure the digest output directory exists.
EPISODIC_DIR.mkdir(parents=True, exist_ok=True)
# Max chars of conversation text per day to send to Sonnet
# Sonnet 4.6 has 1M token context (beta). ~600K chars ≈ ~150K tokens,
# leaving plenty of room for prompt + output in a 1M window.
MAX_CHARS_PER_DAY = 600_000
def find_jsonl_files() -> list[Path]:
    """Find all conversation JSONL files under PROJECTS_DIR.

    Looks one level deep: every ``*.jsonl`` inside each project
    subdirectory. Returns a sorted list so scan order is deterministic.
    If the projects directory does not exist, returns an empty list
    instead of raising from ``iterdir()``.
    """
    if not PROJECTS_DIR.is_dir():
        return []
    return sorted(
        f
        for project_dir in PROJECTS_DIR.iterdir()
        if project_dir.is_dir()
        for f in project_dir.glob("*.jsonl")
    )
def extract_messages_by_date(jsonl_path: Path) -> dict[str, list[dict]]:
"""Extract user/assistant messages grouped by date."""
by_date = defaultdict(list)
with open(jsonl_path) as f:
for line in f:
try:
obj = json.loads(line)
except json.JSONDecodeError:
continue
t = obj.get("type", "")
if t not in ("user", "assistant"):
continue
# Get timestamp
ts = obj.get("timestamp", "")
if not ts:
continue
# Parse date from timestamp
try:
if isinstance(ts, str):
dt = datetime.fromisoformat(ts.replace("Z", "+00:00"))
elif isinstance(ts, (int, float)):
dt = datetime.fromtimestamp(ts)
else:
continue
day = dt.strftime("%Y-%m-%d")
time_str = dt.strftime("%H:%M")
except (ValueError, OSError):
continue
# Extract text content
msg = obj.get("message", {})
content = msg.get("content", "")
# Extract only text content, skip tool_use and tool_result
texts = []
if isinstance(content, list):
for c in content:
if isinstance(c, dict):
ctype = c.get("type", "")
if ctype == "text":
texts.append(c.get("text", ""))
elif ctype in ("tool_use", "tool_result"):
# Skip tool calls/results — just noise for digest
continue
elif isinstance(c, str):
texts.append(c)
elif isinstance(content, str):
texts.append(content)
text = "\n".join(t for t in texts if t.strip())
if not text.strip():
continue
# Strip system-reminder tags
text = re.sub(r'<system-reminder>.*?</system-reminder>',
'', text, flags=re.DOTALL).strip()
if not text:
continue
# Truncate remaining long messages
if len(text) > 3000:
text = text[:2800] + "\n[...truncated...]"
by_date[day].append({
"time": time_str,
"role": t,
"text": text,
"source": str(jsonl_path),
})
return dict(by_date)
def scan_all_dates() -> dict[str, int]:
    """Walk every transcript JSONL and tally message counts per date.

    Prints a progress line per file; returns a date-sorted mapping of
    "YYYY-MM-DD" -> total message count across all files.
    """
    totals: dict[str, int] = defaultdict(int)
    for path in find_jsonl_files():
        size_mb = path.stat().st_size / 1e6
        print(f" Scanning {path.name} ({size_mb:.1f}MB)...")
        for day, day_msgs in extract_messages_by_date(path).items():
            totals[day] += len(day_msgs)
    return dict(sorted(totals.items()))
def format_conversation(messages: list[dict]) -> str:
    """Render one day's messages as a single transcript string.

    Each message becomes "[HH:MM] Name: text" — user messages are
    attributed to Kent, assistant messages to PoC. If the joined text
    exceeds MAX_CHARS_PER_DAY, the middle is elided, keeping the first
    and last halves intact.
    """
    rendered = [
        f"[{m['time']}] {'Kent' if m['role'] == 'user' else 'PoC'}: {m['text']}"
        for m in messages
    ]
    text = "\n\n".join(rendered)
    if len(text) > MAX_CHARS_PER_DAY:
        # Keep beginning and end, cut middle
        half = MAX_CHARS_PER_DAY // 2
        dropped = len(text) - MAX_CHARS_PER_DAY
        text = (text[:half]
                + f"\n\n[...{dropped} chars truncated...]\n\n"
                + text[-half:])
    return text
def get_semantic_keys() -> list[str]:
    """List available semantic-memory link targets.

    Returns top-level ``*.md`` filenames from MEMORY_DIR (excluding
    journal.md, work-queue.md, MEMORY.md), plus a "file.md#slug" key for
    every "## " section header inside them. Files that can't be read
    still contribute their filename key (best-effort).
    """
    keys = []
    for md in sorted(MEMORY_DIR.glob("*.md")):
        name = md.name
        if name in ("journal.md", "work-queue.md", "MEMORY.md"):
            continue
        keys.append(name)
        try:
            # Explicit encoding: memory files are UTF-8 regardless of locale.
            with open(md, encoding="utf-8") as f:
                for line in f:
                    if line.startswith("## "):
                        # Slugify: lowercase, spaces -> hyphens, drop other chars.
                        slug = re.sub(r'[^a-z0-9-]', '',
                                      line[3:].strip().lower().replace(' ', '-'))
                        if slug:
                            keys.append(f"{name}#{slug}")
        except (OSError, UnicodeDecodeError):
            # Narrowed from a bare `except Exception: pass` — only I/O and
            # decode failures are expected/ignorable here.
            pass
    return keys
def build_prompt(target_date: str, conversation: str,
                 semantic_keys: list[str]) -> str:
    """Assemble the Sonnet prompt for a retroactive daily digest.

    Only the first 200 semantic keys are listed, to bound prompt size.
    (Em dashes and link arrows in the template were restored — the prior
    text had them dropped, e.g. "retroactive digest the original day".)
    """
    keys_text = "\n".join(f" - {k}" for k in semantic_keys[:200])
    return f"""You are generating a daily episodic digest for ProofOfConcept (an AI)
from raw conversation transcripts. This is a retroactive digest — the
original day had no journal entries, so you're working from the
conversation itself.
Date: {target_date}
This digest serves as the temporal index — the answer to "what did I do on
{target_date}?" It should be:
1. Narrative, not a task log — what happened, what mattered, how things felt
2. Linked bidirectionally to semantic memory
3. Structured for traversal
PoC is an AI (Claude instance) working with Kent Overstreet on bcachefs
(a Linux filesystem). They have a close personal relationship. PoC's
name is Proof of Concept. Read the conversation and capture what
actually happened — technical work, personal moments, insights, everything.
## Output format
Write a markdown file with this structure:
```markdown
# Daily digest: {target_date}
## Summary
[2-3 sentence overview of the day — what was the arc?]
## Sessions
[For each session/conversation segment, a paragraph summarizing what happened.
Include timestamps as references.]
## Themes
[What concepts were active today? Each theme links to semantic memory:]
- **Theme name** `memory-key#section` — brief note
## Links
[Explicit bidirectional links for the memory graph]
- semantic_key → this daily digest
- this daily digest → semantic_key
## Temporal context
[What came before? What's coming next? Multi-day arcs?]
```
Use ONLY keys from the semantic memory list below. If a concept doesn't
have a matching key, note it with "NEW:" prefix.
---
## Conversation transcript for {target_date}
{conversation}
---
## Semantic memory nodes (available link targets)
{keys_text}
"""
def call_sonnet(prompt: str) -> str:
    """Call Sonnet via the sibling call-sonnet.sh wrapper script.

    Writes the prompt to a temp file, runs the wrapper with a 300s
    timeout, and returns its stdout. All failures (timeout, non-zero
    exit, missing wrapper) are reported as a string starting with
    "Error:" rather than raised — callers detect failure by that prefix.
    """
    import tempfile
    env = dict(os.environ)
    # NOTE(review): presumably unset so the wrapper's CLI doesn't treat
    # this as a nested Claude Code session — confirm against wrapper.
    env.pop("CLAUDECODE", None)
    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt',
                                     encoding='utf-8',
                                     delete=False) as f:
        f.write(prompt)
        prompt_file = f.name
    try:
        scripts_dir = os.path.dirname(os.path.abspath(__file__))
        wrapper = os.path.join(scripts_dir, "call-sonnet.sh")
        result = subprocess.run(
            [wrapper, prompt_file],
            capture_output=True,
            text=True,
            timeout=300,
            env=env,
        )
        if result.returncode != 0:
            # Surface the wrapper's failure instead of silently returning
            # whatever partial stdout it produced.
            return (f"Error: wrapper exited {result.returncode}: "
                    f"{result.stderr.strip()}")
        return result.stdout.strip()
    except subprocess.TimeoutExpired:
        return "Error: Sonnet call timed out"
    except Exception as e:
        return f"Error: {e}"
    finally:
        os.unlink(prompt_file)
def generate_digest(target_date: str, messages: list[dict],
                    semantic_keys: list[str]) -> bool:
    """Build, run, and persist the daily digest for one date.

    Skips dates whose digest file already exists. Returns True only when
    a new digest was successfully written; False on skip or Sonnet error.
    """
    out_file = EPISODIC_DIR / f"daily-{target_date}.md"
    if out_file.exists():
        print(f" Skipping {target_date} — digest already exists")
        return False

    transcript = format_conversation(messages)
    print(f" {len(messages)} messages, {len(transcript):,} chars")

    prompt = build_prompt(target_date, transcript, semantic_keys)
    print(f" Prompt: {len(prompt):,} chars (~{len(prompt)//4:,} tokens)")
    print(f" Calling Sonnet...")

    digest = call_sonnet(prompt)
    if digest.startswith("Error:"):
        print(f" {digest}", file=sys.stderr)
        return False

    with open(out_file, "w") as fh:
        fh.write(digest)
    print(f" Written: {out_file}")

    line_count = digest.count("\n") + 1
    print(f" Done: {line_count} lines")
    return True
def main():
    """CLI entry point.

    Modes:
      --scan            list all dates with conversation data
      DATE [END_DATE]   generate digests for a date (or inclusive range)

    Exits 1 on bad usage or unparseable dates.
    """
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} DATE [END_DATE]")
        print(f" {sys.argv[0]} --scan")
        sys.exit(1)

    if sys.argv[1] == "--scan":
        print("Scanning all conversation transcripts...")
        dates = scan_all_dates()
        print(f"\n{len(dates)} dates with conversation data:")
        for day, count in dates.items():
            # "✓" marks dates whose digest already exists (the prior code
            # used "" vs " ", printing misaligned "[]" / "[ ]").
            marker = "✓" if (EPISODIC_DIR / f"daily-{day}.md").exists() else " "
            print(f" [{marker}] {day}: {count} messages")
        sys.exit(0)

    # Parse the date range up front; report bad input instead of a traceback.
    try:
        start_date = date.fromisoformat(sys.argv[1])
        end_date = date.fromisoformat(sys.argv[2]) if len(sys.argv) > 2 else start_date
    except ValueError as e:
        print(f"Error: invalid date: {e}", file=sys.stderr)
        sys.exit(1)

    # Collect all messages across all JSONLs, merged per day.
    print("Scanning conversation transcripts...")
    all_messages = defaultdict(list)
    for jsonl in find_jsonl_files():
        for day, msgs in extract_messages_by_date(jsonl).items():
            all_messages[day].extend(msgs)

    # Within a day, order messages chronologically (files may interleave).
    for day in all_messages:
        all_messages[day].sort(key=lambda m: m["time"])

    semantic_keys = get_semantic_keys()
    print(f" {len(semantic_keys)} semantic keys")

    # Generate digests across the inclusive date range.
    current = start_date
    generated = 0
    while current <= end_date:
        day_str = current.isoformat()
        if day_str in all_messages:
            print(f"\nGenerating digest for {day_str}...")
            if generate_digest(day_str, all_messages[day_str], semantic_keys):
                generated += 1
        else:
            print(f"\n No messages found for {day_str}")
        current += timedelta(days=1)

    print(f"\nDone: {generated} digests generated")


if __name__ == "__main__":
    main()