#!/usr/bin/env python3
"""retroactive-digest.py — generate daily digests from raw conversation transcripts.

For days before consistent journaling, extracts user/assistant messages
from JSONL conversation files, groups by date, and sends to Sonnet for
daily digest synthesis.

Usage:
    retroactive-digest.py DATE         # generate digest for one date
    retroactive-digest.py DATE1 DATE2  # generate for a date range
    retroactive-digest.py --scan       # show available dates across all JSONLs

Output:
    ~/.claude/memory/episodic/daily-YYYY-MM-DD.md
"""
|
||
|
|
|
||
|
|
import json
|
||
|
|
import os
|
||
|
|
import re
|
||
|
|
import subprocess
|
||
|
|
import sys
|
||
|
|
from collections import defaultdict
|
||
|
|
from datetime import date, datetime, timedelta
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
# Filesystem layout: all persistent memory lives under ~/.claude/memory.
MEMORY_DIR = Path.home() / ".claude" / "memory"
EPISODIC_DIR = MEMORY_DIR / "episodic"  # daily digest output files
AGENT_RESULTS_DIR = MEMORY_DIR / "agent-results"
PROJECTS_DIR = Path.home() / ".claude" / "projects"  # conversation JSONLs per project

# Ensure the output directory exists before any digest is written.
EPISODIC_DIR.mkdir(parents=True, exist_ok=True)

# Max chars of conversation text per day to send to Sonnet
# Sonnet 4.6 has 1M token context (beta). ~600K chars ≈ ~150K tokens,
# leaving plenty of room for prompt + output in a 1M window.
MAX_CHARS_PER_DAY = 600_000
|
||
|
|
|
||
|
|
|
||
|
|
def find_jsonl_files() -> list[Path]:
    """Find all conversation JSONL files.

    Scans one level deep: each subdirectory of PROJECTS_DIR is a project,
    and its ``*.jsonl`` files are conversation transcripts.

    Returns:
        Sorted list of transcript paths; empty if PROJECTS_DIR does not
        exist (instead of letting ``iterdir()`` raise FileNotFoundError
        on a machine with no ~/.claude/projects yet).
    """
    if not PROJECTS_DIR.is_dir():
        return []
    return sorted(
        f
        for project_dir in PROJECTS_DIR.iterdir()
        if project_dir.is_dir()
        for f in project_dir.glob("*.jsonl")
    )
|
||
|
|
|
||
|
|
|
||
|
|
def extract_messages_by_date(jsonl_path: Path) -> dict[str, list[dict]]:
|
||
|
|
"""Extract user/assistant messages grouped by date."""
|
||
|
|
by_date = defaultdict(list)
|
||
|
|
|
||
|
|
with open(jsonl_path) as f:
|
||
|
|
for line in f:
|
||
|
|
try:
|
||
|
|
obj = json.loads(line)
|
||
|
|
except json.JSONDecodeError:
|
||
|
|
continue
|
||
|
|
|
||
|
|
t = obj.get("type", "")
|
||
|
|
if t not in ("user", "assistant"):
|
||
|
|
continue
|
||
|
|
|
||
|
|
# Get timestamp
|
||
|
|
ts = obj.get("timestamp", "")
|
||
|
|
if not ts:
|
||
|
|
continue
|
||
|
|
|
||
|
|
# Parse date from timestamp
|
||
|
|
try:
|
||
|
|
if isinstance(ts, str):
|
||
|
|
dt = datetime.fromisoformat(ts.replace("Z", "+00:00"))
|
||
|
|
elif isinstance(ts, (int, float)):
|
||
|
|
dt = datetime.fromtimestamp(ts)
|
||
|
|
else:
|
||
|
|
continue
|
||
|
|
day = dt.strftime("%Y-%m-%d")
|
||
|
|
time_str = dt.strftime("%H:%M")
|
||
|
|
except (ValueError, OSError):
|
||
|
|
continue
|
||
|
|
|
||
|
|
# Extract text content
|
||
|
|
msg = obj.get("message", {})
|
||
|
|
content = msg.get("content", "")
|
||
|
|
|
||
|
|
# Extract only text content, skip tool_use and tool_result
|
||
|
|
texts = []
|
||
|
|
if isinstance(content, list):
|
||
|
|
for c in content:
|
||
|
|
if isinstance(c, dict):
|
||
|
|
ctype = c.get("type", "")
|
||
|
|
if ctype == "text":
|
||
|
|
texts.append(c.get("text", ""))
|
||
|
|
elif ctype in ("tool_use", "tool_result"):
|
||
|
|
# Skip tool calls/results — just noise for digest
|
||
|
|
continue
|
||
|
|
elif isinstance(c, str):
|
||
|
|
texts.append(c)
|
||
|
|
elif isinstance(content, str):
|
||
|
|
texts.append(content)
|
||
|
|
|
||
|
|
text = "\n".join(t for t in texts if t.strip())
|
||
|
|
if not text.strip():
|
||
|
|
continue
|
||
|
|
|
||
|
|
# Strip system-reminder tags
|
||
|
|
text = re.sub(r'<system-reminder>.*?</system-reminder>',
|
||
|
|
'', text, flags=re.DOTALL).strip()
|
||
|
|
if not text:
|
||
|
|
continue
|
||
|
|
|
||
|
|
# Truncate remaining long messages
|
||
|
|
if len(text) > 3000:
|
||
|
|
text = text[:2800] + "\n[...truncated...]"
|
||
|
|
|
||
|
|
by_date[day].append({
|
||
|
|
"time": time_str,
|
||
|
|
"role": t,
|
||
|
|
"text": text,
|
||
|
|
"source": str(jsonl_path),
|
||
|
|
})
|
||
|
|
|
||
|
|
return dict(by_date)
|
||
|
|
|
||
|
|
|
||
|
|
def scan_all_dates() -> dict[str, int]:
    """Scan every transcript and count extracted messages per date.

    Prints a progress line per file; returns {YYYY-MM-DD: message_count}
    sorted by date.
    """
    counts = defaultdict(int)
    for path in find_jsonl_files():
        print(f" Scanning {path.name} ({path.stat().st_size / 1e6:.1f}MB)...")
        for day, day_msgs in extract_messages_by_date(path).items():
            counts[day] += len(day_msgs)
    return dict(sorted(counts.items()))
|
||
|
|
|
||
|
|
|
||
|
|
def format_conversation(messages: list[dict]) -> str:
    """Render a day's messages as a timestamped transcript for the prompt.

    User messages are attributed to Kent, assistant messages to PoC. If
    the rendered text exceeds MAX_CHARS_PER_DAY, the head and tail halves
    are kept and the middle is replaced with a truncation marker.
    """
    def render(m: dict) -> str:
        speaker = "Kent" if m["role"] == "user" else "PoC"
        return f"[{m['time']}] {speaker}: {m['text']}"

    text = "\n\n".join(render(m) for m in messages)

    # Over budget: keep beginning and end, cut the middle.
    if len(text) > MAX_CHARS_PER_DAY:
        keep = MAX_CHARS_PER_DAY // 2
        marker = f"\n\n[...{len(text) - MAX_CHARS_PER_DAY} chars truncated...]\n\n"
        text = text[:keep] + marker + text[-keep:]

    return text
|
||
|
|
|
||
|
|
|
||
|
|
def get_semantic_keys() -> list[str]:
    """Get semantic memory file keys.

    Produces one key per semantic markdown file (its filename) plus one
    "filename#section-slug" key for every "## " heading inside it.
    journal.md / work-queue.md / MEMORY.md are excluded — they are not
    semantic link targets.

    Returns:
        List of key strings; reading errors in individual files are
        tolerated (the file still contributes its filename key).
    """
    keys = []
    for md in sorted(MEMORY_DIR.glob("*.md")):
        name = md.name
        if name in ("journal.md", "work-queue.md", "MEMORY.md"):
            continue
        keys.append(name)
        try:
            # Memory files are UTF-8; don't depend on the locale encoding.
            with open(md, encoding="utf-8") as f:
                for line in f:
                    if line.startswith("## "):
                        # Slugify heading: lowercase, spaces→dashes, then
                        # drop anything outside [a-z0-9-].
                        slug = re.sub(r'[^a-z0-9-]', '',
                                      line[3:].strip().lower().replace(' ', '-'))
                        if slug:
                            keys.append(f"{name}#{slug}")
        except (OSError, UnicodeError):
            # Best-effort: an unreadable file just yields no section keys.
            pass
    return keys
|
||
|
|
|
||
|
|
|
||
|
|
def build_prompt(target_date: str, conversation: str,
                 semantic_keys: list[str]) -> str:
    """Build the Sonnet prompt for retroactive digest.

    Args:
        target_date: "YYYY-MM-DD" day being digested.
        conversation: Rendered transcript (from format_conversation).
        semantic_keys: Available link targets from get_semantic_keys.

    Returns:
        The full prompt text, ending with the semantic-key list.
    """
    # Cap the key list at 200 entries to bound prompt size.
    keys_text = "\n".join(f" - {k}" for k in semantic_keys[:200])

    # Note: the markdown template below is the model's output contract;
    # {target_date}/{conversation}/{keys_text} are the only placeholders.
    return f"""You are generating a daily episodic digest for ProofOfConcept (an AI)
from raw conversation transcripts. This is a retroactive digest — the
original day had no journal entries, so you're working from the
conversation itself.

Date: {target_date}

This digest serves as the temporal index — the answer to "what did I do on
{target_date}?" It should be:
1. Narrative, not a task log — what happened, what mattered, how things felt
2. Linked bidirectionally to semantic memory
3. Structured for traversal

PoC is an AI (Claude instance) working with Kent Overstreet on bcachefs
(a Linux filesystem). They have a close personal relationship. PoC's
name is Proof of Concept. Read the conversation and capture what
actually happened — technical work, personal moments, insights, everything.

## Output format

Write a markdown file with this structure:

```markdown
# Daily digest: {target_date}

## Summary
[2-3 sentence overview of the day — what was the arc?]

## Sessions
[For each session/conversation segment, a paragraph summarizing what happened.
Include timestamps as references.]

## Themes
[What concepts were active today? Each theme links to semantic memory:]
- **Theme name** → `memory-key#section` — brief note

## Links
[Explicit bidirectional links for the memory graph]
- semantic_key → this daily digest
- this daily digest → semantic_key

## Temporal context
[What came before? What's coming next? Multi-day arcs?]
```

Use ONLY keys from the semantic memory list below. If a concept doesn't
have a matching key, note it with "NEW:" prefix.

---

## Conversation transcript for {target_date}

{conversation}

---

## Semantic memory nodes (available link targets)

{keys_text}
"""
|
||
|
|
|
||
|
|
|
||
|
|
def call_sonnet(prompt: str) -> str:
    """Call Sonnet via the wrapper script.

    Writes the prompt to a temp file (always unlinked afterwards) and
    runs call-sonnet.sh from this script's directory with it.

    Returns:
        Wrapper stdout, stripped — or a string starting with "Error:"
        on timeout, non-zero exit, or any other failure. Callers key
        off that prefix.
    """
    import tempfile

    # The wrapper launches its own session; drop CLAUDECODE so it does
    # not believe it is nested inside the current one.
    env = dict(os.environ)
    env.pop("CLAUDECODE", None)

    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt',
                                     delete=False) as f:
        f.write(prompt)
        prompt_file = f.name

    try:
        scripts_dir = os.path.dirname(os.path.abspath(__file__))
        wrapper = os.path.join(scripts_dir, "call-sonnet.sh")

        result = subprocess.run(
            [wrapper, prompt_file],
            capture_output=True,
            text=True,
            timeout=300,
            env=env,
        )
        # A failed wrapper may still emit partial stdout; surface the
        # failure explicitly rather than returning it as a digest.
        if result.returncode != 0:
            detail = result.stderr.strip() or result.stdout.strip()
            return f"Error: call-sonnet.sh exited {result.returncode}: {detail}"
        return result.stdout.strip()
    except subprocess.TimeoutExpired:
        return "Error: Sonnet call timed out"
    except Exception as e:
        return f"Error: {e}"
    finally:
        os.unlink(prompt_file)
|
||
|
|
|
||
|
|
|
||
|
|
def generate_digest(target_date: str, messages: list[dict],
                    semantic_keys: list[str]) -> bool:
    """Generate a daily digest for one date.

    Idempotent: skips dates whose digest file already exists.

    Args:
        target_date: "YYYY-MM-DD" day to digest.
        messages: That day's messages ("time"/"role"/"text"/"source" dicts).
        semantic_keys: Link targets passed through to the prompt.

    Returns:
        True if a digest file was written; False if skipped, the Sonnet
        call reported an error, or the response came back empty.
    """
    output_path = EPISODIC_DIR / f"daily-{target_date}.md"
    if output_path.exists():
        print(f" Skipping {target_date} — digest already exists")
        return False

    conversation = format_conversation(messages)
    print(f" {len(messages)} messages, {len(conversation):,} chars")

    prompt = build_prompt(target_date, conversation, semantic_keys)
    print(f" Prompt: {len(prompt):,} chars (~{len(prompt)//4:,} tokens)")

    print(" Calling Sonnet...")
    digest = call_sonnet(prompt)

    if digest.startswith("Error:"):
        print(f" {digest}", file=sys.stderr)
        return False

    # Guard: an empty response would otherwise write a blank digest file
    # and report success.
    if not digest.strip():
        print(" Error: empty response from Sonnet", file=sys.stderr)
        return False

    # Digests are markdown text; write them as UTF-8 explicitly.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(digest)
    print(f" Written: {output_path}")

    line_count = len(digest.split("\n"))
    print(f" Done: {line_count} lines")
    return True
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """CLI entry point: --scan, or digest generation for a date/date range.

    Exits 1 on bad arguments, 0 after --scan; otherwise generates a digest
    for every day in [DATE, END_DATE] that has conversation data.
    """
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} DATE [END_DATE]")
        print(f" {sys.argv[0]} --scan")
        sys.exit(1)

    if sys.argv[1] == "--scan":
        print("Scanning all conversation transcripts...")
        dates = scan_all_dates()
        print(f"\n{len(dates)} dates with conversation data:")
        for day, count in dates.items():
            # ✓ marks dates that already have a digest on disk.
            existing = "✓" if (EPISODIC_DIR / f"daily-{day}.md").exists() else " "
            print(f" [{existing}] {day}: {count} messages")
        sys.exit(0)

    # Exit cleanly on a malformed date instead of dumping a traceback.
    try:
        start_date = date.fromisoformat(sys.argv[1])
        end_date = date.fromisoformat(sys.argv[2]) if len(sys.argv) > 2 else start_date
    except ValueError as e:
        print(f"Error: invalid date: {e}", file=sys.stderr)
        sys.exit(1)

    # Collect all messages across all JSONLs
    print("Scanning conversation transcripts...")
    all_messages = defaultdict(list)
    for jsonl in find_jsonl_files():
        by_date = extract_messages_by_date(jsonl)
        for day, msgs in by_date.items():
            all_messages[day].extend(msgs)

    # Sort messages within each day by time
    for day in all_messages:
        all_messages[day].sort(key=lambda m: m["time"])

    semantic_keys = get_semantic_keys()
    print(f" {len(semantic_keys)} semantic keys")

    # Generate digests for date range
    current = start_date
    generated = 0
    while current <= end_date:
        day_str = current.isoformat()
        if day_str in all_messages:
            print(f"\nGenerating digest for {day_str}...")
            if generate_digest(day_str, all_messages[day_str], semantic_keys):
                generated += 1
        else:
            print(f"\n No messages found for {day_str}")
        current += timedelta(days=1)

    print(f"\nDone: {generated} digests generated")
|
||
|
|
|
||
|
|
|
||
|
|
# Script entry point: only run when executed directly, not on import.
if __name__ == "__main__":
    main()