consciousness/scripts/retroactive-digest.py

343 lines
11 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""retroactive-digest.py — generate daily digests from raw conversation transcripts.
For days before consistent journaling, extracts user/assistant messages
from JSONL conversation files, groups by date, and sends to Sonnet for
daily digest synthesis.
Usage:
retroactive-digest.py DATE # generate digest for one date
retroactive-digest.py DATE1 DATE2 # generate for a date range
retroactive-digest.py --scan # show available dates across all JSONLs
Output:
~/.claude/memory/episodic/daily-YYYY-MM-DD.md
"""
import json
import os
import re
import subprocess
import sys
from collections import defaultdict
from datetime import date, datetime, timedelta
from pathlib import Path
MEMORY_DIR = Path.home() / ".claude" / "memory"
EPISODIC_DIR = MEMORY_DIR / "episodic"
AGENT_RESULTS_DIR = MEMORY_DIR / "agent-results"
PROJECTS_DIR = Path.home() / ".claude" / "projects"
EPISODIC_DIR.mkdir(parents=True, exist_ok=True)
# Max chars of conversation text per day to send to Sonnet
# Sonnet 4.6 has 1M token context (beta). ~600K chars ≈ ~150K tokens,
# leaving plenty of room for prompt + output in a 1M window.
MAX_CHARS_PER_DAY = 600_000
def find_jsonl_files() -> list[Path]:
    """Return all conversation JSONL files under PROJECTS_DIR, sorted by path.

    Each project directory holds one JSONL file per conversation; only the
    top level of each project directory is searched (no recursion).
    """
    # Guard against a missing projects dir (fresh install) instead of
    # letting iterdir() raise FileNotFoundError.
    if not PROJECTS_DIR.is_dir():
        return []
    return sorted(
        f
        for project_dir in PROJECTS_DIR.iterdir()
        if project_dir.is_dir()
        for f in project_dir.glob("*.jsonl")
    )
def extract_messages_by_date(jsonl_path: Path) -> dict[str, list[dict]]:
    """Extract user/assistant text messages from one transcript, grouped by date.

    Args:
        jsonl_path: a conversation transcript, one JSON object per line.

    Returns:
        {"YYYY-MM-DD": [{"time", "role", "text", "source"}, ...]} with
        messages in file order. Lines that are not valid JSON, have no
        timestamp, or carry no text content are skipped. Tool calls/results
        and <system-reminder> tags are stripped; texts over 3000 chars are
        truncated.
    """
    # Hoist the tag-stripping pattern out of the per-line loop.
    reminder_re = re.compile(r'<system-reminder>.*?</system-reminder>', re.DOTALL)
    by_date: dict[str, list[dict]] = defaultdict(list)
    # Explicit encoding: transcripts are UTF-8 regardless of platform default.
    with open(jsonl_path, encoding="utf-8") as f:
        for line in f:
            try:
                obj = json.loads(line)
            except json.JSONDecodeError:
                continue
            t = obj.get("type", "")
            if t not in ("user", "assistant"):
                continue
            ts = obj.get("timestamp", "")
            if not ts:
                continue
            # Parse date/time from ISO string or epoch number.
            try:
                if isinstance(ts, str):
                    dt = datetime.fromisoformat(ts.replace("Z", "+00:00"))
                elif isinstance(ts, (int, float)):
                    # NOTE(review): epoch timestamps render in local time,
                    # ISO ones in their own offset — confirm this mix is
                    # intended.
                    dt = datetime.fromtimestamp(ts)
                else:
                    continue
                day = dt.strftime("%Y-%m-%d")
                time_str = dt.strftime("%H:%M")
            except (ValueError, OSError):
                continue
            # Collect only text content; tool_use/tool_result are noise here.
            msg = obj.get("message", {})
            content = msg.get("content", "")
            texts = []
            if isinstance(content, list):
                for c in content:
                    if isinstance(c, dict):
                        if c.get("type", "") == "text":
                            texts.append(c.get("text", ""))
                        # tool_use / tool_result (and anything else) skipped
                    elif isinstance(c, str):
                        texts.append(c)
            elif isinstance(content, str):
                texts.append(content)
            text = "\n".join(x for x in texts if x.strip())
            if not text.strip():
                continue
            # Strip system-reminder tags injected into the transcript.
            text = reminder_re.sub('', text).strip()
            if not text:
                continue
            # Truncate remaining long messages.
            if len(text) > 3000:
                text = text[:2800] + "\n[...truncated...]"
            by_date[day].append({
                "time": time_str,
                "role": t,
                "text": text,
                "source": str(jsonl_path),
            })
    return dict(by_date)
def scan_all_dates() -> dict[str, int]:
    """Scan every transcript and tally how many messages each date has."""
    counts: dict[str, int] = defaultdict(int)
    for path in find_jsonl_files():
        size_mb = path.stat().st_size / 1e6
        print(f" Scanning {path.name} ({size_mb:.1f}MB)...")
        for day, msgs in extract_messages_by_date(path).items():
            counts[day] += len(msgs)
    # Return a plain dict ordered by date.
    return dict(sorted(counts.items()))
def format_conversation(messages: list[dict]) -> str:
    """Render messages as a single transcript string for the digest prompt.

    Each message becomes "[HH:MM] Speaker: text"; if the joined result
    exceeds MAX_CHARS_PER_DAY the middle is cut, keeping head and tail.
    """
    def render(m: dict) -> str:
        speaker = "Kent" if m["role"] == "user" else "PoC"
        return f"[{m['time']}] {speaker}: {m['text']}"

    text = "\n\n".join(render(m) for m in messages)
    if len(text) <= MAX_CHARS_PER_DAY:
        return text
    # Over budget: keep the beginning and end, drop the middle.
    half = MAX_CHARS_PER_DAY // 2
    return (text[:half]
            + f"\n\n[...{len(text) - MAX_CHARS_PER_DAY} chars truncated...]\n\n"
            + text[-half:])
def get_semantic_keys() -> list[str]:
    """Get semantic memory keys from the store."""
    # Local import keeps store_helpers off the module-load path.
    import store_helpers
    return store_helpers.get_semantic_keys()
def build_prompt(target_date: str, conversation: str,
                 semantic_keys: list[str]) -> str:
    """Build the Sonnet prompt for a retroactive daily digest.

    Args:
        target_date: ISO date (YYYY-MM-DD) the digest covers.
        conversation: pre-formatted transcript text for that date.
        semantic_keys: available semantic-memory link targets (capped at 200).

    Returns:
        The complete prompt string to send to the model.
    """
    # Cap the key list so the prompt size stays bounded.
    keys_text = "\n".join(f" - {k}" for k in semantic_keys[:200])
    # NOTE(review): em dashes / link arrows below were garbled in the prior
    # revision ("a retroactive digest the original day had no journal
    # entries") and have been restored — confirm wording if output drifts.
    return f"""You are generating a daily episodic digest for ProofOfConcept (an AI)
from raw conversation transcripts. This is a retroactive digest — the
original day had no journal entries, so you're working from the
conversation itself.

Date: {target_date}

This digest serves as the temporal index — the answer to "what did I do on
{target_date}?" It should be:

1. Narrative, not a task log — what happened, what mattered, how things felt
2. Linked bidirectionally to semantic memory
3. Structured for traversal

PoC is an AI (Claude instance) working with Kent Overstreet on bcachefs
(a Linux filesystem). They have a close personal relationship. PoC's
name is Proof of Concept. Read the conversation and capture what
actually happened — technical work, personal moments, insights, everything.

## Output format

Write a markdown file with this structure:

```markdown
# Daily digest: {target_date}

## Summary
[2-3 sentence overview of the day — what was the arc?]

## Sessions
[For each session/conversation segment, a paragraph summarizing what happened.
Include timestamps as references.]

## Themes
[What concepts were active today? Each theme links to semantic memory:]
- **Theme name** `memory-key#section` — brief note

## Links
[Explicit bidirectional links for the memory graph]
- semantic_key → this daily digest
- this daily digest → semantic_key

## Temporal context
[What came before? What's coming next? Multi-day arcs?]
```

Use ONLY keys from the semantic memory list below. If a concept doesn't
have a matching key, note it with "NEW:" prefix.

---

## Conversation transcript for {target_date}

{conversation}

---

## Semantic memory nodes (available link targets)

{keys_text}
"""
def call_sonnet(prompt: str) -> str:
    """Call Sonnet via the call-sonnet.sh wrapper script.

    Writes the prompt to a temp file, runs the wrapper with a 5-minute
    timeout, and returns its stdout. Every failure mode is reported as a
    string beginning with "Error:" — callers check that prefix.
    """
    import tempfile
    # Remove CLAUDECODE from the child env (presumably so the wrapper
    # doesn't detect a nested session — confirm against call-sonnet.sh).
    env = dict(os.environ)
    env.pop("CLAUDECODE", None)
    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt',
                                     encoding="utf-8",
                                     delete=False) as f:
        f.write(prompt)
        prompt_file = f.name
    try:
        scripts_dir = os.path.dirname(os.path.abspath(__file__))
        wrapper = os.path.join(scripts_dir, "call-sonnet.sh")
        result = subprocess.run(
            [wrapper, prompt_file],
            capture_output=True,
            text=True,
            timeout=300,
            env=env,
        )
        # Previously a non-zero exit slipped through as (possibly empty)
        # stdout and got written out as the digest; surface it instead.
        if result.returncode != 0:
            return (f"Error: call-sonnet.sh exited {result.returncode}: "
                    f"{result.stderr.strip()}")
        return result.stdout.strip()
    except subprocess.TimeoutExpired:
        return "Error: Sonnet call timed out"
    except Exception as e:
        return f"Error: {e}"
    finally:
        # Always clean up the prompt temp file.
        os.unlink(prompt_file)
def generate_digest(target_date: str, messages: list[dict],
                    semantic_keys: list[str]) -> bool:
    """Generate a daily digest for one date.

    Skips dates whose digest file already exists. Returns True only when a
    new digest file was written to EPISODIC_DIR.
    """
    output_path = EPISODIC_DIR / f"daily-{target_date}.md"
    if output_path.exists():
        print(f" Skipping {target_date} — digest already exists")
        return False
    conversation = format_conversation(messages)
    print(f" {len(messages)} messages, {len(conversation):,} chars")
    prompt = build_prompt(target_date, conversation, semantic_keys)
    print(f" Prompt: {len(prompt):,} chars (~{len(prompt)//4:,} tokens)")
    print(" Calling Sonnet...")
    digest = call_sonnet(prompt)
    if digest.startswith("Error:"):
        print(f" {digest}", file=sys.stderr)
        return False
    if not digest:
        # Guard: don't persist an empty file if the wrapper produced no
        # output (previously an empty digest was silently written).
        print(" Error: empty digest returned", file=sys.stderr)
        return False
    # Explicit encoding so the digest round-trips regardless of locale.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(digest)
    print(f" Written: {output_path}")
    line_count = len(digest.split("\n"))
    print(f" Done: {line_count} lines")
    return True
def main():
    """CLI entry point: `--scan` lists available dates; `DATE [END_DATE]`
    generates digests for a single date or an inclusive range."""
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} DATE [END_DATE]")
        print(f" {sys.argv[0]} --scan")
        sys.exit(1)

    if sys.argv[1] == "--scan":
        print("Scanning all conversation transcripts...")
        dates = scan_all_dates()
        print(f"\n{len(dates)} dates with conversation data:")
        for day, count in dates.items():
            # "✓" marks dates that already have a digest (the marker was
            # previously an empty string, making done/pending look alike).
            mark = "✓" if (EPISODIC_DIR / f"daily-{day}.md").exists() else " "
            print(f" [{mark}] {day}: {count} messages")
        sys.exit(0)

    # Validate date arguments up front instead of crashing with a traceback.
    try:
        start_date = date.fromisoformat(sys.argv[1])
        end_date = (date.fromisoformat(sys.argv[2])
                    if len(sys.argv) > 2 else start_date)
    except ValueError as e:
        print(f"Error: {e}", file=sys.stderr)
        sys.exit(1)

    # Collect all messages across all JSONLs.
    print("Scanning conversation transcripts...")
    all_messages: dict[str, list[dict]] = defaultdict(list)
    for jsonl in find_jsonl_files():
        for day, msgs in extract_messages_by_date(jsonl).items():
            all_messages[day].extend(msgs)
    # Chronological order within each day.
    for day in all_messages:
        all_messages[day].sort(key=lambda m: m["time"])

    semantic_keys = get_semantic_keys()
    print(f" {len(semantic_keys)} semantic keys")

    # Generate digests across the inclusive date range.
    current = start_date
    generated = 0
    while current <= end_date:
        day_str = current.isoformat()
        if day_str in all_messages:
            print(f"\nGenerating digest for {day_str}...")
            if generate_digest(day_str, all_messages[day_str], semantic_keys):
                generated += 1
        else:
            print(f"\n No messages found for {day_str}")
        current += timedelta(days=1)
    print(f"\nDone: {generated} digests generated")
if __name__ == "__main__":
main()