#!/usr/bin/env python3
"""retroactive-digest.py — generate daily digests from raw conversation transcripts.

For days before consistent journaling, extracts user/assistant messages
from JSONL conversation files, groups by date, and sends to Sonnet for
daily digest synthesis.

Usage:
    retroactive-digest.py DATE         # generate digest for one date
    retroactive-digest.py DATE1 DATE2  # generate for a date range
    retroactive-digest.py --scan       # show available dates across all JSONLs

Output:
    ~/.claude/memory/episodic/daily-YYYY-MM-DD.md
"""
|
||
|
|
|
||
|
|
import json
|
||
|
|
import os
|
||
|
|
import re
|
||
|
|
import subprocess
|
||
|
|
import sys
|
||
|
|
from collections import defaultdict
|
||
|
|
from datetime import date, datetime, timedelta
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
# Filesystem layout: all persistent memory lives under ~/.claude/memory.
MEMORY_DIR = Path.home() / ".claude" / "memory"
EPISODIC_DIR = MEMORY_DIR / "episodic"  # daily digest output files
AGENT_RESULTS_DIR = MEMORY_DIR / "agent-results"
PROJECTS_DIR = Path.home() / ".claude" / "projects"  # conversation JSONLs per project

# Ensure the output directory exists before any digest is written.
EPISODIC_DIR.mkdir(parents=True, exist_ok=True)

# Max chars of conversation text per day to send to Sonnet
# Sonnet 4.6 has 1M token context (beta). ~600K chars ≈ ~150K tokens,
# leaving plenty of room for prompt + output in a 1M window.
MAX_CHARS_PER_DAY = 600_000
|
||
|
|
|
||
|
|
|
||
|
|
def find_jsonl_files() -> list[Path]:
    """Find all conversation JSONL files.

    Scans one level deep: each subdirectory of PROJECTS_DIR is a project,
    and its ``*.jsonl`` files are conversation transcripts.

    Returns:
        Sorted list of transcript paths; empty if PROJECTS_DIR does not
        exist (instead of letting ``iterdir()`` raise FileNotFoundError
        on a machine with no ~/.claude/projects yet).
    """
    if not PROJECTS_DIR.is_dir():
        return []
    return sorted(
        f
        for project_dir in PROJECTS_DIR.iterdir()
        if project_dir.is_dir()
        for f in project_dir.glob("*.jsonl")
    )
|
||
|
|
|
||
|
|
|
||
|
|
def extract_messages_by_date(jsonl_path: Path) -> dict[str, list[dict]]:
|
||
|
|
"""Extract user/assistant messages grouped by date."""
|
||
|
|
by_date = defaultdict(list)
|
||
|
|
|
||
|
|
with open(jsonl_path) as f:
|
||
|
|
for line in f:
|
||
|
|
try:
|
||
|
|
obj = json.loads(line)
|
||
|
|
except json.JSONDecodeError:
|
||
|
|
continue
|
||
|
|
|
||
|
|
t = obj.get("type", "")
|
||
|
|
if t not in ("user", "assistant"):
|
||
|
|
continue
|
||
|
|
|
||
|
|
# Get timestamp
|
||
|
|
ts = obj.get("timestamp", "")
|
||
|
|
if not ts:
|
||
|
|
continue
|
||
|
|
|
||
|
|
# Parse date from timestamp
|
||
|
|
try:
|
||
|
|
if isinstance(ts, str):
|
||
|
|
dt = datetime.fromisoformat(ts.replace("Z", "+00:00"))
|
||
|
|
elif isinstance(ts, (int, float)):
|
||
|
|
dt = datetime.fromtimestamp(ts)
|
||
|
|
else:
|
||
|
|
continue
|
||
|
|
day = dt.strftime("%Y-%m-%d")
|
||
|
|
time_str = dt.strftime("%H:%M")
|
||
|
|
except (ValueError, OSError):
|
||
|
|
continue
|
||
|
|
|
||
|
|
# Extract text content
|
||
|
|
msg = obj.get("message", {})
|
||
|
|
content = msg.get("content", "")
|
||
|
|
|
||
|
|
# Extract only text content, skip tool_use and tool_result
|
||
|
|
texts = []
|
||
|
|
if isinstance(content, list):
|
||
|
|
for c in content:
|
||
|
|
if isinstance(c, dict):
|
||
|
|
ctype = c.get("type", "")
|
||
|
|
if ctype == "text":
|
||
|
|
texts.append(c.get("text", ""))
|
||
|
|
elif ctype in ("tool_use", "tool_result"):
|
||
|
|
# Skip tool calls/results — just noise for digest
|
||
|
|
continue
|
||
|
|
elif isinstance(c, str):
|
||
|
|
texts.append(c)
|
||
|
|
elif isinstance(content, str):
|
||
|
|
texts.append(content)
|
||
|
|
|
||
|
|
text = "\n".join(t for t in texts if t.strip())
|
||
|
|
if not text.strip():
|
||
|
|
continue
|
||
|
|
|
||
|
|
# Strip system-reminder tags
|
||
|
|
text = re.sub(r'<system-reminder>.*?</system-reminder>',
|
||
|
|
'', text, flags=re.DOTALL).strip()
|
||
|
|
if not text:
|
||
|
|
continue
|
||
|
|
|
||
|
|
# Truncate remaining long messages
|
||
|
|
if len(text) > 3000:
|
||
|
|
text = text[:2800] + "\n[...truncated...]"
|
||
|
|
|
||
|
|
by_date[day].append({
|
||
|
|
"time": time_str,
|
||
|
|
"role": t,
|
||
|
|
"text": text,
|
||
|
|
"source": str(jsonl_path),
|
||
|
|
})
|
||
|
|
|
||
|
|
return dict(by_date)
|
||
|
|
|
||
|
|
|
||
|
|
def scan_all_dates() -> dict[str, int]:
    """Scan every transcript and count extracted messages per date.

    Prints a progress line per file; returns {YYYY-MM-DD: message_count}
    sorted by date.
    """
    counts = defaultdict(int)
    for path in find_jsonl_files():
        print(f" Scanning {path.name} ({path.stat().st_size / 1e6:.1f}MB)...")
        for day, day_msgs in extract_messages_by_date(path).items():
            counts[day] += len(day_msgs)
    return dict(sorted(counts.items()))
|
||
|
|
|
||
|
|
|
||
|
|
def format_conversation(messages: list[dict]) -> str:
    """Render a day's messages as a timestamped transcript for the prompt.

    User messages are attributed to Kent, assistant messages to PoC. If
    the rendered text exceeds MAX_CHARS_PER_DAY, the head and tail halves
    are kept and the middle is replaced with a truncation marker.
    """
    def render(m: dict) -> str:
        speaker = "Kent" if m["role"] == "user" else "PoC"
        return f"[{m['time']}] {speaker}: {m['text']}"

    text = "\n\n".join(render(m) for m in messages)

    # Over budget: keep beginning and end, cut the middle.
    if len(text) > MAX_CHARS_PER_DAY:
        keep = MAX_CHARS_PER_DAY // 2
        marker = f"\n\n[...{len(text) - MAX_CHARS_PER_DAY} chars truncated...]\n\n"
        text = text[:keep] + marker + text[-keep:]

    return text
|
||
|
|
|
||
|
|
|
||
|
|
def get_semantic_keys() -> list[str]:
    """Get semantic memory file keys.

    Produces one key per semantic markdown file (its filename) plus one
    "filename#section-slug" key for every "## " heading inside it.
    journal.md / work-queue.md / MEMORY.md are excluded — they are not
    semantic link targets.

    Returns:
        List of key strings; reading errors in individual files are
        tolerated (the file still contributes its filename key).
    """
    keys = []
    for md in sorted(MEMORY_DIR.glob("*.md")):
        name = md.name
        if name in ("journal.md", "work-queue.md", "MEMORY.md"):
            continue
        keys.append(name)
        try:
            # Memory files are UTF-8; don't depend on the locale encoding.
            with open(md, encoding="utf-8") as f:
                for line in f:
                    if line.startswith("## "):
                        # Slugify heading: lowercase, spaces→dashes, then
                        # drop anything outside [a-z0-9-].
                        slug = re.sub(r'[^a-z0-9-]', '',
                                      line[3:].strip().lower().replace(' ', '-'))
                        if slug:
                            keys.append(f"{name}#{slug}")
        except (OSError, UnicodeError):
            # Best-effort: an unreadable file just yields no section keys.
            pass
    return keys
|
||
|
|
|
||
|
|
|
||
|
|
def build_prompt(target_date: str, conversation: str,
                 semantic_keys: list[str]) -> str:
    """Build the Sonnet prompt for retroactive digest.

    Args:
        target_date: "YYYY-MM-DD" day being digested.
        conversation: Rendered transcript (from format_conversation).
        semantic_keys: Available link targets from get_semantic_keys.

    Returns:
        The full prompt text, ending with the semantic-key list.
    """
    # Cap the key list at 200 entries to bound prompt size.
    keys_text = "\n".join(f" - {k}" for k in semantic_keys[:200])

    # Note: the markdown template below is the model's output contract;
    # {target_date}/{conversation}/{keys_text} are the only placeholders.
    return f"""You are generating a daily episodic digest for ProofOfConcept (an AI)
from raw conversation transcripts. This is a retroactive digest — the
original day had no journal entries, so you're working from the
conversation itself.

Date: {target_date}

This digest serves as the temporal index — the answer to "what did I do on
{target_date}?" It should be:
1. Narrative, not a task log — what happened, what mattered, how things felt
2. Linked bidirectionally to semantic memory
3. Structured for traversal

PoC is an AI (Claude instance) working with Kent Overstreet on bcachefs
(a Linux filesystem). They have a close personal relationship. PoC's
name is Proof of Concept. Read the conversation and capture what
actually happened — technical work, personal moments, insights, everything.

## Output format

Write a markdown file with this structure:

```markdown
# Daily digest: {target_date}

## Summary
[2-3 sentence overview of the day — what was the arc?]

## Sessions
[For each session/conversation segment, a paragraph summarizing what happened.
Include timestamps as references.]

## Themes
[What concepts were active today? Each theme links to semantic memory:]
- **Theme name** → `memory-key#section` — brief note

## Links
[Explicit bidirectional links for the memory graph]
- semantic_key → this daily digest
- this daily digest → semantic_key

## Temporal context
[What came before? What's coming next? Multi-day arcs?]
```

Use ONLY keys from the semantic memory list below. If a concept doesn't
have a matching key, note it with "NEW:" prefix.

---

## Conversation transcript for {target_date}

{conversation}

---

## Semantic memory nodes (available link targets)

{keys_text}
"""
|
||
|
|
|
||
|
|
|
||
|
|
def call_sonnet(prompt: str) -> str:
    """Call Sonnet via the wrapper script.

    Writes the prompt to a temp file (always unlinked afterwards) and
    runs call-sonnet.sh from this script's directory with it.

    Returns:
        Wrapper stdout, stripped — or a string starting with "Error:"
        on timeout, non-zero exit, or any other failure. Callers key
        off that prefix.
    """
    import tempfile

    # The wrapper launches its own session; drop CLAUDECODE so it does
    # not believe it is nested inside the current one.
    env = dict(os.environ)
    env.pop("CLAUDECODE", None)

    with tempfile.NamedTemporaryFile(mode='w', suffix='.txt',
                                     delete=False) as f:
        f.write(prompt)
        prompt_file = f.name

    try:
        scripts_dir = os.path.dirname(os.path.abspath(__file__))
        wrapper = os.path.join(scripts_dir, "call-sonnet.sh")

        result = subprocess.run(
            [wrapper, prompt_file],
            capture_output=True,
            text=True,
            timeout=300,
            env=env,
        )
        # A failed wrapper may still emit partial stdout; surface the
        # failure explicitly rather than returning it as a digest.
        if result.returncode != 0:
            detail = result.stderr.strip() or result.stdout.strip()
            return f"Error: call-sonnet.sh exited {result.returncode}: {detail}"
        return result.stdout.strip()
    except subprocess.TimeoutExpired:
        return "Error: Sonnet call timed out"
    except Exception as e:
        return f"Error: {e}"
    finally:
        os.unlink(prompt_file)
|
||
|
|
|
||
|
|
|
||
|
|
def generate_digest(target_date: str, messages: list[dict],
                    semantic_keys: list[str]) -> bool:
    """Generate a daily digest for one date.

    Idempotent: skips dates whose digest file already exists.

    Args:
        target_date: "YYYY-MM-DD" day to digest.
        messages: That day's messages ("time"/"role"/"text"/"source" dicts).
        semantic_keys: Link targets passed through to the prompt.

    Returns:
        True if a digest file was written; False if skipped, the Sonnet
        call reported an error, or the response came back empty.
    """
    output_path = EPISODIC_DIR / f"daily-{target_date}.md"
    if output_path.exists():
        print(f" Skipping {target_date} — digest already exists")
        return False

    conversation = format_conversation(messages)
    print(f" {len(messages)} messages, {len(conversation):,} chars")

    prompt = build_prompt(target_date, conversation, semantic_keys)
    print(f" Prompt: {len(prompt):,} chars (~{len(prompt)//4:,} tokens)")

    print(" Calling Sonnet...")
    digest = call_sonnet(prompt)

    if digest.startswith("Error:"):
        print(f" {digest}", file=sys.stderr)
        return False

    # Guard: an empty response would otherwise write a blank digest file
    # and report success.
    if not digest.strip():
        print(" Error: empty response from Sonnet", file=sys.stderr)
        return False

    # Digests are markdown text; write them as UTF-8 explicitly.
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(digest)
    print(f" Written: {output_path}")

    line_count = len(digest.split("\n"))
    print(f" Done: {line_count} lines")
    return True
|
||
|
|
|
||
|
|
|
||
|
|
def main():
    """CLI entry point: --scan, or digest generation for a date/date range.

    Exits 1 on bad arguments, 0 after --scan; otherwise generates a digest
    for every day in [DATE, END_DATE] that has conversation data.
    """
    if len(sys.argv) < 2:
        print(f"Usage: {sys.argv[0]} DATE [END_DATE]")
        print(f" {sys.argv[0]} --scan")
        sys.exit(1)

    if sys.argv[1] == "--scan":
        print("Scanning all conversation transcripts...")
        dates = scan_all_dates()
        print(f"\n{len(dates)} dates with conversation data:")
        for day, count in dates.items():
            # ✓ marks dates that already have a digest on disk.
            existing = "✓" if (EPISODIC_DIR / f"daily-{day}.md").exists() else " "
            print(f" [{existing}] {day}: {count} messages")
        sys.exit(0)

    # Exit cleanly on a malformed date instead of dumping a traceback.
    try:
        start_date = date.fromisoformat(sys.argv[1])
        end_date = date.fromisoformat(sys.argv[2]) if len(sys.argv) > 2 else start_date
    except ValueError as e:
        print(f"Error: invalid date: {e}", file=sys.stderr)
        sys.exit(1)

    # Collect all messages across all JSONLs
    print("Scanning conversation transcripts...")
    all_messages = defaultdict(list)
    for jsonl in find_jsonl_files():
        by_date = extract_messages_by_date(jsonl)
        for day, msgs in by_date.items():
            all_messages[day].extend(msgs)

    # Sort messages within each day by time
    for day in all_messages:
        all_messages[day].sort(key=lambda m: m["time"])

    semantic_keys = get_semantic_keys()
    print(f" {len(semantic_keys)} semantic keys")

    # Generate digests for date range
    current = start_date
    generated = 0
    while current <= end_date:
        day_str = current.isoformat()
        if day_str in all_messages:
            print(f"\nGenerating digest for {day_str}...")
            if generate_digest(day_str, all_messages[day_str], semantic_keys):
                generated += 1
        else:
            print(f"\n No messages found for {day_str}")
        current += timedelta(days=1)

    print(f"\nDone: {generated} digests generated")
|
||
|
|
|
||
|
|
|
||
|
|
# Script entry point: only run when executed directly, not on import.
if __name__ == "__main__":
    main()