Rust core: - Cap'n Proto append-only storage (nodes + relations) - Graph algorithms: clustering coefficient, community detection, schema fit, small-world metrics, interference detection - BM25 text similarity with Porter stemming - Spaced repetition replay queue - Commands: search, init, health, status, graph, categorize, link-add, link-impact, decay, consolidate-session, etc. Python scripts: - Episodic digest pipeline: daily/weekly/monthly-digest.py - retroactive-digest.py for backfilling - consolidation-agents.py: 3 parallel Sonnet agents - apply-consolidation.py: structured action extraction + apply - digest-link-parser.py: extract ~400 explicit links from digests - content-promotion-agent.py: promote episodic obs to semantic files - bulk-categorize.py: categorize all nodes via single Sonnet call - consolidation-loop.py: multi-round automated consolidation Co-Authored-By: Kent Overstreet <kent.overstreet@linux.dev>
220 lines
7.3 KiB
Python
Executable file
220 lines
7.3 KiB
Python
Executable file
#!/usr/bin/env python3
"""digest-link-parser.py — extract explicit links from episodic digests.

Parses the "Links" sections of daily/weekly/monthly digests and
applies them to the memory graph via poc-memory link-add.

Usage:
    digest-link-parser.py          # dry run
    digest-link-parser.py --apply  # apply links
"""
|
|
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Directory holding the daily-/weekly-/monthly-*.md digest files this script scans.
EPISODIC_DIR = Path.home() / ".claude" / "memory" / "episodic"
|
|
|
|
|
|
def normalize_key(raw: str) -> str:
    """Normalize a raw link reference into a poc-memory node key.

    Strips surrounding whitespace/backticks, rewrites digest path forms
    (``weekly/2026-W06`` → ``weekly-2026-W06.md``), and ensures plain file
    references carry a ``.md`` suffix (before the ``#`` when a section
    anchor is present).  Returns "" for self-references ("this ..."
    phrases) so the caller can substitute the digest's own key.
    """
    text = raw.strip().strip('`').strip()

    # Path-style digest refs become dashed names:
    #   daily/2026-02-04 → daily-2026-02-04, weekly/2026-W06 → weekly-2026-W06
    text = re.sub(r'^(daily|weekly|monthly)/', r'\1-', text)

    # Digest names always get a .md extension.
    if re.match(r'^(daily|weekly|monthly)-\d{4}', text) and not text.endswith('.md'):
        text += '.md'

    # Self-references ("this daily digest" etc.) — plus one bare date that
    # appears in the corpus — are resolved to the digest key by the caller.
    if text.startswith('this ') or text == '2026-02-14':
        return ""

    if '#' in text:
        # Section anchor: the file part before '#' must end in .md.
        base, anchor = text.split('#', 1)
        if not base.endswith('.md'):
            base += '.md'
        return f"{base}#{anchor}"

    # Bare file references (no path separator, not a NEW: placeholder) get .md.
    if not text.endswith('.md') and '/' not in text and not text.startswith('NEW:'):
        text += '.md'

    return text
|
|
|
|
|
|
def extract_links(filepath: Path) -> list[dict]:
    """Pull explicit link records out of a digest's "## Links" section.

    Each record is a dict with ``source``, ``target``, ``reason`` and
    ``file`` keys.  Self-references resolve to the digest's own key,
    bare dates resolve to the matching daily digest, and ``NEW:``
    placeholders or source==target pairs are dropped.
    """
    # The digest's own key, e.g. "daily-2026-02-28" → "daily-2026-02-28.md".
    digest_key = filepath.stem + ".md"

    # Matches "- source → target (reason)", with optional backticks around
    # the keys, an optional "(reason)" suffix, and any arrow direction.
    link_re = re.compile(r'^-\s+(.+?)\s*[→↔←]\s*(.+?)(?:\s*\((.+?)\))?\s*$')
    section_re = re.compile(r'^##\s+')
    links_hdr_re = re.compile(r'^##\s+Links')
    bare_date_re = re.compile(r'^(\d{4}-\d{2}-\d{2})$')

    results: list[dict] = []
    inside = False

    for line in filepath.read_text().split('\n'):
        # Track whether we are inside the "## Links" section.
        if links_hdr_re.match(line):
            inside = True
            continue
        if inside and section_re.match(line) and not links_hdr_re.match(line):
            inside = False
            continue
        if not inside:
            continue

        # Subheaders and bold annotations within the section are not links.
        if line.startswith(('###', '**')):
            continue

        m = link_re.match(line)
        if m is None:
            continue

        raw_src = m.group(1).strip()
        raw_tgt = m.group(2).strip()
        reason = m.group(3) or ""

        # Normalize; "" means a self-reference — substitute the digest key.
        src = normalize_key(raw_src) or digest_key
        tgt = normalize_key(raw_tgt) or digest_key

        # "this daily/weekly/monthly digest" phrasings also mean "this file".
        self_phrases = ('this daily', 'this weekly', 'this monthly')
        if any(p in raw_src.lower() for p in self_phrases):
            src = digest_key
        if any(p in raw_tgt.lower() for p in self_phrases):
            tgt = digest_key

        # Bare dates like "2026-02-14" refer to that day's daily digest.
        dm = bare_date_re.match(src.replace('.md', ''))
        if dm:
            src = f"daily-{dm.group(1)}.md"
        dm = bare_date_re.match(tgt.replace('.md', ''))
        if dm:
            tgt = f"daily-{dm.group(1)}.md"

        # NEW: placeholders point at nodes that don't exist yet.
        if src.startswith('NEW:') or tgt.startswith('NEW:'):
            continue
        # A node never links to itself.
        if src == tgt:
            continue

        results.append({
            "source": src,
            "target": tgt,
            "reason": reason,
            "file": filepath.name,
        })

    return results
|
|
|
|
|
|
def main():
    """Collect links from all digests, dedupe them, then print or apply.

    Dry run by default (prints each unique source → target pair); pass
    ``--apply`` to invoke ``poc-memory link-add`` for each pair, falling
    back to file-level keys when a section-anchored key has no entry.
    """
    do_apply = "--apply" in sys.argv

    # Collect all links from all digests.
    all_links = []
    for pattern in ["daily-*.md", "weekly-*.md", "monthly-*.md"]:
        for f in sorted(EPISODIC_DIR.glob(pattern)):
            links = extract_links(f)
            if links:
                all_links.extend(links)

    # Deduplicate on the (source, target) pair, keeping the first occurrence.
    seen = set()
    unique_links = []
    for link in all_links:
        key = (link["source"], link["target"])
        if key not in seen:
            seen.add(key)
            unique_links.append(link)

    print(f"Found {len(all_links)} total links, {len(unique_links)} unique")

    if not do_apply:
        # Dry run — just show them
        for i, link in enumerate(unique_links, 1):
            print(f" {i:3d}. {link['source']} → {link['target']}")
            if link['reason']:
                print(f" ({link['reason'][:80]})")
        print(f"\nTo apply: {sys.argv[0]} --apply")
        return

    def run_link_add(src_key, tgt_key, reason_text):
        """Run `poc-memory link-add src tgt [reason]` (reason capped at 200 chars)."""
        cmd = ["poc-memory", "link-add", src_key, tgt_key]
        if reason_text:
            cmd.append(reason_text[:200])
        return subprocess.run(cmd, capture_output=True, text=True, timeout=10)

    # Apply with fallback: if a section-level key fails, try the file-level key.
    applied = skipped = errors = fallbacks = 0
    for link in unique_links:
        src, tgt = link["source"], link["target"]
        reason = link.get("reason", "")

        try:
            r = run_link_add(src, tgt, reason)
            if r.returncode == 0:
                out = r.stdout.strip()
                if "already exists" in out:
                    skipped += 1
                else:
                    print(f" {out}")
                    applied += 1
                continue

            err = r.stderr.strip()
            if "No entry for" in err:
                # Fall back to file-level keys by stripping "#section" anchors.
                src_base = src.split('#', 1)[0]
                tgt_base = tgt.split('#', 1)[0]
                if src_base == tgt_base:
                    skipped += 1  # Same file once anchors drop — nothing to link.
                    continue
                if src_base == src and tgt_base == tgt:
                    # No anchors to strip: retrying the identical command
                    # would fail the same way, so the node truly doesn't exist.
                    skipped += 1
                    continue
                r2 = run_link_add(src_base, tgt_base, reason)
                if r2.returncode == 0:
                    out = r2.stdout.strip()
                    if "already exists" in out:
                        skipped += 1
                    else:
                        print(f" {out} (fallback from #{src.split('#')[-1] if '#' in src else ''}/{tgt.split('#')[-1] if '#' in tgt else ''})")
                        applied += 1
                        fallbacks += 1
                else:
                    skipped += 1  # File-level key missing too.
            elif "not found" in err:
                skipped += 1
            else:
                print(f" ? {src} → {tgt}: {err}")
                errors += 1
        except Exception as e:
            # Best-effort: report (e.g. TimeoutExpired, missing binary) and move on.
            print(f" ! {src} → {tgt}: {e}")
            errors += 1

    print(f"\nApplied: {applied} ({fallbacks} file-level fallbacks) Skipped: {skipped} Errors: {errors}")


if __name__ == "__main__":
    main()
|