From 8f4b28cd2091e013c17866b3219257c75a0bd16d Mon Sep 17 00:00:00 2001 From: ProofOfConcept Date: Thu, 5 Mar 2026 15:31:44 -0500 Subject: [PATCH] doc: daemon design notes --- doc/daemon-design.md | 230 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 230 insertions(+) create mode 100644 doc/daemon-design.md diff --git a/doc/daemon-design.md b/doc/daemon-design.md new file mode 100644 index 0000000..e8a3887 --- /dev/null +++ b/doc/daemon-design.md @@ -0,0 +1,230 @@ +# poc-memory daemon design sketch + +2026-03-05, ProofOfConcept + Kent + +## Problem + +Memory maintenance (extraction, consolidation, digests, knowledge loop) +currently runs via a cron job that shells out to separate scripts. This +is fragile (`set -e` silently ate today's failure), hard to observe, +and has no backpressure or dependency management. The conversation +extraction agent hasn't run in 2 days because Phase 1 crashed and +took Phase 2 with it. + +## Goal + +`poc-memory daemon` — a single long-running process that owns all +background memory work. Structured so the orchestration logic can later +move into poc-agent when that's ready. + +## Jobs + +Six job types, roughly in pipeline order: + +| Job | Trigger | Cadence | Depends on | +|-----|---------|---------|------------| +| **extract** | session end detected | event | — | +| **decay** | schedule | daily | — | +| **consolidate** | schedule or post-extract | daily + after extract | extract | +| **knowledge-loop** | post-consolidate | daily | consolidate | +| **digest** | schedule | daily (+ weekly) | consolidate | +| **health** | schedule | hourly | — | + +### extract +Watch `~/.claude/projects/*/` for conversation jsonl files. A session +is "ended" when its jsonl hasn't been written to in 10 minutes AND no +claude process has it open (lsof/fuser check). Extract durable knowledge +using the observation agent (currently Python/Sonnet). 
+ +This is the most time-sensitive job — new material should enter the +graph within ~15 minutes of a session ending, not wait 24 hours. + +### decay +Run weight decay on all nodes. Quick, no API calls. Currently +`poc-memory decay`. + +### consolidate +Graph health check, then agent runs: replay, linker, separator. +Link orphans, cap degree. Currently `poc-memory consolidate-full` +minus the digest step. + +### knowledge-loop +The four knowledge agents: observation (on any un-extracted fragments), +extractor, connector, challenger. Currently `knowledge_loop.py`. +Runs until convergence or budget exhausted. + +### digest +Generate daily and weekly digests from journal entries. +Currently part of `consolidate-full`. + +### health +Quick graph metrics snapshot. No API calls. Log metrics for trend +tracking. Alert (via status file) if graph health degrades. + +## Architecture + +``` +poc-memory daemon +├── Scheduler +│ ├── clock triggers (daily, hourly) +│ ├── event triggers (session end, post-job) +│ └── condition triggers (health threshold) +├── Job Runner +│ ├── job queue (ordered by priority + dependencies) +│ ├── single-threaded Sonnet executor (backpressure) +│ └── retry logic (exponential backoff on API errors) +├── Session Watcher +│ ├── inotify on ~/.claude/projects/*/ +│ ├── staleness + lsof check for session end +│ └── tracks which sessions have been extracted +├── Status Store +│ └── ~/.claude/memory/daemon-status.json +└── Logger + └── structured log → ~/.claude/memory/daemon.log +``` + +### Scheduler + +Three trigger types unified into one interface: + +```rust +enum Trigger { + /// Fire at fixed intervals + Schedule { interval: Duration, last_run: Option }, + /// Fire when a condition is met + Event { kind: EventKind }, + /// Fire when another job completes + After { job: JobId, only_on_success: bool }, +} + +enum EventKind { + SessionEnded(PathBuf), // specific jsonl path + HealthBelowThreshold, +} +``` + +Jobs declare their triggers and dependencies. 
The scheduler resolves +ordering and ensures a job doesn't run while its dependency is still +in progress. + +### Sonnet executor + +All API-calling jobs (extract, consolidate agents, knowledge loop) +go through a single Sonnet executor. This provides: + +- **Serialization**: one Sonnet call at a time (simple, avoids rate limits) +- **Backpressure**: if calls are failing, back off globally +- **Cost tracking**: log tokens used per job +- **Timeout**: kill calls that hang (the current scripts have no timeout) + +Initially this just shells out to `call-sonnet.sh` like the Python +scripts do. Later it can use the API directly. + +### Status store + +```json +{ + "daemon": { + "pid": 12345, + "started": "2026-03-05T09:15:00-05:00", + "uptime_secs": 3600 + }, + "jobs": { + "extract": { + "state": "idle", + "last_run": "2026-03-05T10:30:00-05:00", + "last_result": "ok", + "last_duration_secs": 45, + "sessions_extracted": 3, + "next_scheduled": null + }, + "consolidate": { + "state": "running", + "started": "2026-03-05T11:00:00-05:00", + "progress": "linker (3/5)", + "last_result": "ok", + "last_duration_secs": 892 + }, + "knowledge-loop": { + "state": "waiting", + "waiting_on": "consolidate", + "last_result": "ok", + "last_cycles": 25 + } + }, + "sonnet": { + "state": "busy", + "current_job": "consolidate", + "calls_today": 47, + "errors_today": 0, + "tokens_today": 125000 + } +} +``` + +Queryable via `poc-memory daemon status` (reads the JSON, pretty-prints). +Also: `poc-memory daemon status --json` for programmatic access. + +### Session watcher + +```rust +struct SessionWatcher { + /// jsonl paths we've already fully extracted + extracted: HashSet, + /// jsonl paths we're watching for staleness + watching: HashMap, // path → last_modified +} +``` + +On each tick (every 60s): +1. Scan `~/.claude/projects/*/` for `*.jsonl` files +2. For each unknown file, start watching +3. 
For each watched file where mtime is >10min old AND not open by + any process → mark as ended, emit `SessionEnded` event +4. Skip files in `extracted` set + +The extracted set persists to disk so we don't re-extract after +daemon restart. + +## Logging + +Structured log lines, one JSON object per line: + +```json +{"ts":"2026-03-05T11:00:00","job":"consolidate","event":"started"} +{"ts":"2026-03-05T11:00:05","job":"consolidate","event":"sonnet_call","tokens":2400,"duration_ms":3200} +{"ts":"2026-03-05T11:14:52","job":"consolidate","event":"completed","duration_secs":892,"result":"ok"} +{"ts":"2026-03-05T11:14:52","job":"consolidate","event":"error","msg":"Sonnet timeout after 600s","retry_in":120} +``` + +`poc-memory daemon log` tails the log with human-friendly formatting. +`poc-memory daemon log --job extract` filters to one job. + +## Migration path + +### Phase 1: orchestration only +The daemon is just a scheduler + session watcher. Jobs still shell out +to `poc-memory consolidate-full`, `knowledge_loop.py`, etc. The value +is: reliable scheduling, session-end detection, centralized status, +error logging that doesn't disappear. + +### Phase 2: inline jobs +Move job logic into Rust one at a time. Decay and health are trivial +(already Rust). Digests next. Consolidation agents and knowledge loop +are bigger (currently Python + Sonnet prompts). + +### Phase 3: poc-agent integration +The daemon becomes a subsystem of poc-agent. The scheduler, status +store, and Sonnet executor are reusable. The session watcher feeds +into poc-agent's broader awareness of what's happening on the machine. + +## Open questions + +- **Signal handling**: SIGHUP to reload config? SIGUSR1 to trigger + immediate consolidation? +- **Multiple claude sessions**: extract should handle overlapping + sessions (different project dirs). Currently there's usually just one. +- **Budget limits**: should the daemon have a daily token budget and + stop when exhausted? 
Prevents runaway costs if something loops.
+- **Notification**: when a job fails, should it write to the
+  Telegram inbox? Or just log and let `daemon status` surface it?