consciousness/poc-memory/src/agents/daemon.rs

// poc-memory daemon: background job orchestration for memory maintenance
//
// Replaces the fragile cron+shell approach with a single long-running process
// that owns all background memory work. Uses jobkit for worker pool, status
// tracking, retry, and cancellation.
//
// Architecture:
//   - Scheduler task: runs every 60s, scans filesystem state, spawns jobs
//   - Session watcher task: detects ended Claude sessions, triggers extraction
//   - Jobs shell out to existing poc-memory subcommands (Phase 1)
//   - Status written to daemon-status.json for `poc-memory daemon status`
//
// Phase 2 will inline job logic; Phase 3 integrates into poc-agent.

use jobkit::{Choir, ExecutionContext, ResourcePool, TaskError, TaskInfo, TaskStatus};
use std::collections::{HashMap, HashSet};
use std::fs;
use std::io::Write;
use std::path::{Path, PathBuf};
use std::sync::{Arc, Mutex};
use std::time::{Duration, SystemTime};

const SESSION_STALE_SECS: u64 = 600;   // 10 minutes
const SCHEDULER_INTERVAL: Duration = Duration::from_secs(60);
const HEALTH_INTERVAL: Duration = Duration::from_secs(3600);
fn status_file() -> &'static str { "daemon-status.json" }
fn log_file() -> &'static str { "daemon.log" }

fn status_path() -> PathBuf {
    crate::config::get().data_dir.join(status_file())
}

fn log_path() -> PathBuf {
    crate::config::get().data_dir.join(log_file())
}

// --- Logging ---

const LOG_MAX_BYTES: u64 = 1_000_000; // 1MB, then truncate to last half

fn log_event(job: &str, event: &str, detail: &str) {
    let ts = chrono::Local::now().format("%Y-%m-%dT%H:%M:%S");
    let line = if detail.is_empty() {
        format!("{{\"ts\":\"{}\",\"job\":\"{}\",\"event\":\"{}\"}}\n", ts, job, event)
    } else {
        // Escape detail for JSON safety
        let safe = detail.replace('\\', "\\\\").replace('"', "\\\"")
            .replace('\n', "\\n");
        format!("{{\"ts\":\"{}\",\"job\":\"{}\",\"event\":\"{}\",\"detail\":\"{}\"}}\n",
            ts, job, event, safe)
    };
    let path = log_path();

    // Rotate if too large
    if let Ok(meta) = fs::metadata(&path) {
        if meta.len() > LOG_MAX_BYTES {
            if let Ok(content) = fs::read_to_string(&path) {
                let half = content.len() / 2;
                // Find next newline after halfway point
                if let Some(nl) = content[half..].find('\n') {
                    let _ = fs::write(&path, &content[half + nl + 1..]);
                }
            }
        }
    }

    if let Ok(mut f) = fs::OpenOptions::new().create(true).append(true).open(&path) {
        let _ = f.write_all(line.as_bytes());
    }
}

// --- Job functions (direct, no subprocess) ---

/// Run a named job with logging, progress reporting, and error mapping.
fn run_job(ctx: &ExecutionContext, name: &str, f: impl FnOnce() -> Result<(), String>) -> Result<(), TaskError> {
    log_event(name, "started", "");
    ctx.set_progress("starting");
    let start = std::time::Instant::now();

    match f() {
        Ok(()) => {
            let duration = format!("{:.1}s", start.elapsed().as_secs_f64());
            log_event(name, "completed", &duration);
            ctx.set_result(&duration);
            Ok(())
        }
        Err(e) => {
            let duration = format!("{:.1}s", start.elapsed().as_secs_f64());
            let msg = format!("{}: {}", duration, e);
            log_event(name, "failed", &msg);
            Err(TaskError::Retry(msg))
        }
    }
}

fn job_experience_mine(ctx: &ExecutionContext, path: &str, segment: Option<usize>) -> Result<(), TaskError> {
    let path = path.to_string();
    run_job(ctx, &format!("experience-mine {}", path), || {
        ctx.log_line("loading store");
        let mut store = crate::store::Store::load()?;
        ctx.log_line("mining");
        let count = super::enrich::experience_mine(&mut store, &path, segment)?;
        ctx.log_line(format!("{count} entries mined"));
        Ok(())
    })
}

fn job_fact_mine(ctx: &ExecutionContext, path: &str) -> Result<(), TaskError> {
    let path = path.to_string();
    run_job(ctx, &format!("fact-mine {}", path), || {
        ctx.log_line("mining facts");
        let p = std::path::Path::new(&path);
        let progress = |msg: &str| { ctx.set_progress(msg); };
        let count = super::fact_mine::mine_and_store(p, Some(&progress))?;
        ctx.log_line(format!("{count} facts stored"));
        Ok(())
    })
}

/// Run a single consolidation agent (replay, linker, separator, transfer, health).
fn job_consolidation_agent(
    ctx: &ExecutionContext,
    agent_type: &str,
    batch_size: usize,
) -> Result<(), TaskError> {
    let agent = agent_type.to_string();
    let batch = batch_size;
    run_job(ctx, &format!("c-{}", agent), || {
        ctx.log_line("loading store");
        let mut store = crate::store::Store::load()?;
        ctx.log_line(&format!("running agent: {} (batch={})", agent, batch));
        let (total, applied) = super::knowledge::run_and_apply(&mut store, &agent, batch, "consolidate")?;
        ctx.log_line(&format!("done: {} actions ({} applied)", total, applied));
        Ok(())
    })
}

/// Run the rename agent: generates renames via LLM, applies them directly.
fn job_rename_agent(
    ctx: &ExecutionContext,
    batch_size: usize,
) -> Result<(), TaskError> {
    run_job(ctx, "c-rename", || {
        ctx.log_line("loading store");
        let mut store = crate::store::Store::load()?;

        let batch = if batch_size == 0 { 10 } else { batch_size };
        ctx.log_line(&format!("running rename agent (batch={})", batch));

        let result = super::knowledge::run_one_agent(&mut store, "rename", batch, "consolidate")?;

        // Parse RENAME actions from response (rename uses its own format, not WRITE_NODE/LINK/REFINE)
        let mut applied = 0;
        let mut skipped = 0;
        for line in result.output.lines() {
            let trimmed = line.trim();
            if !trimmed.starts_with("RENAME ") { continue; }

            let parts: Vec<&str> = trimmed[7..].splitn(2, ' ').collect();
            if parts.len() != 2 { skipped += 1; continue; }

            let old_key = parts[0].trim();
            let new_key = parts[1].trim();
            if old_key.is_empty() || new_key.is_empty() { skipped += 1; continue; }

            let resolved = match store.resolve_key(old_key) {
                Ok(k) => k,
                Err(e) => {
                    ctx.log_line(&format!("skip: {} → {}: {}", old_key, new_key, e));
                    skipped += 1;
                    continue;
                }
            };

            if store.nodes.contains_key(new_key) {
                ctx.log_line(&format!("skip: {} already exists", new_key));
                skipped += 1;
                continue;
            }

            match store.rename_node(&resolved, new_key) {
                Ok(()) => {
                    ctx.log_line(&format!("renamed: {} → {}", resolved, new_key));
                    applied += 1;
                }
                Err(e) => {
                    ctx.log_line(&format!("error: {} → {}: {}", resolved, new_key, e));
                    skipped += 1;
                }
            }
        }

        if applied > 0 {
            store.save()?;
        }

        ctx.log_line(&format!("done: {} applied, {} skipped", applied, skipped));
        Ok(())
    })
}

/// Run the split agent: two-phase decomposition of large nodes.
///
/// Phase 1: Send node + neighbors to LLM, get back a JSON split plan
///          (child keys, descriptions, section hints).
/// Phase 2: For each child, send parent content + child description to LLM,
///          get back the extracted/reorganized content for that child.
///
/// This handles arbitrarily large nodes because the output of each phase 2
/// call is proportional to one child, not the whole parent.
/// Split a single node by key. Called as an independent task so multiple
/// splits can run in parallel. Each task loads the store fresh, checks the
/// node still exists and hasn't been split, does the LLM work, then saves.
fn job_split_one(
    ctx: &ExecutionContext,
    parent_key: String,
) -> Result<(), TaskError> {
    run_job(ctx, "c-split", || {
        ctx.log_line(&format!("loading store for {}", parent_key));
        let mut store = crate::store::Store::load()?;

        // Check node still exists and hasn't been deleted/split already
        let content_len = match store.nodes.get(parent_key.as_str()) {
            Some(n) if !n.deleted => n.content.len(),
            _ => {
                ctx.log_line(&format!("skip: {} no longer exists or deleted", parent_key));
                return Ok(());
            }
        };

        ctx.log_line(&format!("--- splitting: {} ({} chars)", parent_key, content_len));

        // Phase 1: get split plan
        let plan_prompt = super::prompts::split_plan_prompt(&store, &parent_key)?;
        ctx.log_line(&format!("phase 1: plan prompt {} chars", plan_prompt.len()));

        let plan_response = super::llm::call_sonnet("split-plan", &plan_prompt)?;
        let plan = match super::llm::parse_json_response(&plan_response) {
            Ok(v) => v,
            Err(e) => {
                ctx.log_line(&format!("phase 1 parse error: {}", e));
                return Ok(());
            }
        };

        let action = plan.get("action").and_then(|v| v.as_str()).unwrap_or("");
        if action == "keep" {
            let reason = plan.get("reason").and_then(|v| v.as_str()).unwrap_or("");
            ctx.log_line(&format!("keep: {} ({})", parent_key, reason));
            return Ok(());
        }
        if action != "split" {
            ctx.log_line(&format!("unexpected action: {}", action));
            return Ok(());
        }

        let children_plan = match plan.get("children").and_then(|v| v.as_array()) {
            Some(c) if c.len() >= 2 => c,
            _ => {
                ctx.log_line("plan has fewer than 2 children, skipping");
                return Ok(());
            }
        };

        ctx.log_line(&format!("phase 1: {} children planned", children_plan.len()));
        for child in children_plan {
            let key = child.get("key").and_then(|v| v.as_str()).unwrap_or("?");
            let desc = child.get("description").and_then(|v| v.as_str()).unwrap_or("");
            ctx.log_line(&format!("  planned: {} — {}", key, desc));
        }

        // Phase 2: extract content for each child
        let mut children: Vec<(String, String)> = Vec::new();
        // Collect neighbor assignments from plan: child_key -> [neighbor_keys]
        let mut neighbor_map: HashMap<String, Vec<String>> = HashMap::new();

        for child_plan in children_plan {
            let child_key = match child_plan.get("key").and_then(|v| v.as_str()) {
                Some(k) => k.to_string(),
                None => continue,
            };
            let child_desc = child_plan.get("description")
                .and_then(|v| v.as_str()).unwrap_or("");
            let child_sections = child_plan.get("sections")
                .and_then(|v| v.as_array())
                .map(|arr| arr.iter()
                    .filter_map(|v| v.as_str())
                    .collect::<Vec<_>>()
                    .join(", "))
                .unwrap_or_default();
            let child_neighbors: Vec<String> = child_plan.get("neighbors")
                .and_then(|v| v.as_array())
                .map(|arr| arr.iter()
                    .filter_map(|v| v.as_str().map(|s| s.to_string()))
                    .collect())
                .unwrap_or_default();
            neighbor_map.insert(child_key.clone(), child_neighbors);

            ctx.log_line(&format!("phase 2: extracting {}", child_key));

            let extract_prompt = super::prompts::split_extract_prompt(
                &store, &parent_key, &child_key, child_desc, &child_sections)?;
            ctx.log_line(&format!("  extract prompt: {} chars", extract_prompt.len()));

            let content = match super::llm::call_sonnet("split-extract", &extract_prompt) {
                Ok(c) => c,
                Err(e) => {
                    ctx.log_line(&format!("  extract error: {}", e));
                    continue;
                }
            };

            ctx.log_line(&format!("  extracted: {} chars", content.len()));
            children.push((child_key, content));
        }

        if children.len() < 2 {
            ctx.log_line(&format!("only {} children extracted, skipping", children.len()));
            return Ok(());
        }

        // Reload store before mutations — another split may have saved meanwhile
        store = crate::store::Store::load()?;

        // Re-check parent still exists after reload
        if !store.nodes.contains_key(parent_key.as_str()) ||
           store.nodes.get(parent_key.as_str()).map_or(true, |n| n.deleted) {
            ctx.log_line(&format!("skip: {} was split by another task", parent_key));
            return Ok(());
        }

        // Collect parent's edges before modifications
        let parent_edges: Vec<_> = store.relations.iter()
            .filter(|r| !r.deleted && (r.source_key == *parent_key || r.target_key == *parent_key))
            .cloned()
            .collect();

        // Create child nodes
        let mut child_uuids: Vec<([u8; 16], String)> = Vec::new();
        for (child_key, content) in &children {
            if store.nodes.contains_key(child_key.as_str()) {
                ctx.log_line(&format!("  skip: {} already exists", child_key));
                continue;
            }
            store.upsert_provenance(child_key, content,
                "consolidate:write")?;
            let uuid = store.nodes.get(child_key.as_str()).unwrap().uuid;
            child_uuids.push((uuid, child_key.clone()));
            ctx.log_line(&format!("  created: {} ({} chars)", child_key, content.len()));
        }

        // Inherit edges using agent's neighbor assignments from the plan
        for (child_uuid, child_key) in &child_uuids {
            let neighbors = match neighbor_map.get(child_key) {
                Some(n) => n,
                None => continue,
            };
            for neighbor_key in neighbors {
                // Find the parent edge for this neighbor to inherit its strength
                let parent_edge = parent_edges.iter().find(|r| {
                    r.source_key == *neighbor_key || r.target_key == *neighbor_key
                });
                let strength = parent_edge.map(|e| e.strength).unwrap_or(0.3);

                let neighbor_uuid = match store.nodes.get(neighbor_key.as_str()) {
                    Some(n) => n.uuid,
                    None => continue,
                };

                let rel = crate::store::new_relation(
                    *child_uuid, neighbor_uuid,
                    crate::store::RelationType::Auto, strength,
                    child_key, neighbor_key,
                );
                store.add_relation(rel).ok();
            }
        }

        // Link siblings
        for i in 0..child_uuids.len() {
            for j in (i+1)..child_uuids.len() {
                let rel = crate::store::new_relation(
                    child_uuids[i].0, child_uuids[j].0,
                    crate::store::RelationType::Auto, 0.5,
                    &child_uuids[i].1, &child_uuids[j].1,
                );
                store.add_relation(rel).ok();
            }
        }

        // Tombstone parent
        if let Some(parent) = store.nodes.get_mut(parent_key.as_str()) {
            parent.deleted = true;
            parent.version += 1;
            let tombstone = parent.clone();
            store.append_nodes(std::slice::from_ref(&tombstone)).ok();
        }
        store.nodes.remove(parent_key.as_str());

        ctx.log_line(&format!("split complete: {} → {} children", parent_key, child_uuids.len()));
        store.save()?;
        Ok(())
    })
}

/// Link orphan nodes (CPU-heavy, no LLM).
fn job_link_orphans(ctx: &ExecutionContext) -> Result<(), TaskError> {
    run_job(ctx, "c-orphans", || {
        ctx.log_line("loading store");
        let mut store = crate::store::Store::load()?;
        ctx.log_line("linking orphans");
        let (orphans, added) = crate::neuro::link_orphans(&mut store, 2, 3, 0.15);
        ctx.log_line(&format!("{} orphans, {} links added", orphans, added));
        Ok(())
    })
}

/// Cap node degree to prevent mega-hubs.
fn job_cap_degree(ctx: &ExecutionContext) -> Result<(), TaskError> {
    run_job(ctx, "c-cap", || {
        ctx.log_line("loading store");
        let mut store = crate::store::Store::load()?;
        ctx.log_line("capping degree");
        match store.cap_degree(50) {
            Ok((hubs, pruned)) => {
                store.save()?;
                ctx.log_line(&format!("{} hubs capped, {} edges pruned", hubs, pruned));
                Ok(())
            }
            Err(e) => Err(e),
        }
    })
}

/// Apply links extracted from digests.
fn job_digest_links(ctx: &ExecutionContext) -> Result<(), TaskError> {
    run_job(ctx, "c-digest-links", || {
        ctx.log_line("loading store");
        let mut store = crate::store::Store::load()?;
        ctx.log_line("applying digest links");
        let links = super::digest::parse_all_digest_links(&store);
        let (applied, skipped, fallbacks) = super::digest::apply_digest_links(&mut store, &links);
        store.save()?;
        ctx.log_line(&format!("{} applied, {} skipped, {} fallbacks", applied, skipped, fallbacks));
        Ok(())
    })
}

fn job_knowledge_loop(ctx: &ExecutionContext) -> Result<(), TaskError> {
    run_job(ctx, "knowledge-loop", || {
        let config = super::knowledge::KnowledgeLoopConfig {
            max_cycles: 100,
            batch_size: 5,
            ..Default::default()
        };
        ctx.log_line("running agents");
        let results = super::knowledge::run_knowledge_loop(&config)?;
        ctx.log_line(format!("{} cycles, {} actions",
            results.len(),
            results.iter().map(|r| r.total_applied).sum::<usize>()));
        Ok(())
    })
}

fn job_digest(ctx: &ExecutionContext) -> Result<(), TaskError> {
    run_job(ctx, "digest", || {
        ctx.log_line("loading store");
        let mut store = crate::store::Store::load()?;
        ctx.log_line("generating digests");
        crate::digest::digest_auto(&mut store)
    })
}

fn job_daily_check(
    ctx: &ExecutionContext,
    graph_health: &Arc<Mutex<Option<GraphHealth>>>,
) -> Result<(), TaskError> {
    let gh = Arc::clone(graph_health);
    run_job(ctx, "daily-check", || {
        ctx.log_line("loading store");
        let store = crate::store::Store::load()?;
        ctx.log_line("checking health");
        let _report = crate::neuro::daily_check(&store);

        // Decay search hit counters (10% daily decay)
        ctx.log_line("decaying search counters");
        match crate::counters::decay_all(0.9) {
            Ok(removed) => ctx.log_line(&format!("decayed counters, removed {}", removed)),
            Err(e) => ctx.log_line(&format!("counter decay failed: {}", e)),
        }

        // Compute graph health metrics for status display
        ctx.log_line("computing graph health");
        let health = compute_graph_health(&store);
        *gh.lock().unwrap() = Some(health);

        Ok(())
    })
}

fn compute_graph_health(store: &crate::store::Store) -> GraphHealth {
    let graph = store.build_graph();
    let snap = crate::graph::current_metrics(&graph);

    let episodic_count = store.nodes.iter()
        .filter(|(_, n)| matches!(n.node_type, crate::store::NodeType::EpisodicSession))
        .count();
    let episodic_ratio = if store.nodes.is_empty() { 0.0 }
        else { episodic_count as f32 / store.nodes.len() as f32 };

    // Use the same planning logic as consolidation (skip O(n²) interference)
    let plan = crate::neuro::consolidation_plan_quick(store);

    GraphHealth {
        nodes: snap.nodes,
        edges: snap.edges,
        communities: snap.communities,
        alpha: snap.alpha,
        gini: snap.gini,
        avg_cc: snap.avg_cc,
        sigma: snap.sigma,
        episodic_ratio,
        interference: 0,
        plan_replay: plan.replay_count,
        plan_linker: plan.linker_count,
        plan_separator: plan.separator_count,
        plan_transfer: plan.transfer_count,
        plan_rationale: plan.rationale,
        computed_at: crate::store::format_datetime_space(crate::store::now_epoch()),
    }
}

// --- Session detection ---

/// Find JSONL session files that are stale (not recently written) and not
/// held open by any process.
/// Find JSONL session files that haven't been written to recently.
/// Only checks metadata (stat), no file reads or subprocess calls.
/// The fuser check (is file open?) is deferred to the reconcile loop,
/// only for sessions that pass the mined-key filter.
/// Minimum file size for a session to be worth mining.
/// Daemon-spawned LLM calls are ~55KB/5 lines; real interactive
/// sessions are much larger. Skip anything too small to contain
/// meaningful conversation.
const MIN_SESSION_BYTES: u64 = 100_000;

fn find_stale_sessions() -> Vec<PathBuf> {
    let projects = crate::config::get().projects_dir.clone();
    if !projects.exists() {
        return Vec::new();
    }

    let mut stale = Vec::new();
    let now = SystemTime::now();

    let Ok(dirs) = fs::read_dir(&projects) else { return stale };
    for dir_entry in dirs.filter_map(|e| e.ok()) {
        if !dir_entry.path().is_dir() { continue; }
        let Ok(files) = fs::read_dir(dir_entry.path()) else { continue };
        for f in files.filter_map(|e| e.ok()) {
            let path = f.path();
            if path.extension().map(|x| x == "jsonl").unwrap_or(false) {
                if let Ok(meta) = path.metadata() {
                    // Skip tiny sessions (daemon-spawned LLM calls, aborted sessions)
                    if meta.len() < MIN_SESSION_BYTES { continue; }
                    if let Ok(mtime) = meta.modified() {
                        let age = now.duration_since(mtime).unwrap_or_default();
                        if age.as_secs() >= SESSION_STALE_SECS {
                            stale.push(path);
                        }
                    }
                }
            }
        }
    }
    stale
}

/// Check if any other process has a file open by scanning /proc/*/fd/.
/// This is what `fuser` does internally, without the subprocess overhead.
fn is_file_open(path: &Path) -> bool {
    let Ok(target) = path.canonicalize() else { return false };
    let Ok(procs) = fs::read_dir("/proc") else { return false };
    let my_pid = std::process::id().to_string();

    for proc_entry in procs.filter_map(|e| e.ok()) {
        let name = proc_entry.file_name();
        let name = name.to_string_lossy();
        if !name.chars().all(|c| c.is_ascii_digit()) { continue; }
        if *name == my_pid { continue; }

        let fd_dir = proc_entry.path().join("fd");
        let Ok(fds) = fs::read_dir(&fd_dir) else { continue };
        for fd in fds.filter_map(|e| e.ok()) {
            if let Ok(link) = fs::read_link(fd.path()) {
                if link == target { return true; }
            }
        }
    }
    false
}

/// Get process uptime as human-readable string by reading /proc/<pid>/stat.
fn proc_uptime(pid: u32) -> Option<String> {
    // /proc/<pid>/stat field 22 (1-indexed) is start time in clock ticks
    let stat = fs::read_to_string(format!("/proc/{}/stat", pid)).ok()?;
    // Fields after comm (which may contain spaces/parens) — find closing paren
    let after_comm = stat.get(stat.rfind(')')? + 2..)?;
    let fields: Vec<&str> = after_comm.split_whitespace().collect();
    // Field 22 in stat is index 19 after comm (fields[0] = state, field 22 = starttime = index 19)
    let start_ticks: u64 = fields.get(19)?.parse().ok()?;
    let ticks_per_sec = unsafe { libc::sysconf(libc::_SC_CLK_TCK) } as u64;
    let boot_time_secs = {
        let uptime = fs::read_to_string("/proc/uptime").ok()?;
        let sys_uptime: f64 = uptime.split_whitespace().next()?.parse().ok()?;
        let now = SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).ok()?.as_secs();
        now - sys_uptime as u64
    };
    let start_secs = boot_time_secs + start_ticks / ticks_per_sec;
    let now = SystemTime::now().duration_since(SystemTime::UNIX_EPOCH).ok()?.as_secs();
    let uptime = now.saturating_sub(start_secs);

    Some(format_duration_human(uptime as u128 * 1000))
}

// --- Status writing ---

fn write_status(
    choir: &Choir,
    last_daily: Option<chrono::NaiveDate>,
    graph_health: &Arc<Mutex<Option<GraphHealth>>>,
) {
    let status = build_status(choir, last_daily, graph_health);
    if let Ok(json) = serde_json::to_string_pretty(&status) {
        let _ = fs::write(status_path(), json);
    }
}

#[derive(Clone, Default, serde::Serialize, serde::Deserialize)]
pub struct GraphHealth {
    pub nodes: usize,
    pub edges: usize,
    pub communities: usize,
    pub alpha: f32,        // power-law exponent (target ≥2.5)
    pub gini: f32,         // degree inequality (target ≤0.4)
    pub avg_cc: f32,       // clustering coefficient (target ≥0.2)
    pub sigma: f32,        // small-world sigma
    pub episodic_ratio: f32, // episodic/total nodes (target <0.4)
    pub interference: usize, // interfering pairs (target <50)
    // Consolidation work estimate from plan
    pub plan_replay: usize,
    pub plan_linker: usize,
    pub plan_separator: usize,
    pub plan_transfer: usize,
    pub plan_rationale: Vec<String>,
    pub computed_at: String,
}

#[derive(serde::Serialize, serde::Deserialize)]
struct DaemonStatus {
    pid: u32,
    tasks: Vec<TaskInfo>,
    #[serde(default)]
    last_daily: Option<String>,
    #[serde(default)]
    graph_health: Option<GraphHealth>,
}

// --- The daemon ---

pub fn run_daemon() -> Result<(), String> {
    let choir = Choir::new();
    let llm_concurrency = crate::config::get().llm_concurrency;
    // Workers: 2 for long-running loops + llm_concurrency + 1 for non-LLM jobs
    let n_workers = llm_concurrency + 3;
    let names: Vec<String> = (0..n_workers).map(|i| format!("w{}", i)).collect();
    let _workers: Vec<_> = names.iter().map(|n| choir.add_worker(n)).collect();

    let llm = ResourcePool::new("llm", llm_concurrency);
    llm.bind(&choir);

    // Recover last_daily from previous status file
    let last_daily: Arc<Mutex<Option<chrono::NaiveDate>>> = Arc::new(Mutex::new(
        fs::read_to_string(status_path()).ok()
            .and_then(|s| serde_json::from_str::<DaemonStatus>(&s).ok())
            .and_then(|s| s.last_daily)
            .and_then(|d| d.parse().ok())
    ));

    let graph_health: Arc<Mutex<Option<GraphHealth>>> = Arc::new(Mutex::new(None));

    log_event("daemon", "started", &format!("pid {}", std::process::id()));
    eprintln!("poc-memory daemon started (pid {})", std::process::id());

    // Write initial status
    write_status(&choir, *last_daily.lock().unwrap(), &graph_health);

    // Session watcher: reconcile-based extraction
    // Each tick: scan filesystem for stale sessions, check store for what's
    // already mined, check task registry for what's in-flight, spawn the diff.
    // No persistent tracking state — the store is the source of truth.
    let choir_sw = Arc::clone(&choir);
    let llm_sw = Arc::clone(&llm);
    let last_daily_sw = Arc::clone(&last_daily);
    let graph_health_sw = Arc::clone(&graph_health);
    choir.spawn("session-watcher").init(move |ctx| {
        ctx.set_progress("idle");
        // Cache: path → (file_size, segment_count). Invalidated when size changes.
        let mut seg_cache: HashMap<String, (u64, usize)> = HashMap::new();
        // Retry backoff: filename → (next_retry_after, current_backoff).
        // Exponential from 5min, cap 30min. Resets on daemon restart.
        let mut retry_backoff: HashMap<String, (std::time::Instant, Duration)> = HashMap::new();

        const BACKOFF_INITIAL: Duration = Duration::from_secs(300);  // 5 min
        const BACKOFF_MAX: Duration = Duration::from_secs(1800);     // 30 min

        loop {
            if ctx.is_cancelled() {
                return Err(TaskError::Fatal("cancelled".into()));
            }

            ctx.set_progress("scanning");

            // Check for failed tasks and update backoff.
            // Task names are "extract:{filename}.{segment}" — extract the
            // filename (UUID.jsonl) by stripping the trailing .N segment suffix.
            let task_statuses = choir_sw.task_statuses();
            for t in &task_statuses {
                if let Some(label) = t.name.strip_prefix("extract:") {
                    // label is "UUID.jsonl.N" — strip last ".N" to get filename
                    let filename = match label.rfind('.') {
                        Some(pos) if label[pos+1..].chars().all(|c| c.is_ascii_digit()) => {
                            &label[..pos]
                        }
                        _ => label,
                    };
                    match t.status {
                        TaskStatus::Failed => {
                            let entry = retry_backoff.entry(filename.to_string())
                                .or_insert((std::time::Instant::now(), BACKOFF_INITIAL));
                            entry.1 = (entry.1 * 2).min(BACKOFF_MAX);
                            entry.0 = std::time::Instant::now() + entry.1;
                        }
                        TaskStatus::Completed => {
                            retry_backoff.remove(filename);
                        }
                        _ => {}
                    }
                }
            }

            // What's currently running/pending? (avoid spawning duplicates)
            let active: HashSet<String> = task_statuses.iter()
                .filter(|t| !t.status.is_finished())
                .map(|t| t.name.clone())
                .collect();

            let stale = find_stale_sessions();

            // Load mined transcript keys once for this tick
            let mined = super::enrich::mined_transcript_keys();

            const MAX_NEW_PER_TICK: usize = 10;

            // Load fact-mined keys too
            let fact_keys: HashSet<String> = {
                use crate::store::StoreView;
                let view = crate::store::AnyView::load().ok();
                view.map(|v| {
                    let mut keys = HashSet::new();
                    v.for_each_node(|key, _, _| {
                        if key.starts_with("_facts-") {
                            keys.insert(key.to_string());
                        }
                    });
                    keys
                }).unwrap_or_default()
            };

            let mut extract_queued = 0;
            let mut extract_remaining = 0;
            let mut fact_remaining = 0;
            let mut already_mined = 0;
            let mut still_open = 0;
            let mut backed_off = 0;
            let total_stale = stale.len();

            // Sessions with old whole-file keys that need per-segment migration
            let mut migrate_keys: Vec<(String, String, usize)> = Vec::new();

            let mut needs_extract: Vec<(String, String, Option<usize>)> = Vec::new();
            let mut needs_fact: Vec<(String, String)> = Vec::new();

            let now = std::time::Instant::now();

            for session in stale {
                let filename = session.file_name()
                    .map(|n| n.to_string_lossy().to_string())
                    .unwrap_or_else(|| "unknown".into());
                let path_str = session.to_string_lossy().to_string();

                // Check retry backoff before doing any work
                if let Some((next_retry, _)) = retry_backoff.get(&filename) {
                    if now < *next_retry {
                        backed_off += 1;
                        continue;
                    }
                }

                if is_file_open(&session) {
                    still_open += 1;
                    continue;
                }

                // Get file size for cache invalidation
                let file_size = fs::metadata(&session).map(|m| m.len()).unwrap_or(0);

                // Get segment count, using cache with size-based invalidation
                let seg_count = if let Some(&(cached_size, cached_count)) = seg_cache.get(&path_str) {
                    if cached_size == file_size {
                        cached_count
                    } else {
                        // File changed — re-parse
                        let messages = match super::enrich::extract_conversation(&path_str) {
                            Ok(m) => m,
                            Err(_) => continue,
                        };
                        let count = super::enrich::split_on_compaction(messages).len();
                        seg_cache.insert(path_str.clone(), (file_size, count));
                        count
                    }
                } else {
                    let messages = match super::enrich::extract_conversation(&path_str) {
                        Ok(m) => m,
                        Err(_) => continue,
                    };
                    let count = super::enrich::split_on_compaction(messages).len();
                    seg_cache.insert(path_str.clone(), (file_size, count));
                    count
                };

                // No extractable messages — skip entirely
                if seg_count == 0 {
                    already_mined += 1;
                    continue;
                }

                let fname_key = super::enrich::transcript_filename_key(&path_str);
                let has_whole_file_key = mined.contains(&fname_key);

                // Check per-segment keys, find unmined segments
                let mut unmined_segs: Vec<usize> = Vec::new();
                let mut has_any_seg_key = false;
                for i in 0..seg_count {
                    let seg_key = format!("{}.{}", fname_key, i);
                    if mined.contains(&seg_key) {
                        has_any_seg_key = true;
                    } else {
                        unmined_segs.push(i);
                    }
                }

                // Migrate old whole-file key: if it exists but no per-segment keys,
                // write per-segment keys for all current segments (they were mined
                // under the old scheme)
                if has_whole_file_key && !has_any_seg_key && seg_count > 0 {
                    migrate_keys.push((fname_key.clone(), path_str.clone(), seg_count));
                    // After migration, all current segments are covered
                    unmined_segs.clear();
                }

                if unmined_segs.is_empty() {
                    // All segments mined — check fact-mining
                    let fact_key = format!("_facts-{}", filename.trim_end_matches(".jsonl"));
                    if !fact_keys.contains(&fact_key) {
                        let task_name = format!("fact-mine:{}", filename);
                        if !active.contains(&task_name) {
                            needs_fact.push((filename, path_str));
                        }
                    } else {
                        already_mined += 1;
                    }
                } else {
                    // Queue unmined segments
                    for i in unmined_segs {
                        let task_name = format!("extract:{}.{}", filename, i);
                        if active.contains(&task_name) { continue; }
                        needs_extract.push((
                            format!("{}.{}", filename, i),
                            path_str.clone(),
                            Some(i),
                        ));
                    }
                }
            }

            // Migrate old whole-file keys to per-segment keys
            if !migrate_keys.is_empty() {
                match crate::store::Store::load() {
                    Ok(mut store) => {
                        let mut ok = 0;
                        let mut fail = 0;
                        for (fname_key, path_str, seg_count) in &migrate_keys {
                            for i in 0..*seg_count {
                                let seg_key = format!("{}.{}", fname_key, i);
                                let content = format!("Migrated from whole-file key for {}", path_str);
                                let mut node = crate::store::new_node(&seg_key, &content);
                                node.provenance = "experience-mine:write".to_string();
                                match store.upsert_node(node) {
                                    Ok(()) => ok += 1,
                                    Err(e) => {
                                        if fail == 0 {
                                            eprintln!("migration upsert_node error: {}", e);
                                        }
                                        fail += 1;
                                    }
                                }
                            }
                        }
                        if let Err(e) = store.save() {
                            eprintln!("migration save error: {}", e);
                        }
                        log_event("session-watcher", "migrated",
                            &format!("{} whole-file keys → per-segment ({} ok, {} fail)",
                                migrate_keys.len(), ok, fail));
                    }
                    Err(e) => {
                        eprintln!("migration store load error: {}", e);
                    }
                }
            }

            // Spawn experience-mine jobs (priority)
            for (task_label, path_str, segment) in &needs_extract {
                if extract_queued >= MAX_NEW_PER_TICK {
                    extract_remaining += 1;
                    continue;
                }
                let task_name = format!("extract:{}", task_label);
                log_event("extract", "queued", &task_name);
                let path = path_str.clone();
                let seg = *segment;
                choir_sw.spawn(task_name)
                    .resource(&llm_sw)
                    .retries(2)
                    .init(move |ctx| {
                        job_experience_mine(ctx, &path, seg)
                    });
                extract_queued += 1;
            }

            // Only queue fact-mine when experience backlog is clear
            needs_fact.sort_by_key(|(_, path_str)| {
                fs::metadata(path_str).map(|m| m.len()).unwrap_or(u64::MAX)
            });
            let mut fact_queued = 0;
            if needs_extract.len() == extract_queued {
                let fact_budget = MAX_NEW_PER_TICK.saturating_sub(extract_queued);
                for (filename, path_str) in &needs_fact {
                    if fact_queued >= fact_budget {
                        fact_remaining += 1;
                        continue;
                    }
                    let task_name = format!("fact-mine:{}", filename);
                    log_event("fact-mine", "queued", path_str);
                    let path = path_str.clone();
                    choir_sw.spawn(task_name)
                        .resource(&llm_sw)
                        .retries(1)
                        .init(move |ctx| {
                            job_fact_mine(ctx, &path)
                        });
                    fact_queued += 1;
                }
            } else {
                fact_remaining = needs_fact.len();
            }

            let extract_pending = extract_queued + extract_remaining;
            let fact_pending = fact_queued + fact_remaining;
            if extract_pending > 0 || fact_pending > 0 || still_open > 0 || backed_off > 0 {
                log_event("session-watcher", "tick",
                    &format!("{} stale, {} mined, {} extract, {} fact, {} open, {} backoff",
                        total_stale, already_mined, extract_pending, fact_pending, still_open, backed_off));
                let mut parts = Vec::new();
                if extract_pending > 0 { parts.push(format!("{} extract", extract_pending)); }
                if fact_pending > 0 { parts.push(format!("{} fact", fact_pending)); }
                if still_open > 0 { parts.push(format!("{} open", still_open)); }
                if backed_off > 0 { parts.push(format!("{} backoff", backed_off)); }
                ctx.set_progress(parts.join(", "));
            } else {
                ctx.set_progress("idle");
            }

            write_status(&choir_sw, *last_daily_sw.lock().unwrap(), &graph_health_sw);
            std::thread::sleep(SCHEDULER_INTERVAL);
        }
    });

    // Scheduler: runs daily jobs based on filesystem state
    let choir_sched = Arc::clone(&choir);
    let llm_sched = Arc::clone(&llm);
    let last_daily_sched = Arc::clone(&last_daily);
    let graph_health_sched = Arc::clone(&graph_health);
    choir.spawn("scheduler").init(move |ctx| {
        let mut last_health = std::time::Instant::now() - HEALTH_INTERVAL;
        ctx.set_progress("idle");

        loop {
            if ctx.is_cancelled() {
                return Err(TaskError::Fatal("cancelled".into()));
            }

            let today = chrono::Local::now().date_naive();

            // Health check: every hour — also updates graph health metrics
            if last_health.elapsed() >= HEALTH_INTERVAL {
                let gh = Arc::clone(&graph_health_sched);
                choir_sched.spawn("health").init(move |ctx| {
                    job_daily_check(ctx, &gh)
                });
                last_health = std::time::Instant::now();
            }

            // Daily jobs: once per day (wait for health check to cache metrics first)
            let last = *last_daily_sched.lock().unwrap();
            let gh = graph_health_sched.lock().unwrap().clone();
            if last.is_none_or(|d| d < today) && gh.is_some() {
                log_event("scheduler", "daily-trigger", &today.to_string());

                // Use cached graph health plan (from consolidation_plan_quick).
                let h = gh.as_ref().unwrap(); // guarded by gh.is_some() above
                let plan = crate::neuro::ConsolidationPlan {
                    replay_count: h.plan_replay,
                    linker_count: h.plan_linker,
                    separator_count: h.plan_separator,
                    transfer_count: h.plan_transfer,
                    run_health: true,
                    rationale: Vec::new(),
                };
                let runs = plan.to_agent_runs(5);

                log_event("scheduler", "consolidation-plan",
                    &format!("{} agents ({}r {}l {}s {}t)",
                        runs.len(), h.plan_replay, h.plan_linker,
                        h.plan_separator, h.plan_transfer));

                // Phase 1: Agent runs (sequential — each reloads store to see prior changes)
                let mut prev_agent = None;
                for (i, (agent_type, batch)) in runs.iter().enumerate() {
                    let agent = agent_type.to_string();
                    let b = *batch;
                    let task_name = format!("c-{}-{}:{}", agent, i, today);
                    let mut builder = choir_sched.spawn(task_name)
                        .resource(&llm_sched)
                        .retries(1)
                        .init(move |ctx| {
                            job_consolidation_agent(ctx, &agent, b)
                        });
                    if let Some(ref dep) = prev_agent {
                        builder.depend_on(dep);
                    }
                    prev_agent = Some(builder.run());
                }

                // Phase 2: Link orphans (CPU-only, no LLM)
                let mut orphans = choir_sched.spawn(format!("c-orphans:{}", today))
                    .retries(1)
                    .init(move |ctx| job_link_orphans(ctx));
                if let Some(ref dep) = prev_agent {
                    orphans.depend_on(dep);
                }
                let orphans = orphans.run();

                // Phase 3: Cap degree
                let mut cap = choir_sched.spawn(format!("c-cap:{}", today))
                    .retries(1)
                    .init(move |ctx| job_cap_degree(ctx));
                cap.depend_on(&orphans);
                let cap = cap.run();

                // Phase 4: Generate digests
                let mut digest = choir_sched.spawn(format!("c-digest:{}", today))
                    .resource(&llm_sched)
                    .retries(1)
                    .init(move |ctx| job_digest(ctx));
                digest.depend_on(&cap);
                let digest = digest.run();

                // Phase 5: Apply digest links
                let mut digest_links = choir_sched.spawn(format!("c-digest-links:{}", today))
                    .retries(1)
                    .init(move |ctx| job_digest_links(ctx));
                digest_links.depend_on(&digest);
                let digest_links = digest_links.run();

                // Phase 7: Knowledge loop
                let mut knowledge = choir_sched.spawn(format!("c-knowledge:{}", today))
                    .resource(&llm_sched)
                    .retries(1)
                    .init(move |ctx| job_knowledge_loop(ctx));
                knowledge.depend_on(&digest_links);

                *last_daily_sched.lock().unwrap() = Some(today);
                ctx.set_progress(format!("daily pipeline triggered ({today})"));
            }

            // Prune finished tasks from registry
            let pruned = choir_sched.gc_finished();
            if pruned > 0 {
                log::trace!("pruned {} finished tasks", pruned);
            }

            write_status(&choir_sched, *last_daily_sched.lock().unwrap(), &graph_health_sched);
            std::thread::sleep(SCHEDULER_INTERVAL);
        }
    });

    // Main thread: listen on status socket + wait for signals
    let choir_main = Arc::clone(&choir);
    let last_daily_main = Arc::clone(&last_daily);
    let graph_health_main = Arc::clone(&graph_health);
    status_socket_loop(&choir_main, &last_daily_main, &graph_health_main, &llm);

    log_event("daemon", "stopping", "");
    eprintln!("Shutting down...");

    // Clean up socket
    let _ = fs::remove_file(status_sock_path());

    log_event("daemon", "stopped", "");

    // Exit immediately — PR_SET_PDEATHSIG on child processes ensures
    // claude subprocesses get SIGTERM when we die.
    std::process::exit(0)
}

fn send_rpc(cmd: &str) -> Option<String> {
    use std::io::{Read as _, Write as _};
    use std::os::unix::net::UnixStream;

    let mut stream = UnixStream::connect(status_sock_path()).ok()?;
    stream.set_read_timeout(Some(Duration::from_secs(5))).ok();
    stream.write_all(cmd.as_bytes()).ok()?;
    stream.shutdown(std::net::Shutdown::Write).ok()?;
    let mut buf = String::new();
    stream.read_to_string(&mut buf).ok()?;
    Some(buf)
}

pub fn rpc_consolidate() -> Result<(), String> {
    match send_rpc("consolidate") {
        Some(resp) => {
            println!("{}", resp.trim());
            Ok(())
        }
        None => Err("Daemon not running.".into()),
    }
}

/// Record search hits for the given keys (fire-and-forget from memory-search).
pub fn rpc_record_hits(keys: &[&str]) -> Result<(), String> {
    if keys.is_empty() { return Ok(()); }
    let cmd = format!("record-hits {}", keys.join("\t"));
    match send_rpc(&cmd) {
        Some(_) => Ok(()),
        None => Err("Daemon not running.".into()),
    }
}

pub fn rpc_run_agent(agent: &str, count: usize) -> Result<(), String> {
    let cmd = format!("run-agent {} {}", agent, count);
    match send_rpc(&cmd) {
        Some(resp) => {
            println!("{}", resp.trim());
            Ok(())
        }
        None => Err("Daemon not running.".into()),
    }
}

fn read_status_socket() -> Option<DaemonStatus> {
    use std::io::Read as _;
    use std::os::unix::net::UnixStream;

    let mut stream = UnixStream::connect(status_sock_path()).ok()?;
    stream.set_read_timeout(Some(Duration::from_secs(2))).ok();
    let mut buf = String::new();
    stream.read_to_string(&mut buf).ok()?;
    serde_json::from_str(&buf).ok()
}

fn status_sock_path() -> PathBuf {
    crate::config::get().data_dir.join("daemon.sock")
}

/// Listen on a Unix domain socket for status requests.
/// Any connection gets the live status JSON written and closed.
/// Also handles SIGINT/SIGTERM for clean shutdown.
fn status_socket_loop(
    choir: &Arc<Choir>,
    last_daily: &Arc<Mutex<Option<chrono::NaiveDate>>>,
    graph_health: &Arc<Mutex<Option<GraphHealth>>>,
    llm: &Arc<ResourcePool>,
) {
    use std::io::{Read as _, Write as _};
    use std::os::unix::net::UnixListener;
    use std::sync::atomic::{AtomicBool, Ordering};

    static STOP: AtomicBool = AtomicBool::new(false);

    unsafe {
        libc::signal(libc::SIGINT, handle_signal as libc::sighandler_t);
        libc::signal(libc::SIGTERM, handle_signal as libc::sighandler_t);
    }

    let sock_path = status_sock_path();
    let _ = fs::remove_file(&sock_path); // clean up stale socket

    let listener = match UnixListener::bind(&sock_path) {
        Ok(l) => l,
        Err(e) => {
            eprintln!("Warning: couldn't bind status socket {}: {}", sock_path.display(), e);
            // Fall back to just waiting for signals
            while !STOP.load(Ordering::Acquire) {
                std::thread::sleep(Duration::from_millis(500));
            }
            return;
        }
    };

    // Non-blocking so we can check STOP flag
    listener.set_nonblocking(true).ok();

    while !STOP.load(Ordering::Acquire) {
        match listener.accept() {
            Ok((mut stream, _)) => {
                // Read command from client (with short timeout)
                stream.set_read_timeout(Some(Duration::from_millis(100))).ok();
                let mut cmd = String::new();
                let _ = stream.read_to_string(&mut cmd);
                let cmd = cmd.trim().to_string();

                match cmd.as_str() {
                    "consolidate" => {
                        *last_daily.lock().unwrap() = None;
                        let _ = stream.write_all(b"{\"ok\":true,\"action\":\"consolidation scheduled\"}\n");
                        log_event("rpc", "consolidate", "triggered via socket");
                    }
                    cmd if cmd.starts_with("record-hits ") => {
                        let keys: Vec<&str> = cmd.strip_prefix("record-hits ")
                            .unwrap_or("")
                            .split('\t')
                            .filter(|k| !k.is_empty())
                            .collect();
                        if keys.is_empty() {
                            let _ = stream.write_all(b"{\"ok\":false,\"error\":\"no keys\"}\n");
                        } else {
                            let n = keys.len();
                            match crate::counters::record_search_hits(&keys) {
                                Ok(()) => {
                                    let msg = format!("{{\"ok\":true,\"recorded\":{}}}\n", n);
                                    let _ = stream.write_all(msg.as_bytes());
                                }
                                Err(e) => {
                                    let msg = format!("{{\"ok\":false,\"error\":\"{}\"}}\n",
                                        e.replace('"', "'"));
                                    let _ = stream.write_all(msg.as_bytes());
                                }
                            }
                        }
                    }
                    cmd if cmd.starts_with("run-agent ") => {
                        let parts: Vec<&str> = cmd.splitn(3, ' ').collect();
                        let agent_type = parts.get(1).unwrap_or(&"replay");
                        let count: usize = parts.get(2)
                            .and_then(|s| s.parse().ok())
                            .unwrap_or(1);
                        let batch_size = 5;

                        let today = chrono::Local::now().format("%Y-%m-%d");
                        let ts = chrono::Local::now().format("%H%M%S");
                        let mut prev = None;
                        let mut spawned = 0;
                        let mut remaining = count;

                        let is_rename = *agent_type == "rename";
                        let is_split = *agent_type == "split";

                        if is_split {
                            // Split: load candidates upfront, spawn independent
                            // parallel tasks — one per node, no dependencies.
                            let store = crate::store::Store::load().ok();
                            let candidates = store.as_ref()
                                .map(|s| super::prompts::split_candidates(s))
                                .unwrap_or_default();
                            let to_split: Vec<String> = candidates.into_iter()
                                .take(count)
                                .collect();
                            for key in &to_split {
                                let key = key.clone();
                                let task_name = format!("c-split-{}:{}", key, today);
                                choir.spawn(task_name)
                                    .resource(llm)
                                    .retries(1)
                                    .init(move |ctx| {
                                        job_split_one(ctx, key.clone())
                                    })
                                    .run();
                                spawned += 1;
                            }
                            remaining = 0;
                        }

                        while remaining > 0 {
                            let batch = remaining.min(batch_size);
                            let agent = agent_type.to_string();
                            let task_name = format!("c-{}-rpc{}:{}", agent, ts, today);
                            let mut builder = choir.spawn(task_name)
                                .resource(llm)
                                .retries(1)
                                .init(move |ctx| {
                                    if is_rename {
                                        job_rename_agent(ctx, batch)
                                    } else {
                                        job_consolidation_agent(ctx, &agent, batch)
                                    }
                                });
                            if let Some(ref dep) = prev {
                                builder.depend_on(dep);
                            }
                            prev = Some(builder.run());
                            remaining -= batch;
                            spawned += 1;
                        }

                        let msg = format!("{{\"ok\":true,\"action\":\"queued {} {} run(s) ({} tasks)\"}}\n",
                            count, agent_type, spawned);
                        let _ = stream.write_all(msg.as_bytes());
                        log_event("rpc", "run-agent",
                            &format!("{} x{}", agent_type, count));
                    }
                    _ => {
                        // Default: return status
                        let status = build_status(choir, *last_daily.lock().unwrap(), graph_health);
                        if let Ok(json) = serde_json::to_string_pretty(&status) {
                            let _ = stream.write_all(json.as_bytes());
                        }
                    }
                }
                // Connection closes when stream is dropped
            }
            Err(ref e) if e.kind() == std::io::ErrorKind::WouldBlock => {
                std::thread::sleep(Duration::from_millis(100));
            }
            Err(_) => {
                std::thread::sleep(Duration::from_millis(100));
            }
        }
    }

    extern "C" fn handle_signal(_: libc::c_int) {
        STOP.store(true, std::sync::atomic::Ordering::Release);
    }
}

fn build_status(
    choir: &Choir,
    last_daily: Option<chrono::NaiveDate>,
    graph_health: &Arc<Mutex<Option<GraphHealth>>>,
) -> DaemonStatus {
    DaemonStatus {
        pid: std::process::id(),
        tasks: choir.task_statuses(),
        last_daily: last_daily.map(|d| d.to_string()),
        graph_health: graph_health.lock().unwrap().clone(),
    }
}

// --- Status display ---

fn format_duration_human(ms: u128) -> String {
    if ms < 1_000 {
        format!("{}ms", ms)
    } else if ms < 60_000 {
        format!("{:.1}s", ms as f64 / 1000.0)
    } else if ms < 3_600_000 {
        format!("{:.0}m{:.0}s", ms / 60_000, (ms % 60_000) / 1000)
    } else {
        format!("{:.0}h{:.0}m", ms / 3_600_000, (ms % 3_600_000) / 60_000)
    }
}

fn task_group(name: &str) -> &str {
    if name == "session-watcher" || name == "scheduler" { "core" }
    else if name.starts_with("extract:") || name.starts_with("fact-mine:") { "extract" }
    else if name.starts_with("c-") || name.starts_with("consolidate:")
        || name.starts_with("knowledge-loop:") || name.starts_with("digest:")
        || name.starts_with("decay:") { "daily" }
    else if name == "health" { "health" }
    else { "other" }
}

/// Compute elapsed time for a task, using absolute started_at if available.
fn task_elapsed(t: &TaskInfo) -> Duration {
    if matches!(t.status, TaskStatus::Running) {
        if let Some(started) = t.started_at {
            let now = SystemTime::now()
                .duration_since(SystemTime::UNIX_EPOCH)
                .unwrap_or_default()
                .as_secs_f64();
            Duration::from_secs_f64((now - started).max(0.0))
        } else {
            t.elapsed
        }
    } else {
        t.result.as_ref()
            .map(|r| r.duration)
            .unwrap_or(t.elapsed)
    }
}

fn status_symbol(t: &TaskInfo) -> &'static str {
    if t.cancelled { return "✗" }
    match t.status {
        TaskStatus::Running => "▶",
        TaskStatus::Completed => "✓",
        TaskStatus::Failed => "✗",
        TaskStatus::Pending => "·",
    }
}

/// Shorten a job name for display: "experience-mine /long/path/uuid.jsonl" → "experience-mine uuid…"
fn short_job_name(job: &str) -> String {
    // Split "verb path" or just return as-is
    if let Some((verb, path)) = job.split_once(' ') {
        let file = path.rsplit('/').next().unwrap_or(path);
        let file = file.strip_suffix(".jsonl").unwrap_or(file);
        let short = if file.len() > 12 { &file[..12] } else { file };
        format!("{} {}", verb, short)
    } else {
        job.to_string()
    }
}

fn show_recent_completions(n: usize) {
    let path = log_path();
    let content = match fs::read_to_string(&path) {
        Ok(c) => c,
        Err(_) => return,
    };

    let recent: Vec<&str> = content.lines().rev()
        .filter(|line| {
            line.contains("\"event\":\"completed\"") || line.contains("\"event\":\"failed\"")
        })
        .take(n)
        .collect();

    if recent.is_empty() { return; }

    eprintln!("  Recent:");
    for line in recent.iter().rev() {
        if let Ok(obj) = serde_json::from_str::<serde_json::Value>(line) {
            let ts = obj.get("ts").and_then(|v| v.as_str()).unwrap_or("?");
            let job = obj.get("job").and_then(|v| v.as_str()).unwrap_or("?");
            let event = obj.get("event").and_then(|v| v.as_str()).unwrap_or("?");
            let detail = obj.get("detail").and_then(|v| v.as_str()).unwrap_or("");

            let time = if ts.len() >= 19 { &ts[11..19] } else { ts };
            let sym = if event == "completed" { "✓" } else { "✗" };
            let name = short_job_name(job);

            eprintln!("    {} {}  {:30} {}", sym, time, name, detail);
        }
    }
    eprintln!();
}

pub fn show_status() -> Result<(), String> {
    let status = match read_status_socket() {
        Some(s) => s,
        None => {
            eprintln!("Daemon not running.");
            return Ok(());
        }
    };

    let uptime_str = proc_uptime(status.pid).unwrap_or_default();
    if uptime_str.is_empty() {
        eprintln!("poc-memory daemon  pid={}", status.pid);
    } else {
        eprintln!("poc-memory daemon  pid={}  uptime {}", status.pid, uptime_str);
    }

    if status.tasks.is_empty() {
        eprintln!("\n  No tasks");
        return Ok(());
    }

    // Count by status
    let running = status.tasks.iter().filter(|t| matches!(t.status, TaskStatus::Running)).count();
    let pending = status.tasks.iter().filter(|t| matches!(t.status, TaskStatus::Pending)).count();
    let completed = status.tasks.iter().filter(|t| matches!(t.status, TaskStatus::Completed)).count();
    let failed = status.tasks.iter().filter(|t| matches!(t.status, TaskStatus::Failed)).count();
    eprintln!("  tasks: {} running, {} pending, {} done, {} failed",
        running, pending, completed, failed);

    // Graph health
    if let Some(ref gh) = status.graph_health {
        eprintln!();
        fn indicator(val: f32, target: f32, higher_is_better: bool) -> &'static str {
            let ok = if higher_is_better { val >= target } else { val <= target };
            if ok { "✓" } else { "✗" }
        }
        eprintln!("  Graph health ({})", gh.computed_at);
        eprintln!("    {} nodes, {} edges, {} communities",
            gh.nodes, gh.edges, gh.communities);
        eprintln!("    {} α={:.2} (≥2.5)  {} gini={:.3} (≤0.4)  {} cc={:.3} (≥0.2)",
            indicator(gh.alpha, 2.5, true), gh.alpha,
            indicator(gh.gini, 0.4, false), gh.gini,
            indicator(gh.avg_cc, 0.2, true), gh.avg_cc);
        eprintln!("    {} episodic={:.0}% (<40%)  σ={:.1}",
            indicator(gh.episodic_ratio, 0.4, false), gh.episodic_ratio * 100.0,
            gh.sigma);

        let total = gh.plan_replay + gh.plan_linker + gh.plan_separator + gh.plan_transfer + 1;
        eprintln!("    consolidation plan: {} agents ({}r {}l {}s {}t +health)",
            total, gh.plan_replay, gh.plan_linker, gh.plan_separator, gh.plan_transfer);
    }
    eprintln!();

    // Group and display
    let groups: &[(&str, &str)] = &[
        ("core", "Core"),
        ("daily", "Daily pipeline"),
        ("extract", "Session extraction"),
        ("health", "Health"),
        ("other", "Other"),
    ];

    // In-flight tasks first (running + pending)
    let in_flight: Vec<&TaskInfo> = status.tasks.iter()
        .filter(|t| matches!(t.status, TaskStatus::Running | TaskStatus::Pending))
        .collect();

    if !in_flight.is_empty() {
        eprintln!("  In flight:");
        for t in &in_flight {
            let sym = status_symbol(t);
            let e = task_elapsed(t);
            let elapsed = if !e.is_zero() {
                format!("  {}", format_duration_human(e.as_millis()))
            } else {
                String::new()
            };
            let progress = t.progress.as_deref()
                .filter(|p| *p != "idle")
                .map(|p| format!("  {}", p))
                .unwrap_or_default();
            let name = short_job_name(&t.name);
            eprintln!("    {} {:30}{}{}", sym, name, elapsed, progress);
            if matches!(t.status, TaskStatus::Running) && !t.output_log.is_empty() {
                let skip = t.output_log.len().saturating_sub(3);
                for line in &t.output_log[skip..] {
                    eprintln!("      │ {}", line);
                }
            }
        }
        eprintln!();
    }

    // Recent completions from log file
    show_recent_completions(20);

    // Detailed group view only if there are failures worth showing
    for (group_id, group_label) in groups {
        let tasks: Vec<&TaskInfo> = status.tasks.iter()
            .filter(|t| task_group(&t.name) == *group_id)
            .collect();
        if tasks.is_empty() { continue; }

        // For extract group, show summary instead of individual tasks
        if *group_id == "extract" {
            let n_pending = tasks.iter().filter(|t| matches!(t.status, TaskStatus::Pending)).count();
            let n_running = tasks.iter().filter(|t| matches!(t.status, TaskStatus::Running)).count();
            let n_done = tasks.iter().filter(|t| matches!(t.status, TaskStatus::Completed)).count();
            let n_failed = tasks.iter().filter(|t| matches!(t.status, TaskStatus::Failed)).count();
            eprintln!("  {}  ({} total)", group_label, tasks.len());

            if n_running > 0 {
                for t in tasks.iter().filter(|t| matches!(t.status, TaskStatus::Running)) {
                    let e = task_elapsed(t);
                    let elapsed = if !e.is_zero() {
                        format!("  ({})", format_duration_human(e.as_millis()))
                    } else {
                        String::new()
                    };
                    let progress = t.progress.as_deref().map(|p| format!("  {}", p)).unwrap_or_default();
                    eprintln!("    {} {}{}{}", status_symbol(t), t.name, elapsed, progress);
                    if !t.output_log.is_empty() {
                        let skip = t.output_log.len().saturating_sub(3);
                        for line in &t.output_log[skip..] {
                            eprintln!("      │ {}", line);
                        }
                    }
                }
            }
            let mut parts = Vec::new();
            if n_done > 0 { parts.push(format!("{} done", n_done)); }
            if n_running > 0 { parts.push(format!("{} running", n_running)); }
            if n_pending > 0 { parts.push(format!("{} queued", n_pending)); }
            if n_failed > 0 { parts.push(format!("{} FAILED", n_failed)); }
            eprintln!("    {}", parts.join(", "));

            // Show recent failures
            for t in tasks.iter().filter(|t| matches!(t.status, TaskStatus::Failed)).take(3) {
                if let Some(ref r) = t.result {
                    if let Some(ref err) = r.error {
                        let short = if err.len() > 80 { &err[..80] } else { err };
                        eprintln!("    ✗ {}: {}", t.name, short);
                    }
                }
            }
            eprintln!();
            continue;
        }

        eprintln!("  {}", group_label);
        for t in &tasks {
            let sym = status_symbol(t);
            let e = task_elapsed(t);
            let duration = if !e.is_zero() {
                format_duration_human(e.as_millis())
            } else {
                String::new()
            };

            let retry = if t.max_retries > 0 && t.retry_count > 0 {
                format!(" retry {}/{}", t.retry_count, t.max_retries)
            } else {
                String::new()
            };

            let detail = if matches!(t.status, TaskStatus::Failed) {
                t.result.as_ref()
                    .and_then(|r| r.error.as_ref())
                    .map(|e| {
                        let short = if e.len() > 60 { &e[..60] } else { e };
                        format!("  err: {}", short)
                    })
                    .unwrap_or_default()
            } else {
                String::new()
            };

            if duration.is_empty() {
                eprintln!("    {} {:30}{}{}", sym, t.name, retry, detail);
            } else {
                eprintln!("    {} {:30} {:>8}{}{}", sym, t.name, duration, retry, detail);
            }

            // Show output log tail for running tasks
            if matches!(t.status, TaskStatus::Running) && !t.output_log.is_empty() {
                let skip = t.output_log.len().saturating_sub(5);
                for line in &t.output_log[skip..] {
                    eprintln!("      │ {}", line);
                }
            }
        }
        eprintln!();
    }

    Ok(())
}

pub fn install_service() -> Result<(), String> {
    let exe = std::env::current_exe()
        .map_err(|e| format!("current_exe: {}", e))?;
    let home = std::env::var("HOME").map_err(|e| format!("HOME: {}", e))?;

    let unit_dir = PathBuf::from(&home).join(".config/systemd/user");
    fs::create_dir_all(&unit_dir)
        .map_err(|e| format!("create {}: {}", unit_dir.display(), e))?;

    let unit = format!(
r#"[Unit]
Description=poc-memory daemon — background memory maintenance
After=default.target

[Service]
Type=simple
ExecStart={exe} daemon
Restart=on-failure
RestartSec=30
Environment=HOME={home}
Environment=PATH={home}/.cargo/bin:{home}/.local/bin:{home}/bin:/usr/local/bin:/usr/bin:/bin

[Install]
WantedBy=default.target
"#, exe = exe.display(), home = home);

    let unit_path = unit_dir.join("poc-memory.service");
    fs::write(&unit_path, &unit)
        .map_err(|e| format!("write {}: {}", unit_path.display(), e))?;
    eprintln!("Wrote {}", unit_path.display());

    let status = std::process::Command::new("systemctl")
        .args(["--user", "daemon-reload"])
        .status()
        .map_err(|e| format!("systemctl daemon-reload: {}", e))?;
    if !status.success() {
        return Err("systemctl daemon-reload failed".into());
    }

    let status = std::process::Command::new("systemctl")
        .args(["--user", "enable", "--now", "poc-memory"])
        .status()
        .map_err(|e| format!("systemctl enable: {}", e))?;
    if !status.success() {
        return Err("systemctl enable --now failed".into());
    }

    eprintln!("Service enabled and started");

    // Install poc-daemon service
    install_notify_daemon(&unit_dir, &home)?;

    // Install memory-search + poc-hook into Claude settings
    install_hook()?;

    Ok(())
}

/// Install the poc-daemon (notification/idle) systemd user service.
fn install_notify_daemon(unit_dir: &Path, home: &str) -> Result<(), String> {
    let poc_daemon = PathBuf::from(home).join(".cargo/bin/poc-daemon");
    if !poc_daemon.exists() {
        eprintln!("Warning: poc-daemon not found at {} — skipping service install", poc_daemon.display());
        eprintln!("  Build with: cargo install --path .");
        return Ok(());
    }

    let unit = format!(
r#"[Unit]
Description=poc-daemon — notification routing and idle management
After=default.target

[Service]
Type=simple
ExecStart={exe} daemon
Restart=on-failure
RestartSec=10
Environment=HOME={home}
Environment=PATH={home}/.cargo/bin:{home}/.local/bin:{home}/bin:/usr/local/bin:/usr/bin:/bin

[Install]
WantedBy=default.target
"#, exe = poc_daemon.display(), home = home);

    let unit_path = unit_dir.join("poc-daemon.service");
    fs::write(&unit_path, &unit)
        .map_err(|e| format!("write {}: {}", unit_path.display(), e))?;
    eprintln!("Wrote {}", unit_path.display());

    let status = std::process::Command::new("systemctl")
        .args(["--user", "daemon-reload"])
        .status()
        .map_err(|e| format!("systemctl daemon-reload: {}", e))?;
    if !status.success() {
        return Err("systemctl daemon-reload failed".into());
    }

    let status = std::process::Command::new("systemctl")
        .args(["--user", "enable", "--now", "poc-daemon"])
        .status()
        .map_err(|e| format!("systemctl enable: {}", e))?;
    if !status.success() {
        return Err("systemctl enable --now poc-daemon failed".into());
    }

    eprintln!("poc-daemon service enabled and started");
    Ok(())
}

/// Install memory-search and poc-hook into Claude Code settings.json.
/// Public so `poc-memory init` can call it too.
///
/// Hook layout:
///   UserPromptSubmit: memory-search (10s), poc-hook (5s)
///   PostToolUse:      poc-hook (5s)
///   Stop:             poc-hook (5s)
pub fn install_hook() -> Result<(), String> {
    let home = std::env::var("HOME").map_err(|e| format!("HOME: {}", e))?;
    let exe = std::env::current_exe()
        .map_err(|e| format!("current_exe: {}", e))?;
    let settings_path = PathBuf::from(&home).join(".claude/settings.json");

    let memory_search = exe.with_file_name("memory-search");
    let poc_hook = exe.with_file_name("poc-hook");

    let mut settings: serde_json::Value = if settings_path.exists() {
        let content = fs::read_to_string(&settings_path)
            .map_err(|e| format!("read settings: {}", e))?;
        serde_json::from_str(&content)
            .map_err(|e| format!("parse settings: {}", e))?
    } else {
        serde_json::json!({})
    };

    let obj = settings.as_object_mut().ok_or("settings not an object")?;
    let hooks_obj = obj.entry("hooks")
        .or_insert_with(|| serde_json::json!({}))
        .as_object_mut().ok_or("hooks not an object")?;

    let mut changed = false;

    // Helper: ensure a hook binary is present in an event's hook list
    let ensure_hook = |hooks_obj: &mut serde_json::Map<String, serde_json::Value>,
                       event: &str,
                       binary: &Path,
                       timeout: u32,
                       changed: &mut bool| {
        if !binary.exists() {
            eprintln!("Warning: {} not found — skipping", binary.display());
            return;
        }
        let cmd = binary.to_string_lossy().to_string();
        let name = binary.file_name().unwrap().to_string_lossy().to_string();

        let event_array = hooks_obj.entry(event)
            .or_insert_with(|| serde_json::json!([{"hooks": []}]))
            .as_array_mut().unwrap();
        if event_array.is_empty() {
            event_array.push(serde_json::json!({"hooks": []}));
        }
        let inner = event_array[0]
            .as_object_mut().unwrap()
            .entry("hooks")
            .or_insert_with(|| serde_json::json!([]))
            .as_array_mut().unwrap();

        // Remove legacy load-memory.sh
        let before = inner.len();
        inner.retain(|h| {
            let c = h.get("command").and_then(|c| c.as_str()).unwrap_or("");
            !c.contains("load-memory")
        });
        if inner.len() < before {
            eprintln!("Removed load-memory.sh from {event}");
            *changed = true;
        }

        let already = inner.iter().any(|h| {
            h.get("command").and_then(|c| c.as_str())
                .is_some_and(|c| c.contains(&name))
        });

        if !already {
            inner.push(serde_json::json!({
                "type": "command",
                "command": cmd,
                "timeout": timeout
            }));
            *changed = true;
            eprintln!("Installed {name} in {event}");
        }
    };

    // UserPromptSubmit: memory-search + poc-hook
    ensure_hook(hooks_obj, "UserPromptSubmit", &memory_search, 10, &mut changed);
    ensure_hook(hooks_obj, "UserPromptSubmit", &poc_hook, 5, &mut changed);

    // PostToolUse + Stop: poc-hook only
    ensure_hook(hooks_obj, "PostToolUse", &poc_hook, 5, &mut changed);
    ensure_hook(hooks_obj, "Stop", &poc_hook, 5, &mut changed);

    if changed {
        let json = serde_json::to_string_pretty(&settings)
            .map_err(|e| format!("serialize settings: {}", e))?;
        fs::write(&settings_path, json)
            .map_err(|e| format!("write settings: {}", e))?;
        eprintln!("Updated {}", settings_path.display());
    } else {
        eprintln!("All hooks already installed in {}", settings_path.display());
    }

    Ok(())
}

pub fn show_log(job_filter: Option<&str>, lines: usize) -> Result<(), String> {
    let path = log_path();
    if !path.exists() {
        eprintln!("No daemon log found.");
        return Ok(());
    }

    let content = fs::read_to_string(&path)
        .map_err(|e| format!("read log: {}", e))?;

    let filtered: Vec<&str> = content.lines().rev()
        .filter(|line| {
            if let Some(job) = job_filter {
                line.contains(&format!("\"job\":\"{}\"", job))
            } else {
                true
            }
        })
        .take(lines)
        .collect();

    if filtered.is_empty() {
        eprintln!("No log entries{}", job_filter.map(|j| format!(" for job '{}'", j)).unwrap_or_default());
        return Ok(());
    }

    // Pretty-print: parse JSON and format as "TIME  JOB  EVENT  [DETAIL]"
    for line in filtered.into_iter().rev() {
        if let Ok(obj) = serde_json::from_str::<serde_json::Value>(line) {
            let ts = obj.get("ts").and_then(|v| v.as_str()).unwrap_or("?");
            let job = obj.get("job").and_then(|v| v.as_str()).unwrap_or("?");
            let event = obj.get("event").and_then(|v| v.as_str()).unwrap_or("?");
            let detail = obj.get("detail").and_then(|v| v.as_str()).unwrap_or("");

            // Shorten timestamp to just time portion
            let time = if ts.len() >= 19 { &ts[11..19] } else { ts };

            if detail.is_empty() {
                eprintln!("  {}  {:20} {}", time, job, event);
            } else {
                // Truncate long details (file paths)
                let short = if detail.len() > 60 {
                    let last = detail.rfind('/').map(|i| &detail[i+1..]).unwrap_or(detail);
                    if last.len() > 60 { &last[..60] } else { last }
                } else {
                    detail
                };
                eprintln!("  {}  {:20} {:12} {}", time, job, event, short);
            }
        } else {
            eprintln!("{}", line);
        }
    }
    Ok(())
}