diff --git a/Cargo.lock b/Cargo.lock index 0c3d5a7..97a5f80 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1805,25 +1805,13 @@ checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" [[package]] name = "jobkit" -version = "0.1.0" -source = "git+https://evilpiepirate.org/git/jobkit.git/#2cdf0d5c3dd55f3d1783c40211a7eb96707d1ab6" -dependencies = [ - "crossbeam-deque", - "log", - "profiling", - "serde", - "serde_json", -] - -[[package]] -name = "jobkit-daemon" -version = "0.4.0" -source = "git+https://evilpiepirate.org/git/jobkit-daemon.git/#6faa0a06e3b2d69ea22fc691b4e8e4760b0772f7" +version = "0.2.0" dependencies = [ "chrono", - "jobkit", + "crossbeam-deque", "libc", "log", + "profiling", "serde", "serde_json", ] @@ -2655,7 +2643,6 @@ dependencies = [ "crossterm 0.28.1", "faer", "jobkit", - "jobkit-daemon", "libc", "log", "memmap2", diff --git a/doc/logging.md b/doc/logging.md new file mode 100644 index 0000000..7728ca7 --- /dev/null +++ b/doc/logging.md @@ -0,0 +1,76 @@ +# Logging Architecture + +poc-memory has multiple logging channels serving different purposes. +Understanding which log to check is essential for debugging. + +## Log files + +### daemon.log — structured event log +- **Path**: `$data_dir/daemon.log` (default: `~/.claude/memory/daemon.log`) +- **Format**: JSONL — `{"ts", "job", "event", "detail"}` +- **Written by**: `jobkit_daemon::event_log::log()`, wrapped by `log_event()` in daemon.rs +- **Rotation**: truncates to last half when file exceeds 1MB +- **Contains**: task lifecycle events (started, completed, failed, progress), + session-watcher ticks, scheduler events +- **View**: `poc-memory agent daemon log [--job NAME] [--lines N]` +- **Note**: the "daemon log" command reads this file and formats the JSONL + as human-readable lines with timestamps. The `--job` filter shows only + entries for a specific job name. 
+ +### daemon-status.json — live snapshot +- **Path**: `$data_dir/daemon-status.json` +- **Format**: pretty-printed JSON +- **Written by**: `write_status()` in daemon.rs, called periodically +- **Contains**: current task list with states (pending/running/completed), + graph health metrics, consolidation plan, uptime +- **View**: `poc-memory agent daemon status` + +### llm-logs/ — per-agent LLM call transcripts +- **Path**: `$data_dir/llm-logs/{agent_name}/{timestamp}.txt` +- **Format**: plaintext sections: `=== PROMPT ===`, `=== CALLING LLM ===`, + `=== RESPONSE ===` +- **Written by**: `run_one_agent_inner()` in knowledge.rs +- **Contains**: full prompt sent to the LLM and full response received. + One file per agent invocation. Invaluable for debugging agent quality — + shows exactly what the model saw and what it produced. +- **Volume**: can be large — 292 files for distill alone as of Mar 19. + +### retrieval.log — memory search queries +- **Path**: `$data_dir/retrieval.log` +- **Format**: plaintext, one line per search: `[date] q="..." hits=N` +- **Contains**: every memory search query and hit count. Useful for + understanding what the memory-search hook is doing and whether + queries are finding useful results. + +### daily-check.log — graph health history +- **Path**: `$data_dir/daily-check.log` +- **Format**: plaintext, multi-line entries with metrics +- **Contains**: graph topology metrics over time (σ, α, gini, cc, fit). + Only ~10 entries — appended by the daily health check. + +## In-memory state (redundant with daemon.log) + +### ctx.log_line() — task output log +- **Stored in**: jobkit task state (last 20 lines per task) +- **Also writes to**: daemon.log via `log_event()` (as of Mar 19) +- **View**: `daemon-status.json` → task → output_log, or just tail daemon.log +- **Design note**: the in-memory buffer is redundant now that progress + events go to daemon.log. 
The status viewer should eventually just + tail daemon.log filtered by job name, eliminating the in-memory state. + +### ctx.set_progress() — current activity string +- **Stored in**: jobkit task state +- **View**: shown in status display next to the task name +- **Note**: overwritten by each `ctx.log_line()` call. + +## What to check when + +| Problem | Check | +|----------------------------------|------------------------------------| +| Task not starting | daemon-status.json (task states) | +| Task failing | daemon.log (failed events) | +| Agent producing bad output | llm-logs/{agent}/{timestamp}.txt | +| Agent not finding right nodes | retrieval.log (search queries) | +| Graph health declining | daily-check.log | +| Resource pool / parallelism | **currently no log** — need to add | +| Which LLM backend is being used | daemon.log (llm-backend event) | diff --git a/poc-memory/Cargo.toml b/poc-memory/Cargo.toml index 29e11f6..5fbc91f 100644 --- a/poc-memory/Cargo.toml +++ b/poc-memory/Cargo.toml @@ -19,8 +19,7 @@ memmap2 = "0.9" rayon = "1" peg = "0.8" paste = "1" -jobkit = { git = "https://evilpiepirate.org/git/jobkit.git/" } -jobkit-daemon = { git = "https://evilpiepirate.org/git/jobkit-daemon.git/" } +jobkit = { path = "/home/kent/jobkit", features = ["daemon"] } poc-agent = { path = "../poc-agent" } tokio = { version = "1", features = ["rt-multi-thread"] } redb = "2" diff --git a/poc-memory/src/agents/daemon.rs b/poc-memory/src/agents/daemon.rs index e05850b..69aadb7 100644 --- a/poc-memory/src/agents/daemon.rs +++ b/poc-memory/src/agents/daemon.rs @@ -29,14 +29,19 @@ fn log_path() -> PathBuf { // --- Logging --- fn log_event(job: &str, event: &str, detail: &str) { - jobkit_daemon::event_log::log(&crate::config::get().data_dir, job, event, detail); + jobkit::daemon::event_log::log(&crate::config::get().data_dir, job, event, detail); +} + +/// Public wrapper for logging from other agent modules. 
+pub fn log_event_pub(job: &str, event: &str, detail: &str) { + log_event(job, event, detail); } // --- Job functions (direct, no subprocess) --- /// Run a named job with logging, progress reporting, and error mapping. fn run_job(ctx: &ExecutionContext, name: &str, f: impl FnOnce() -> Result<(), String>) -> Result<(), TaskError> { - jobkit_daemon::Daemon::run_job(&crate::config::get().data_dir, ctx, name, f) + jobkit::daemon::Daemon::run_job(&crate::config::get().data_dir, ctx, name, f) } // experience_mine and fact_mine removed — observation.agent handles all transcript mining @@ -49,11 +54,15 @@ fn job_targeted_agent( ) -> Result<(), TaskError> { let agent = agent_type.to_string(); let key = target_key.to_string(); - let job_name = format!("c-{}-target", agent); + let job_name = format!("c-{}-target({})", agent, key); run_job(ctx, &job_name, || { let mut store = crate::store::Store::load()?; ctx.log_line(&format!("targeting: {}", key)); - let log = |msg: &str| { ctx.log_line(msg); }; + let job = job_name.clone(); + let log = |msg: &str| { + ctx.log_line(msg); + log_event(&job, "progress", msg); + }; super::knowledge::run_one_agent_with_keys( &mut store, &agent, &[key.clone()], 5, "daemon", &log, false, )?; @@ -576,7 +585,7 @@ fn write_status( graph_health: &Arc>>, ) { let status = build_status(choir, last_daily, graph_health); - jobkit_daemon::status::write(&crate::config::get().data_dir, &status); + jobkit::daemon::status::write(&crate::config::get().data_dir, &status); } #[derive(Clone, Default, serde::Serialize, serde::Deserialize)] @@ -617,7 +626,7 @@ struct DaemonStatus { pub fn run_daemon() -> Result<(), String> { let config = crate::config::get(); - let mut daemon = jobkit_daemon::Daemon::new(jobkit_daemon::DaemonConfig { + let mut daemon = jobkit::daemon::Daemon::new(jobkit::daemon::DaemonConfig { data_dir: config.data_dir.clone(), resource_slots: config.llm_concurrency, resource_name: "llm".to_string(), @@ -626,10 +635,12 @@ pub fn run_daemon() -> 
Result<(), String> { let choir = Arc::clone(&daemon.choir); let llm = Arc::clone(&daemon.resource); + let task_log_dir = config.data_dir.join("logs"); + let _ = fs::create_dir_all(&task_log_dir); // Recover last_daily from previous status file let last_daily: Arc<Mutex<Option<NaiveDate>>> = Arc::new(Mutex::new( - jobkit_daemon::status::load::<DaemonStatus>(&config.data_dir) + jobkit::daemon::status::load::<DaemonStatus>(&config.data_dir) .and_then(|s| s.last_daily) .and_then(|d| d.parse().ok()) )); @@ -907,6 +918,7 @@ pub fn run_daemon() -> Result<(), String> { let llm_sched = Arc::clone(&llm); let last_daily_sched = Arc::clone(&last_daily); let graph_health_sched = Arc::clone(&graph_health); + let log_dir_sched = task_log_dir.clone(); const CONSOLIDATION_INTERVAL: Duration = Duration::from_secs(6 * 3600); // 6 hours choir.spawn("scheduler").init(move |ctx| { @@ -967,6 +979,7 @@ pub fn run_daemon() -> Result<(), String> { let task_name = format!("c-{}-{}:{}", agent, i, today); let mut builder = choir_sched.spawn(task_name) .resource(&llm_sched) + .log_dir(&log_dir_sched) .retries(1) .init(move |ctx| { job_consolidation_agent(ctx, &agent, b) @@ -1067,6 +1080,7 @@ pub fn run_daemon() -> Result<(), String> { { let choir_rpc = Arc::clone(&choir); let llm_rpc = Arc::clone(&llm); + let log_dir_rpc = task_log_dir.clone(); daemon.add_rpc_handler(move |cmd, _ctx| { if !cmd.starts_with("run-agent ") { return None; } let parts: Vec<&str> = cmd.splitn(4, ' ').collect(); @@ -1093,8 +1107,11 @@ pub fn run_daemon() -> Result<(), String> { let agent = agent_type.to_string(); let key = key.clone(); let task_name = format!("c-{}-{}:{}", agent, key.chars().take(30).collect::<String>(), today); + log_event("daemon", "spawn-targeted", + &format!("{} (available slots: {})", task_name, llm_rpc.available())); choir_rpc.spawn(task_name) .resource(&llm_rpc) + .log_dir(&log_dir_rpc) .retries(1) .init(move |ctx| { job_targeted_agent(ctx, &agent, &key) @@ -1133,6 +1150,7 @@ pub fn run_daemon() -> Result<(), String> { let task_name = 
format!("c-{}-rpc{}:{}", agent, ts, today); let mut builder = choir_rpc.spawn(task_name) .resource(&llm_rpc) + .log_dir(&log_dir_rpc) .retries(1) .init(move |ctx| { if is_rename { @@ -1174,7 +1192,7 @@ pub fn send_rpc_pub(cmd: &str) -> Option<String> { } fn send_rpc(cmd: &str) -> Option<String> { - jobkit_daemon::socket::send_rpc(&crate::config::get().data_dir, cmd) + jobkit::daemon::socket::send_rpc(&crate::config::get().data_dir, cmd) } pub fn rpc_consolidate() -> Result<(), String> { @@ -1209,7 +1227,7 @@ pub fn rpc_run_agent(agent: &str, count: usize) -> Result<(), String> { } fn read_status_socket() -> Option<DaemonStatus> { - let json = jobkit_daemon::socket::send_rpc(&crate::config::get().data_dir, "")?; + let json = jobkit::daemon::socket::send_rpc(&crate::config::get().data_dir, "")?; serde_json::from_str(&json).ok() } @@ -1411,10 +1429,10 @@ pub fn show_status() -> Result<(), String> { .unwrap_or_default(); let name = short_job_name(&t.name); eprintln!(" {} {:30}{}{}", sym, name, elapsed, progress); - if matches!(t.status, TaskStatus::Running) && !t.output_log.is_empty() { - let skip = t.output_log.len().saturating_sub(3); - for line in &t.output_log[skip..] { - eprintln!(" │ {}", line); + if let Some(ref lp) = t.log_path { + // tail from log file + if matches!(t.status, TaskStatus::Running) { + eprintln!(" │ log: {}", lp); } } } @@ -1449,11 +1467,8 @@ pub fn show_status() -> Result<(), String> { }; let progress = t.progress.as_deref().map(|p| format!(" {}", p)).unwrap_or_default(); eprintln!(" {} {}{}{}", status_symbol(t), t.name, elapsed, progress); - if !t.output_log.is_empty() { - let skip = t.output_log.len().saturating_sub(3); - for line in &t.output_log[skip..] 
{ - eprintln!(" │ {}", line); - } + if let Some(ref lp) = t.log_path { + eprintln!(" │ log: {}", lp); } } } @@ -1512,10 +1527,10 @@ pub fn show_status() -> Result<(), String> { } // Show output log tail for running tasks - if matches!(t.status, TaskStatus::Running) && !t.output_log.is_empty() { - let skip = t.output_log.len().saturating_sub(5); - for line in &t.output_log[skip..] { - eprintln!(" │ {}", line); + if let Some(ref lp) = t.log_path { + // tail from log file + if matches!(t.status, TaskStatus::Running) { + eprintln!(" │ log: {}", lp); } } } @@ -1739,6 +1754,58 @@ pub fn install_hook() -> Result<(), String> { Ok(()) } +/// Drill down into a task's log file. Finds the log path from: +/// 1. Running task status (daemon-status.json) +/// 2. daemon.log started events (for completed/failed tasks) +pub fn show_task_log(task_name: &str, lines: usize) -> Result<(), String> { + // Try running tasks first + if let Some(status_json) = send_rpc_pub("") { + if let Ok(status) = serde_json::from_str::<serde_json::Value>(&status_json) { + if let Some(tasks) = status.get("tasks").and_then(|t| t.as_array()) { + for t in tasks { + let name = t.get("name").and_then(|n| n.as_str()).unwrap_or(""); + if name.contains(task_name) { + if let Some(lp) = t.get("log_path").and_then(|p| p.as_str()) { + return tail_file(lp, lines); + } + } + } + } + } + } + + // Fall back to searching daemon.log for the most recent started event with a log path + let log = log_path(); + if log.exists() { + let content = fs::read_to_string(&log).map_err(|e| format!("read log: {}", e))?; + for line in content.lines().rev() { + if let Ok(obj) = serde_json::from_str::<serde_json::Value>(line) { + let job = obj.get("job").and_then(|v| v.as_str()).unwrap_or(""); + let event = obj.get("event").and_then(|v| v.as_str()).unwrap_or(""); + let detail = obj.get("detail").and_then(|v| v.as_str()).unwrap_or(""); + if job.contains(task_name) && event == "started" && detail.starts_with("log: ") { + let path = &detail[5..]; + return tail_file(path, lines); 
+ } + } + } + + Err(format!("no log file found for task '{}'", task_name)) +} + +fn tail_file(path: &str, lines: usize) -> Result<(), String> { + let content = fs::read_to_string(path) + .map_err(|e| format!("read {}: {}", path, e))?; + let all_lines: Vec<&str> = content.lines().collect(); + let skip = all_lines.len().saturating_sub(lines); + eprintln!("--- {} ({} lines) ---", path, all_lines.len()); + for line in &all_lines[skip..] { + eprintln!("{}", line); + } + Ok(()) +} + pub fn show_log(job_filter: Option<&str>, lines: usize) -> Result<(), String> { let path = log_path(); if !path.exists() { diff --git a/poc-memory/src/agents/llm.rs b/poc-memory/src/agents/llm.rs index 9dee69d..dd4e685 100644 --- a/poc-memory/src/agents/llm.rs +++ b/poc-memory/src/agents/llm.rs @@ -187,10 +187,15 @@ pub(crate) fn call_haiku(agent: &str, prompt: &str) -> Result<String, String> { /// Uses the direct API backend when api_base_url is configured, /// otherwise falls back to claude CLI subprocess. pub(crate) fn call_for_def(def: &super::defs::AgentDef, prompt: &str) -> Result<String, String> { - if crate::config::get().api_base_url.is_some() && !def.tools.is_empty() { + let config = crate::config::get(); + if config.api_base_url.is_some() && !def.tools.is_empty() { + super::daemon::log_event_pub(&def.agent, "llm-backend", + &format!("API: {}", config.api_base_url.as_deref().unwrap_or("?"))); let log = |msg: &str| eprintln!("[{}] {}", def.agent, msg); super::api::call_api_with_tools_sync(&def.agent, prompt, &log) } else { + super::daemon::log_event_pub(&def.agent, "llm-backend", + &format!("claude -p (model={}, tools={})", def.model, def.tools.len())); call_model_with_tools(&def.agent, &def.model, prompt, &def.tools) } } diff --git a/poc-memory/src/main.rs b/poc-memory/src/main.rs index 99f99fd..6ea7822 100644 --- a/poc-memory/src/main.rs +++ b/poc-memory/src/main.rs @@ -447,6 +447,9 @@ enum DaemonCmd { Log { /// Job name to filter by job: Option<String>, + /// Tail a task's log file (drill down from daemon log) + 
#[arg(long)] + task: Option<String>, /// Number of lines to show #[arg(long, default_value_t = 20)] lines: usize, @@ -1043,7 +1046,13 @@ fn cmd_daemon(sub: DaemonCmd) -> Result<(), String> { match sub { DaemonCmd::Start => daemon::run_daemon(), DaemonCmd::Status => daemon::show_status(), - DaemonCmd::Log { job, lines } => daemon::show_log(job.as_deref(), lines), + DaemonCmd::Log { job, task, lines } => { + if let Some(ref task_name) = task { + daemon::show_task_log(task_name, lines) + } else { + daemon::show_log(job.as_deref(), lines) + } + } DaemonCmd::Install => daemon::install_service(), DaemonCmd::Consolidate => daemon::rpc_consolidate(), DaemonCmd::Run { agent, count } => daemon::rpc_run_agent(&agent, count), diff --git a/poc-memory/src/tui.rs b/poc-memory/src/tui.rs index 5c290d2..a7d947b 100644 --- a/poc-memory/src/tui.rs +++ b/poc-memory/src/tui.rs @@ -52,7 +52,7 @@ struct DaemonStatus { } fn fetch_status() -> Option<DaemonStatus> { - let json = jobkit_daemon::socket::send_rpc(&crate::config::get().data_dir, "")?; + let json = jobkit::daemon::socket::send_rpc(&crate::config::get().data_dir, "")?; serde_json::from_str(&json).ok() } @@ -427,11 +427,8 @@ fn render_overview(frame: &mut Frame, app: &App, area: Rect) { ), Span::raw(format!(" {}", progress)), ])); - if matches!(t.status, TaskStatus::Running) && !t.output_log.is_empty() { - let skip = t.output_log.len().saturating_sub(2); - for line in &t.output_log[skip..] 
{ - lines.push(Line::from(format!(" │ {}", line)).fg(Color::DarkGray)); - } + if let Some(ref lp) = t.log_path { + lines.push(Line::from(format!(" │ log: {}", lp)).fg(Color::DarkGray)); } } } @@ -685,11 +682,9 @@ fn render_agent_tab(frame: &mut Frame, app: &App, agent_type: &str, area: Rect) ])); } - // Output log - if !t.output_log.is_empty() { - for log_line in &t.output_log { - lines.push(Line::from(format!(" │ {}", log_line)).fg(Color::DarkGray)); - } + // Log file path + if let Some(ref lp) = t.log_path { + lines.push(Line::from(format!(" │ log: {}", lp)).fg(Color::DarkGray)); } // Error @@ -785,7 +780,7 @@ fn short_name(name: &str) -> String { } fn send_rpc(cmd: &str) -> Option<String> { - jobkit_daemon::socket::send_rpc(&crate::config::get().data_dir, cmd) + jobkit::daemon::socket::send_rpc(&crate::config::get().data_dir, cmd) } // --- Entry point ---